library(arules)
library(arulesViz)
library(cluster)
library(dplyr)
library(fpc)
2024-04-15
library(arules) library(dplyr) library(arulesViz) library(cluster) library(fpc)
# Fix the RNG seed so the examples below are reproducible.
set.seed(1024)
tr. id | bread | butter | milk | potato | tomato |
---|---|---|---|---|---|
1 | 1 | 1 | 1 | 0 | 0 |
2 | 0 | 1 | 1 | 0 | 0 |
3 | 1 | 0 | 0 | 1 | 1 |
A | B | C |
---|---|---|
x | 1 | T |
x | 2 | T |
y | 3 | F |
\(\downarrow\) binarize
A.x | A.y | B.1 | B.2 | B.3 | C.T | C.F |
---|---|---|---|---|---|---|
1 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 | 0 | 1 | 0 |
0 | 1 | 0 | 0 | 1 | 0 | 1 |
\(\{bread,butter\} \rightarrow \{milk\}\)
Support of an item set \(sup(X)\) is the proportion of the transactions including \(X\).
\(sup(X) = n_X/n_D = P(X)\)
Support of an association rule is
\(sup(X \rightarrow Y) = n_{X \cup Y}/n_D = P(X\cup Y)\)
Confidence of an association rule is
\(\displaystyle conf(X \rightarrow Y) = \frac{sup(X \cup Y)} {sup(X)}=P(Y|X)\)
Each rule \(X \rightarrow Y\) must satisfy the following restrictions:
\[ sup(X \cup Y ) \geq \sigma \\ conf(X \rightarrow Y ) \geq \gamma \]
Find all frequent itemsets above a certain minimal support, minsup\(=\sigma\)
Generate candidate association rules from these itemsets
Problem:
Agrawal and Srikant, 1994
Basic idea
\(sup(X\cup Y) \geq minsup \implies sup(X) \geq minsup \wedge sup(Y) \geq minsup\)
because, \(sup(X) \geq sup(X \cup Y)\)
Second step: Find a partition \((s, i-s)\) of a frequent itemset \(i\) such that
\(conf(s \rightarrow (i-s)) \geq minconf=\gamma\)
\(\displaystyle conf(s \rightarrow (i-s)) = \frac{sup(s \cup (i-s))}{sup(s)} \geq \gamma\)
\(\displaystyle \frac{sup(i)}{sup(s)} \geq \gamma\)
At \(\gamma = 0.7\) the following set of rules is generated:
Rule | Support | Confidence |
---|---|---|
{eggs} → {flour} | 3/5 = 0.6 | 3/4 = 0.75 |
{flour} → {eggs} | 3/5 = 0.6 | 3/3 = 1 |
{eggs, milk} → {flour} | 2/5 = 0.4 | 2/2 = 1 |
{flour, milk} → {eggs} | 2/5 = 0.4 | 2/2 = 1 |
arules
provides an interface to some of these implementations.

data(Boston, package="MASS")
b <- Boston
head(b)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat ## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 ## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 ## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 ## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 ## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 ## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 ## medv ## 1 24.0 ## 2 21.6 ## 3 34.7 ## 4 33.4 ## 5 36.2 ## 6 28.7
# apriori only works with categorical data, so every column must be turned
# into a factor (either directly or by binning numeric columns).
# In MASS::Boston, chas is a dummy variable: 1 if the tract bounds the
# Charles River, 0 otherwise — so level 0 is "noriver" and level 1 is
# "river" (the original labels were inverted).
b$chas <- factor(b$chas, labels = c("noriver", "river"))
b$rad <- factor(b$rad)
b$black <- cut(b$black, breaks = 4,
               labels = c(">31.5%", "18.5-31.5%", "8-18.5%", "<8%"))

# Discretize a numeric vector into four equal-width bins.
discr <- function(x) {
  cut(x, breaks = 4, labels = c("low", "medLow", "medHigh", "high"))
}

# Bin every remaining numeric column, then re-attach the three columns that
# were already categorical. (across()/all_of() replace the superseded
# mutate_all()/one_of().)
b <- select(b, -all_of(c("chas", "rad", "black"))) %>%
  mutate(across(everything(), discr)) %>%
  bind_cols(select(b, all_of(c("chas", "rad", "black"))))

# Convert the all-factor data frame into an arules transaction set.
b <- as(b, "transactions")
b
## transactions in sparse format with ## 506 transactions (rows) and ## 59 items (columns)
# Summary of the transaction set: most frequent items, transaction lengths,
# density, and the extended item/transaction information.
summary(b)
## transactions as itemMatrix in sparse format with ## 506 rows (elements/itemsets/transactions) and ## 59 columns (items) and a density of 0.2372881 ## ## most frequent items: ## crim=low chas=river black=<8% zn=low dis=low (Other) ## 491 471 452 429 305 4936 ## ## element (itemset/transaction) length distribution: ## sizes ## 14 ## 506 ## ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 14 14 14 14 14 14 ## ## includes extended item information - examples: ## labels variables levels ## 1 crim=low crim low ## 2 crim=medLow crim medLow ## 3 crim=medHigh crim medHigh ## ## includes extended transaction information - examples: ## transactionID ## 1 1 ## 2 2 ## 3 3
# Plot the relative frequency of all items with support >= 0.3.
itemFrequencyPlot(b, support = 0.3, cex.names = 0.8)
Now we can run the apriori algorithm:
# Mine association rules: minimum support 2.5%, minimum confidence 75%,
# and at most 5 items per rule (LHS and RHS combined).
ars <- apriori(b, parameter = list( support = 0.025, confidence = 0.75, maxlen = 5))
## Apriori ## ## Parameter specification: ## confidence minval smax arem aval originalSupport maxtime support minlen ## 0.75 0.1 1 none FALSE TRUE 5 0.025 1 ## maxlen target ext ## 5 rules TRUE ## ## Algorithmic control: ## filter tree heap memopt load sort verbose ## 0.1 TRUE TRUE FALSE TRUE 2 TRUE ## ## Absolute minimum support count: 12 ## ## set item appearances ...[0 item(s)] done [0.00s]. ## set transactions ...[59 item(s), 506 transaction(s)] done [0.00s]. ## sorting and recoding items ... [52 item(s)] done [0.00s]. ## creating transaction tree ... done [0.00s]. ## checking subsets of size 1 2 3 4 5 done [0.01s]. ## writing ... [76499 rule(s)] done [0.00s]. ## creating S4 object ... done [0.01s].
# Show the first few mined rules with their quality measures.
inspect(head(ars))
## lhs rhs support confidence coverage lift ## [1] {} => {zn=low} 0.84782609 0.8478261 1.00000000 1.000000 ## [2] {} => {black=<8%} 0.89328063 0.8932806 1.00000000 1.000000 ## [3] {} => {chas=river} 0.93083004 0.9308300 1.00000000 1.000000 ## [4] {} => {crim=low} 0.97035573 0.9703557 1.00000000 1.000000 ## [5] {black=8-18.5%} => {age=high} 0.02766798 0.9333333 0.02964427 1.802545 ## [6] {black=8-18.5%} => {dis=low} 0.02766798 0.9333333 0.02964427 1.548415 ## count ## [1] 429 ## [2] 452 ## [3] 471 ## [4] 491 ## [5] 14 ## [6] 14
# Top 5 rules (ranked by confidence) whose right-hand side contains medv=high.
inspect(head(subset(ars, rhs %in% "medv=high"), 5, by="confidence"))
## lhs rhs support confidence coverage lift count ## [1] {rm=high, ## ptratio=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [2] {rm=high, ## ptratio=low, ## lstat=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [3] {rm=high, ## ptratio=low, ## black=<8%} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [4] {crim=low, ## rm=high, ## ptratio=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [5] {rm=high, ## ptratio=low, ## lstat=low, ## black=<8%} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
Typical support distribution (retail point-of-sale data with 169 items):
Confidence of the rule is relatively high with \(\hat{P} (E_Y |E_X ) = .8\). But the unconditional probability \(\hat{P} (E_Y) = n_Y /n_D = 90/100 = .9\) is higher!
\(\displaystyle conf(X \rightarrow Y) = \frac{sup(X \cup Y)} {sup(X)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{sup(X \cup Y)}{sup(X)sup(Y)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{conf(X \rightarrow Y)}{sup(Y)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{P(Y|X)}{P(Y)}\)
# Top 5 rules (ranked by confidence) mentioning nox=high on either side.
inspect(head(subset(ars, subset = lhs %in% "nox=high" | rhs %in% "nox=high"), 5, by="confidence"))
## lhs rhs support confidence coverage lift ## [1] {nox=high} => {indus=medHigh} 0.04743083 1 0.04743083 3.066667 ## [2] {nox=high} => {age=high} 0.04743083 1 0.04743083 1.931298 ## [3] {nox=high} => {dis=low} 0.04743083 1 0.04743083 1.659016 ## [4] {nox=high} => {zn=low} 0.04743083 1 0.04743083 1.179487 ## [5] {nox=high} => {crim=low} 0.04743083 1 0.04743083 1.030550 ## count ## [1] 24 ## [2] 24 ## [3] 24 ## [4] 24 ## [5] 24
arulesViz
package.

plot(ars)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Keep only rules that are both highly confident and have high lift.
# Compute the quality data frame once instead of calling quality(ars) twice.
rule_quality <- quality(ars)
goodRules <- ars[rule_quality$confidence > 0.8 & rule_quality$lift > 5]
inspect(head(goodRules, 3, by = "confidence"))
## lhs rhs support confidence coverage ## [1] {nox=high, rad=5} => {ptratio=low} 0.03162055 1 0.03162055 ## [2] {nox=high, tax=medLow} => {ptratio=low} 0.03162055 1 0.03162055 ## [3] {rm=high, ptratio=low} => {medv=high} 0.02964427 1 0.02964427 ## lift count ## [1] 8.724138 16 ## [2] 8.724138 16 ## [3] 15.812500 15
# Restrict to short rules: at most 3 items in total (LHS + RHS).
shortRules <- subset(ars, subset = size(lhs) + size(rhs) <= 3)
inspect(head(shortRules, 3, by="confidence"))
## lhs rhs support confidence coverage lift ## [1] {black=8-18.5%} => {zn=low} 0.02964427 1 0.02964427 1.179487 ## [2] {black=8-18.5%} => {chas=river} 0.02964427 1 0.02964427 1.074310 ## [3] {zn=medHigh} => {nox=low} 0.03162055 1 0.03162055 2.530000 ## count ## [1] 15 ## [2] 15 ## [3] 16
lhs %in% X
: lhs contains an item from X

lhs %ain% X
: lhs contains all items in X

lhs %oin% X
: lhs contains only items from X

lhs %pin% X
: lhs contains items partially matching from X

somerules <- subset(goodRules, subset = lhs %pin% "crim" & rhs %oin% c("medv=high", "medv=medHigh"))
inspect(head(somerules, 3, by="confidence"))
## lhs rhs support confidence coverage lift count ## [1] {crim=low, ## rm=high, ## ptratio=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [2] {crim=low, ## rm=high, ## ptratio=low, ## lstat=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15 ## [3] {crim=low, ## rm=high, ## ptratio=low, ## black=<8%} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
# Matrix visualization (colored by lift) of rules with a small LHS
# that include nox=high on the left-hand side.
somerules <- subset(ars, subset = (lhs %in% "nox=high" & size(lhs) < 3 & lift >= 1))
plot(somerules, method="matrix", measure="lift")
# Graph visualization of very-high-lift rules concluding medv=high.
somerules <- subset(ars, subset = rhs %in% "medv=high" & lift > 15)
plot(somerules, method="graph")
\(\displaystyle d(x,y) = \sqrt{ \sum^d_{i=1}{(x_i-y_i)^2} }\)
where \(x_i\) is the value of variable \(i\) on observation \(x\).
\(\displaystyle d(x,y) = \left (\sum^d_{i=1}{|x_i - y_i|^p}\right )^{1/p}\)
where \(x_i\) is the value of variable \(i\) on observation \(x\).
# 5 random observations with 10 standard-normal variables each.
randDat <- matrix(rnorm(50), nrow = 5)
dist(randDat) # Euclidean distance by default
## 1 2 3 4 ## 2 5.814467 ## 3 4.721438 3.320670 ## 4 3.673762 3.837265 3.805705 ## 5 4.804807 4.952012 3.994814 5.413585
# Manhattan (L1) distances between the same rows.
dist(randDat, method="manhattan")
## 1 2 3 4 ## 2 15.538209 ## 3 12.087300 7.881318 ## 4 10.006198 10.580025 10.408077 ## 5 12.736963 11.085177 10.111811 13.300550
# Minkowski distance with p = 4 (p = 2 would be Euclidean, p = 1 Manhattan).
dist(randDat, method="minkowski", p = 4)
## 1 2 3 4 ## 2 4.092154 ## 3 3.266844 2.597013 ## 4 2.469316 2.474234 2.594040 ## 5 3.240055 3.920720 3.050767 4.191781
What to do with categorical variables?
\(\delta_i(v_1, v_2)=\begin{cases} 1 & i \text{ is nominal and }v_1 \neq v_2 \\ 0 & i \text{ is nominal and }v_1 = v_2 \\ \frac{|v_1-v_2|}{range(i)} & i \text{ is numeric} \end{cases}\)
daisy()
function of the cluster
package.

How to measure the quality of a clustering?
\(H(C,k) = \sum_{c\in C}{\frac{h(c)}{|C|}}\)
\(H(C,k) = \min_{c\in C}{h(c)}\)
\(H(C,k) = \max_{c\in C}{h(c)}\)
How to measure \(h(c)\) ?
\(\displaystyle h(c) = \sum_{x\in c}{\sum_{i=1}^d {(v_{x,i}-\hat{v}_i^c)^2}}\)
\(\displaystyle h(c) = \sum_{x\in c}{\sum_{i=1}^d {|v_{x,i}-\tilde{v}_i^c|}}\)
data(iris)
# k-means with 3 clusters on the four numeric columns (column 5 is Species).
# NOTE(review): a single random start can land in a poor local optimum —
# consider nstart > 1; results here depend on the set.seed() call earlier.
ir3 <- kmeans(iris[, -5], centers = 3, iter.max = 200)
ir3
## K-means clustering with 3 clusters of sizes 21, 33, 96 ## ## Cluster means: ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 4.738095 2.904762 1.790476 0.3523810 ## 2 5.175758 3.624242 1.472727 0.2727273 ## 3 6.314583 2.895833 4.973958 1.7031250 ## ## Clustering vector: ## [1] 2 1 1 1 2 2 2 2 1 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 2 ## [38] 2 1 2 2 1 1 2 2 1 2 1 2 2 3 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 ## [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 ## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ## [149] 3 3 ## ## Within cluster sum of squares by cluster: ## [1] 17.669524 6.432121 118.651875 ## (between_SS / total_SS = 79.0 %) ## ## Available components: ## ## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" ## [6] "betweenss" "size" "iter" "ifault"
# Scatterplot of two variables, colored by the k-means cluster assignment.
plot(iris$Sepal.Length, iris$Sepal.Width, col = ir3$cluster)
# Cross-tabulate cluster assignments against the true species.
table(ir3$cluster, iris$Species)
## ## setosa versicolor virginica ## 1 17 4 0 ## 2 33 0 0 ## 3 0 46 50
\(s_i=\frac{b_i-a_i}{\max(a_i, b_i)}\)
where
silhouette
is implemented in cluster
# Silhouette widths of the k-means partition under Euclidean distances;
# show the width (third column) of the first five observations.
s <- silhouette(ir3$cluster, dist(iris[,-5])) 
s[1:5, 3]
## [1] 0.57933436 0.02148063 -0.05730046 0.15589126 0.57128261
# Silhouette plot of the whole partition.
plot(s)
clv
pam()
in package cluster
# Agglomerative hierarchical clustering (hclust default: complete linkage)
# on standardized variables; label the dendrogram leaves with the species.
d <- dist(scale(iris[,-5]))
h <- hclust(d)
plot(h, hang = -0.1, labels = iris[["Species"]], cex = 0.5)
# Cut the dendrogram into 3 clusters.
clus3 <- cutree(h, 3)
clus3
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ## [38] 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2 3 3 3 3 ## [75] 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3 3 2 3 3 3 3 ## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ## [149] 3 3
# Confusion matrix: hierarchical clusters vs. true species.
cm <- table(clus3, iris$Species)
cm
## ## clus3 setosa versicolor virginica ## 1 49 0 0 ## 2 1 21 2 ## 3 0 29 48
# Redraw the dendrogram and outline the 3-cluster cut with rectangles.
plot(h, hang = -0.1, labels = iris[["Species"]], cex = 0.5)
rect.hclust(h, k = 3)
# Compare average silhouette widths of three agglomeration methods for
# k = 2..5 clusters (d is the scaled-iris distance matrix defined above).
clus_methods <- c("complete", "single", "average")
for (m in clus_methods) {       # iterate over the vector, not 1:3 indices
  cat("Method", m, "\n")
  hc <- hclust(d, method = m)   # spell out 'method' — no partial matching
  for (k in 2:5) {
    clusters <- cutree(hc, k)
    sil <- silhouette(clusters, d)
    # Mean of the third column = average silhouette width of the partition.
    cat(" k=", k, " -> s=", mean(sil[, 3]), "\n")
  }
}
## Method complete ## k= 2 -> s= 0.4408121 ## k= 3 -> s= 0.4496185 ## k= 4 -> s= 0.4106071 ## k= 5 -> s= 0.352063 ## Method single ## k= 2 -> s= 0.58175 ## k= 3 -> s= 0.5046456 ## k= 4 -> s= 0.4067465 ## k= 5 -> s= 0.3424089 ## Method average ## k= 2 -> s= 0.58175 ## k= 3 -> s= 0.4802669 ## k= 4 -> s= 0.4067465 ## k= 5 -> s= 0.3746013
cluster
diana()
# Divisive hierarchical clustering (DIANA) on standardized data,
# cut into 3 clusters and compared with the true species.
di <- diana(iris[, -5], metric = "euclidean", stand = TRUE)
clusters <- cutree(di, 3)
table(clusters, iris$Species)
## ## clusters setosa versicolor virginica ## 1 50 0 0 ## 2 0 11 33 ## 3 0 39 17
fpc
package as dbscan()
# Density-based clustering (DBSCAN, fpc package) on standardized data.
# NOTE(review): this overwrites `d`, which held a dist object above.
d <- scale(iris[,-5])
db <- dbscan(d, eps=0.9, MinPts=5)
db
## dbscan Pts=150 MinPts=5 eps=0.9 ## 0 1 2 ## border 4 1 4 ## seed 0 48 93 ## total 4 49 97
# Cluster 0 collects the points DBSCAN labels as noise.
table(db$cluster,iris$Species)
## ## setosa versicolor virginica ## 0 1 0 3 ## 1 49 0 0 ## 2 0 50 47