library(arules) library(dplyr) library(arulesViz) library(cluster) library(fpc)
19/04/2020
library(arules) library(dplyr) library(arulesViz) library(cluster) library(fpc)
# Fix the RNG seed so stochastic results below (e.g. k-means) are reproducible.
set.seed(1024)
Support of an item set \(sup(X)\) is the proportion of the transactions including \(X\).
\(sup(X) = P(X)\)
Support of an association rule is
\(sup(X \rightarrow Y) = P(X \cup Y)\)
Confidence of an association rule is
\(\displaystyle conf(X \rightarrow Y) = \frac{sup(X \cup Y)} {sup(X)}\)
\(=P(Y|X)\)
Find all frequent itemsets above a certain minimal support, minsup
Generate candidate association rules from these itemsets
Problem: the number of candidate itemsets grows exponentially with the number of items, so exhaustively checking every subset is infeasible.
Agrawal and Srikant, 1994
Basic idea
\(sup(X\cup Y) \geq minsup \implies sup(X) \geq minsup \wedge sup(Y) \geq minsup\)
because, \(sup(X) \geq sup(X \cup Y)\)
\(conf(s \rightarrow (i-s)) \geq minconf\)
\(\displaystyle conf(s \rightarrow (i-s)) = \frac{sup(s \cup (i-s))}{sup(s)} \geq minconf\)
\(\displaystyle \frac{sup(i)}{sup(s)} \geq minconf\)
The **arules** package provides an interface to some of these implementations.

data(Boston, package="MASS")
b <- Boston
head(b)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat ## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 ## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 ## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 ## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 ## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 ## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 ## medv ## 1 24.0 ## 2 21.6 ## 3 34.7 ## 4 33.4 ## 5 36.2 ## 6 28.7
# Discretize the Boston data so it can be mined as transactions
# (apriori needs every column to be categorical).
# chas is a dummy variable: 1 if the tract bounds the Charles River, 0 otherwise.
# BUG FIX: the labels were inverted — factor() assigns labels to the sorted
# levels (0, 1), so "noriver" must come first, then "river".
b$chas <- factor(b$chas, labels = c("noriver", "river"))
b$rad <- factor(b$rad)
# black = 1000 * (Bk - 0.63)^2; the bins are labelled with the corresponding
# proportion ranges of the underlying variable (hence the reversed order).
b$black <- cut(b$black, breaks = 4,
               labels = c(">31.5%", "18.5-31.5%", "8-18.5%", "<8%"))
# Equal-width discretization into 4 bins for the remaining numeric columns.
discr <- function(x) {
  cut(x, breaks = 4, labels = c("low", "medLow", "medHigh", "high"))
}
# Discretize every numeric column, then re-attach the already-categorical ones.
b <- select(b, -one_of(c("chas", "rad", "black"))) %>%
  mutate_all(list(~discr(.))) %>%
  bind_cols(select(b, one_of(c("chas", "rad", "black"))))
# Coerce the all-factor data frame to the arules "transactions" class.
b <- as(b, "transactions")
b
## transactions in sparse format with ## 506 transactions (rows) and ## 59 items (columns)
# Summarize the transactions: density, most frequent items, and transaction
# lengths (every row contributes 14 items, one per original column).
summary(b)
## transactions as itemMatrix in sparse format with ## 506 rows (elements/itemsets/transactions) and ## 59 columns (items) and a density of 0.2372881 ## ## most frequent items: ## crim=low chas=river black=<8% zn=low dis=low (Other) ## 491 471 452 429 305 4936 ## ## element (itemset/transaction) length distribution: ## sizes ## 14 ## 506 ## ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 14 14 14 14 14 14 ## ## includes extended item information - examples: ## labels variables levels ## 1 crim=low crim low ## 2 crim=medLow crim medLow ## 3 crim=medHigh crim medHigh ## ## includes extended transaction information - examples: ## transactionID ## 1 1 ## 2 2 ## 3 3
# Plot the relative frequency of items occurring in at least 30% of transactions.
itemFrequencyPlot(b, support = 0.3, cex.names = 0.8)
Now we can run the apriori algorithm:
# Mine association rules: minimum support 2.5% (an absolute count of ~12 of the
# 506 transactions), minimum confidence 75%, and at most 5 items per rule.
ars <- apriori(b, parameter = list( support = 0.025, confidence = 0.75, maxlen = 5))
## Apriori ## ## Parameter specification: ## confidence minval smax arem aval originalSupport maxtime support minlen ## 0.75 0.1 1 none FALSE TRUE 5 0.025 1 ## maxlen target ext ## 5 rules FALSE ## ## Algorithmic control: ## filter tree heap memopt load sort verbose ## 0.1 TRUE TRUE FALSE TRUE 2 TRUE ## ## Absolute minimum support count: 12 ## ## set item appearances ...[0 item(s)] done [0.00s]. ## set transactions ...[59 item(s), 506 transaction(s)] done [0.00s]. ## sorting and recoding items ... [52 item(s)] done [0.00s]. ## creating transaction tree ... done [0.00s]. ## checking subsets of size 1 2 3 4 5 done [0.01s]. ## writing ... [76499 rule(s)] done [0.01s]. ## creating S4 object ... done [0.01s].
# Show the first few mined rules (default ordering).
inspect(head(ars))
## lhs rhs support confidence lift count ## [1] {} => {zn=low} 0.84782609 0.8478261 1.000000 429 ## [2] {} => {black=<8%} 0.89328063 0.8932806 1.000000 452 ## [3] {} => {chas=river} 0.93083004 0.9308300 1.000000 471 ## [4] {} => {crim=low} 0.97035573 0.9703557 1.000000 491 ## [5] {black=8-18.5%} => {age=high} 0.02766798 0.9333333 1.802545 14 ## [6] {black=8-18.5%} => {dis=low} 0.02766798 0.9333333 1.548415 14
# Rules whose consequent is medv=high; show the five most confident ones.
medvRules <- subset(ars, rhs %in% "medv=high")
topMedv <- head(medvRules, 5, by="confidence")
inspect(topMedv)
## lhs rhs support confidence lift count ## [1] {rm=high, ## ptratio=low} => {medv=high} 0.02964427 1 15.8125 15 ## [2] {rm=high, ## ptratio=low, ## lstat=low} => {medv=high} 0.02964427 1 15.8125 15 ## [3] {rm=high, ## ptratio=low, ## black=<8%} => {medv=high} 0.02964427 1 15.8125 15 ## [4] {crim=low, ## rm=high, ## ptratio=low} => {medv=high} 0.02964427 1 15.8125 15 ## [5] {rm=high, ## ptratio=low, ## lstat=low, ## black=<8%} => {medv=high} 0.02964427 1 15.8125 15
\(\displaystyle conf(X \rightarrow Y) = \frac{sup(X \cup Y)} {sup(X)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{sup(X \cup Y)}{sup(X)sup(Y)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{conf(X \rightarrow Y)}{sup(Y)}\)
\(\displaystyle lift(X\rightarrow Y) = \frac{P(Y|X)}{P(Y)}\)
# Rules with nox=high on either side; show the five most confident ones.
noxRules <- subset(ars, subset = lhs %in% "nox=high" | rhs %in% "nox=high")
inspect(head(noxRules, 5, by="confidence"))
## lhs rhs support confidence lift count ## [1] {nox=high} => {indus=medHigh} 0.04743083 1 3.066667 24 ## [2] {nox=high} => {age=high} 0.04743083 1 1.931298 24 ## [3] {nox=high} => {dis=low} 0.04743083 1 1.659016 24 ## [4] {nox=high} => {zn=low} 0.04743083 1 1.179487 24 ## [5] {nox=high} => {crim=low} 0.04743083 1 1.030550 24
Rules can be visualized with the **arulesViz** package.

plot(ars)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Keep only the rules with confidence above 0.8 and lift above 5.
q <- quality(ars)
goodRules <- ars[q$confidence > 0.8 & q$lift > 5]
inspect(head(goodRules, 3, by="confidence"))
## lhs rhs support confidence lift ## [1] {nox=high,rad=5} => {ptratio=low} 0.03162055 1 8.724138 ## [2] {nox=high,tax=medLow} => {ptratio=low} 0.03162055 1 8.724138 ## [3] {rm=high,ptratio=low} => {medv=high} 0.02964427 1 15.812500 ## count ## [1] 16 ## [2] 16 ## [3] 15
# Keep only short rules: at most three items in total (lhs plus rhs).
shortRules <- subset(ars, subset = size(lhs) + size(rhs) <= 3)
inspect(head(shortRules, 3, by="confidence"))
## lhs rhs support confidence lift count ## [1] {black=8-18.5%} => {zn=low} 0.02964427 1 1.179487 15 ## [2] {black=8-18.5%} => {chas=river} 0.02964427 1 1.074310 15 ## [3] {zn=medHigh} => {nox=low} 0.03162055 1 2.530000 16
lhs %in% X : lhs contains an item from X
lhs %ain% X : lhs contains all items in X
lhs %oin% X : lhs contains only items from X
lhs %pin% X : lhs contains items partially matching from X

somerules <- subset(goodRules, subset = lhs %pin% "crim" & rhs %oin% c("medv=high", "medv=medHigh"))
inspect(head(somerules, 3, by="confidence"))
## lhs rhs support ## [1] {crim=low,rm=high,ptratio=low} => {medv=high} 0.02964427 ## [2] {crim=low,rm=high,ptratio=low,lstat=low} => {medv=high} 0.02964427 ## [3] {crim=low,rm=high,ptratio=low,black=<8%} => {medv=high} 0.02964427 ## confidence lift count ## [1] 1 15.8125 15 ## [2] 1 15.8125 15 ## [3] 1 15.8125 15
# Matrix plot, shaded by lift, of the short nox=high rules with lift >= 1.
somerules <- subset(ars, subset = (lhs %in% "nox=high" & size(lhs) < 3 & lift >= 1))
plot(somerules, method="matrix", measure="lift")
# Graph visualization of the very-high-lift rules concluding medv=high.
somerules <- subset(ars, subset = rhs %in% "medv=high" & lift > 15)
plot(somerules, method="graph")
\(\displaystyle d(x,y) = \sqrt{ \sum^d_{i=1}{(x_i-y_i)^2} }\)
where \(x_i\) is the value of variable \(i\) on observation \(x\).
\(\displaystyle d(x,y) = \left (\sum^d_{i=1}{|x_i - y_i|^p}\right )^{1/p}\)
where \(x_i\) is the value of variable \(i\) on observation \(x\).
# Five random observations on ten standard-normal variables.
randDat <- matrix(rnorm(50), nrow = 5)
# Pairwise distances between the five rows; Euclidean is the default metric.
dist(randDat)
## 1 2 3 4 ## 2 5.751444 ## 3 4.167964 4.087795 ## 4 6.320828 4.483649 5.698951 ## 5 5.842614 4.734937 4.558000 5.846534
# Manhattan (city-block) distance: sum of absolute coordinate differences.
dist(randDat, method="manhattan")
## 1 2 3 4 ## 2 14.829505 ## 3 9.538933 10.806397 ## 4 15.929926 10.904098 16.781092 ## 5 14.420665 10.754002 11.112255 16.059797
# Minkowski distance with p = 4 (p = 2 gives Euclidean, p = 1 Manhattan).
dist(randDat, method="minkowski", p = 4)
## 1 2 3 4 ## 2 4.296051 ## 3 3.499051 2.868943 ## 4 4.331027 3.532843 3.475607 ## 5 4.028562 3.911365 3.583005 3.827518
What to do with categorical variables?
\(\delta_i(v_1, v_2)=\begin{cases} 1 & i \text{ is nominal and }v_1 \neq v_2 \\ 0 & i \text{ is nominal and }v_1 = v_2 \\ \frac{|v_1-v_2|}{range(i)} & i \text{ is numeric} \end{cases}\)
daisy()
function of the cluster
package.

How to measure the quality of a clustering?
\(H(C,k) = \sum_{c\in C}{\frac{h(c)}{|C|}}\)
\(H(C,k) = \min_{c\in C}{h(c)}\)
\(H(C,k) = \max_{c\in C}{h(c)}\)
How to measure \(h(c)\) ?
\(\displaystyle h(c) = \sum_{x\in c}{\sum_{i=1}^d {(v_{x,i}-\hat{v}_i^c)^2}}\)
\(\displaystyle h(c) = \sum_{x\in c}{\sum_{i=1}^d {|v_{x,i}-\tilde{v}_i^c|}}\)
# Cluster the four iris measurements (species column dropped) into k = 3 groups.
data(iris)
ir3 <- kmeans(iris[, -5], centers = 3, iter.max = 200)
ir3
## K-means clustering with 3 clusters of sizes 38, 50, 62 ## ## Cluster means: ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 6.850000 3.073684 5.742105 2.071053 ## 2 5.006000 3.428000 1.462000 0.246000 ## 3 5.901613 2.748387 4.393548 1.433871 ## ## Clustering vector: ## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ## [75] 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1 ## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 3 1 ## [149] 1 3 ## ## Within cluster sum of squares by cluster: ## [1] 23.87947 15.15100 39.82097 ## (between_SS / total_SS = 88.4 %) ## ## Available components: ## ## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" ## [6] "betweenss" "size" "iter" "ifault"
# Scatterplot of two of the four variables, coloured by cluster assignment.
plot(iris$Sepal.Length, iris$Sepal.Width, col = ir3$cluster)
# Cross-tabulate the cluster assignments against the true species labels.
table(ir3$cluster, iris$Species)
## ## setosa versicolor virginica ## 1 0 2 36 ## 2 50 0 0 ## 3 0 48 14
\(s_i=\frac{b_i-a_i}{\max(a_i, b_i)}\)
where \(a_i\) is the average dissimilarity of observation \(i\) to the other members of its own cluster, and \(b_i\) is the average dissimilarity of \(i\) to the members of the nearest neighbouring cluster.
The `silhouette()` function is implemented in the **cluster** package.
# Per-observation silhouette widths, computed from Euclidean distances.
s <- silhouette(ir3$cluster, dist(iris[, -5]))
# The third column holds the silhouette width; show the first 20 values.
s[1:20, 3]
## [1] 0.8529551 0.8154948 0.8293151 0.8050139 0.8493016 0.7482804 0.8216509 ## [8] 0.8539051 0.7521501 0.8252940 0.8031030 0.8359126 0.8105639 0.7461505 ## [15] 0.7025937 0.6437716 0.7756839 0.8510183 0.7068578 0.8203012
# Silhouette plot of all observations, grouped by cluster.
plot(s)