---
title: "ClusterPractice"
author: "Mathias Bourel"
date: "19/11/2018"
output: html_document
---

This practical is based on the book by Everitt and Hothorn, "A Handbook of Statistical Analyses using R".

```{r}
library(HSAUR2)
# Data sets:
data(pottery)
attach(pottery)
?pottery
head(pottery)
summary(pottery)
dim(pottery)
plot(pottery)

data(planets)
?planets
head(planets)
summary(planets)
dim(planets)
plot(planets)
```

# 1 - Hierarchical Clustering

```{r}
# Classifying Romano-British pottery (hierarchical-agglomerative clustering)
library(lattice)

# Euclidean distances on the chemical composition variables
# (the grouping variable "kiln" is excluded)
pottery_dist <- dist(pottery[, colnames(pottery) != "kiln"])
round(as.matrix(pottery_dist)[1:10, 1:10], 3)
levelplot(as.matrix(pottery_dist), xlab = "Pot Number", ylab = "Pot Number")

# Agglomerative clustering with three linkage criteria
pottery_single <- hclust(pottery_dist, method = "single")
pottery_single
summary(pottery_single)
names(pottery_single)
pottery_complete <- hclust(pottery_dist, method = "complete")
pottery_average <- hclust(pottery_dist, method = "average")

# Compare the three dendrograms side by side
layout(matrix(1:3, ncol = 3))
plot(pottery_single, main = "Single Linkage", sub = "", xlab = "")
plot(pottery_complete, main = "Complete Linkage", sub = "", xlab = "")
plot(pottery_average, main = "Average Linkage", sub = "", xlab = "")

# Cut the average-linkage dendrogram at height 4 and cross-tabulate
# the clusters against the kiln of origin
pottery_cluster <- cutree(pottery_average, h = 4)
pottery_cluster
xtabs(~ pottery_cluster + kiln, data = pottery)
```

# 2 - K-means

```{r}
library("scatterplot3d")
scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen),
              type = "h", angle = 55, pch = 16,
              y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1, scale.y = 0.7)

# Scale every variable by its range (the next two lines compute the same ranges)
rge <- apply(planets, 2, max) - apply(planets, 2, min)
c(-1, 1) %*% apply(planets, 2, range)
planet.dat <- sweep(planets, 2, rge, FUN = "/")

# Within-groups sum of squares for k = 1, ..., 10 ("elbow" plot)
n <- nrow(planet.dat)
wss <- rep(0, 10)
wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var))
for (i in 2:10)
  wss[i] <- sum(kmeans(planet.dat, centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of groups",
     ylab = "Within groups sum of squares")

planet_kmeans3 <- kmeans(planet.dat, centers = 3)
planet_kmeans3
table(planet_kmeans3$cluster)

# Cluster centres expressed on the original (unscaled) variables
ccent <- function(cl) {
  f <- function(i) colMeans(planets[cl == i, ])
  x <- sapply(sort(unique(cl)), f)
  colnames(x) <- sort(unique(cl))
  return(x)
}
ccent(planet_kmeans3$cluster)

planet_kmeans5 <- kmeans(planet.dat, centers = 5)
table(planet_kmeans5$cluster)
ccent(planet_kmeans5$cluster)
```

Study the `nstart` argument of the `kmeans` function, which performs several random initializations and returns the best model, in the sense of having the smallest within-cluster variance.
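As an illustration, a minimal sketch (the object names `km_1` and `km_25` and the seed are arbitrary choices) comparing a single random start with 25 restarts; `kmeans` keeps the run with the smallest total within-cluster sum of squares:

```{r}
# With nstart = 25, kmeans() runs 25 random initializations and keeps
# the solution with the smallest tot.withinss
set.seed(42)
km_1  <- kmeans(planet.dat, centers = 3, nstart = 1)
km_25 <- kmeans(planet.dat, centers = 3, nstart = 25)
c(single_start = km_1$tot.withinss, best_of_25 = km_25$tot.withinss)
```

The best-of-25 value can never be larger than the single-start value; on small data sets the two often coincide.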
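For later comparison with the model-based classification plotted in the next section, a sketch showing the three k-means clusters in the same log-transformed coordinates (the symbol and colour mapping is an arbitrary choice):

```{r}
# 3D scatterplot of the k-means solution, mirroring the Mclust plot below
scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen),
              type = "h", angle = 55, scale.y = 0.7,
              pch = planet_kmeans3$cluster,
              y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1,
              color = as.numeric(planet_kmeans3$cluster))
```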
# 3 - Model-Based Clustering

```{r}
library("mclust")
planet_mclust <- Mclust(planet.dat)
# BIC of each covariance model for each number of mixture components
plot(planet_mclust, what = "BIC")
print(planet_mclust)
# Pairwise scatterplots marked by the estimated classification
clPairs(planet.dat, classification = planet_mclust$classification,
        symbols = 1:planet_mclust$G)
table(planet_mclust$classification)
ccent(planet_mclust$classification)
scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen),
              type = "h", angle = 55, scale.y = 0.7,
              pch = planet_mclust$classification,
              y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1,
              color = as.numeric(planet_mclust$classification))
```

Another possibility:

```{r}
# install.packages("EMCluster")
library(EMCluster)
set.seed(1234)
x1 <- da1$da                           # simulated data set shipped with EMCluster
emobj <- simple.init(x1, nclass = 10)  # random initialization with 10 classes
emobj <- shortemcluster(x1, emobj)     # short EM runs to refine the starting values
summary(emobj)
ret <- emcluster(x1, emobj, assign.class = TRUE)  # full EM until convergence
summary(ret)
?emcluster
```

# 4 - Spectral Clustering

[Luxburg - A Tutorial on Spectral Clustering](http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf)

[João Neto](http://www.di.fc.ul.pt/~jpn/r/spectralclustering/spectralclustering.html)

The goal of spectral clustering is to cluster data that is connected but not necessarily clustered within convex boundaries.

```{r}
library(mlbench)
set.seed(111)
obj <- mlbench.spirals(100, 1, 0.025)
my.data <- 4 * obj$x
plot(my.data)
```

We compute the similarity matrix with a Gaussian kernel:

```{r}
s <- function(x1, x2, alpha = 1) {
  exp(-alpha * norm(as.matrix(x1 - x2), type = "F"))
}

make.similarity <- function(my.data, similarity) {
  N <- nrow(my.data)
  S <- matrix(rep(NA, N^2), ncol = N)
  for (i in 1:N) {
    for (j in 1:N) {
      S[i, j] <- similarity(my.data[i, ], my.data[j, ])
    }
  }
  S
}

S <- make.similarity(my.data, s)
S[1:8, 1:8]
```

We compute the affinity matrix by keeping, for each point, only the similarities to its k nearest neighbours:

```{r}
make.affinity <- function(S, n.neighboors = 2) {
  N <- length(S[, 1])
  if (n.neighboors >= N) {  # fully connected
    A <- S
  } else {
    A <- matrix(rep(0, N^2), ncol = N)
    for (i in 1:N) {  # for each row
      # only connect to those points with larger similarity
      best.similarities <- sort(S[i, ], decreasing = TRUE)[1:n.neighboors]
      for (s in best.similarities) {
        j <- which(S[i, ] == s)
        A[i, j] <- S[i, j]
        A[j, i] <- S[i, j]  # make the graph undirected, i.e. the matrix symmetric
      }
    }
  }
  A
}

A <- make.affinity(S, 3)  # use 3 neighbours (includes self)
A[1:8, 1:8]
```

With this affinity matrix, clustering becomes a graph-partitioning problem in which connected graph components are interpreted as clusters. The graph should be partitioned so that edges connecting different clusters have low weights and edges within the same cluster have high weights; spectral clustering tries to construct this type of graph. We need a degree matrix D, where each diagonal value is the degree of the respective vertex and all other positions are zero:

```{r}
# Degree matrix
D <- diag(apply(A, 1, sum))  # sum rows
D[1:8, 1:8]
```

```{r}
# Unnormalized graph Laplacian
U <- D - A
round(U[1:12, 1:12], 1)
```

Assuming we want k clusters, the next step is to find the eigenvectors associated with the k smallest eigenvalues of the Laplacian (ignoring the trivial constant eigenvector of the zero eigenvalue).

```{r}
k <- 2
evL <- eigen(U, symmetric = TRUE)  # eigen() returns eigenvalues in decreasing order
```
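Before transforming the data, a quick sanity check on the objects just built (a sketch; the tolerance `1e-10` is an arbitrary numerical threshold): every row of the unnormalized Laplacian sums to zero, and the multiplicity of its (numerically) zero eigenvalue equals the number of connected components of the graph.

```{r}
# Rows of U = D - A sum to (numerically) zero
range(rowSums(U))
# Eigenvalues close to zero count the connected components of the graph
sum(abs(evL$values) < 1e-10)
```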
The i-th row of Z defines a transformation of the i-th observation. Let's check that the transformed points are well separated:

```{r}
# Eigenvectors of the k smallest eigenvalues: the last k columns,
# since eigen() sorts eigenvalues in decreasing order
Z <- evL$vectors[, (ncol(evL$vectors) - k + 1):ncol(evL$vectors)]
plot(Z, col = obj$classes, pch = 20)
```

So, in this transformed space it becomes easy for a standard k-means clustering to find the appropriate clusters:

```{r}
library(stats)
km <- kmeans(Z, centers = k, nstart = 5)
plot(my.data, col = km$cluster)
```

If we do not know how many clusters there are, the eigenvalue spectrum has a gap that gives us the value of k:

```{r}
signif(evL$values, 2)  # eigenvalues are in decreasing order
```

```{r}
plot(1:10, rev(evL$values)[1:10], log = "y")
abline(v = 2.25, col = "red", lty = 2)  # there are just 2 clusters, as expected
```

Obviously, this is already implemented in R:

```{r}
library(kernlab)
sc <- specc(my.data, centers = 2)
plot(my.data, col = sc, pch = 4)             # estimated classes (x)
points(my.data, col = obj$classes, pch = 5)  # true classes (<>)
```

# 5 - External Validation

```{r}
## Minimum classification error
library("clue")   # solve_LSAP(): Hungarian algorithm for optimal label matching
library(clv)
library(cluster)  # pam()
data(iris)
iris.data <- iris[, 1:4]
pam.mod <- pam(iris.data, 3)              # create three clusters
v.pred <- as.integer(pam.mod$clustering)  # cluster ids assigned to the observations
v.real <- as.integer(iris$Species)        # true class ids

# Minimum classification error: optimally match predicted and true labels
# (Hungarian algorithm), then compute the proportion of mismatches.
# solve_LSAP() needs at least as many columns as rows, hence the transpose.
CE <- function(K1, K2) {
  n <- length(K1)
  M <- table(K1, K2)
  k <- dim(M)[1]
  l <- dim(M)[2]
  if (k <= l) {
    y <- solve_LSAP(M, maximum = TRUE)
    CE <- 1 - sum(M[cbind(seq_along(y), y)]) / n
  } else if (l == 1) {
    CE <- 1 - max(M) / n
  } else {  # k > l
    M2 <- t(M)
    y <- solve_LSAP(M2, maximum = TRUE)
    CE <- 1 - sum(M2[cbind(seq_along(y), y)]) / n
  }
  names(CE) <- "CE"
  CE
}

std <- std.ext(v.pred, v.real)
A <- as.matrix(c(CE(v.pred, v.real), clv.Rand(std),
                 clv.Jaccard(std), clv.Folkes.Mallows(std)))
rownames(A) <- c("MCE", "Rand", "Jaccard", "FM")
A
```
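Since CE matches labels optimally, it should be invariant to relabelling the predicted clusters. A quick check (a sketch; the permutation `c(2, 3, 1)` is arbitrary), together with the chance-corrected adjusted Rand index from mclust (loaded earlier) as a complementary measure:

```{r}
# Permuting the predicted cluster ids must not change the error
v.perm <- c(2, 3, 1)[v.pred]
c(CE(v.pred, v.real), CE(v.perm, v.real))

# Chance-corrected agreement between the PAM partition and the true species
mclust::adjustedRandIndex(v.pred, v.real)
```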