Este prĆ”ctico estĆ” basado en el libro de Everitt y Hothorn āA Handbook of Statistical Analyses using Rā
library(HSAUR2)
## Loading required package: tools
#Conjuntos de datos:
attach(pottery)
data(pottery)
?pottery
head(pottery)
## Al2O3 Fe2O3 MgO CaO Na2O K2O TiO2 MnO BaO kiln
## 1 18.8 9.52 2.00 0.79 0.40 3.20 1.01 0.077 0.015 1
## 2 16.9 7.33 1.65 0.84 0.40 3.05 0.99 0.067 0.018 1
## 3 18.2 7.64 1.82 0.77 0.40 3.07 0.98 0.087 0.014 1
## 4 16.9 7.29 1.56 0.76 0.40 3.05 1.00 0.063 0.019 1
## 5 17.8 7.24 1.83 0.92 0.43 3.12 0.93 0.061 0.019 1
## 6 18.8 7.45 2.06 0.87 0.25 3.26 0.98 0.072 0.017 1
summary(pottery)
## Al2O3 Fe2O3 MgO CaO
## Min. :10.10 Min. :0.920 Min. :0.530 Min. :0.0100
## 1st Qu.:13.80 1st Qu.:5.390 1st Qu.:1.560 1st Qu.:0.1300
## Median :16.50 Median :6.920 Median :1.920 Median :0.3000
## Mean :15.71 Mean :5.756 Mean :2.488 Mean :0.5136
## 3rd Qu.:18.00 3rd Qu.:7.330 3rd Qu.:3.770 3rd Qu.:0.8300
## Max. :20.80 Max. :9.520 Max. :7.230 Max. :1.7300
## Na2O K2O TiO2 MnO
## Min. :0.0300 Min. :1.75 Min. :0.5600 Min. :0.00100
## 1st Qu.:0.1000 1st Qu.:2.93 1st Qu.:0.7200 1st Qu.:0.03500
## Median :0.2000 Median :3.15 Median :0.9100 Median :0.07200
## Mean :0.2429 Mean :3.20 Mean :0.8767 Mean :0.07051
## 3rd Qu.:0.3800 3rd Qu.:3.70 3rd Qu.:0.9800 3rd Qu.:0.09400
## Max. :0.8300 Max. :4.89 Max. :1.3400 Max. :0.16300
## BaO kiln
## Min. :0.00900 1:21
## 1st Qu.:0.01500 2:12
## Median :0.01700 3: 2
## Mean :0.01651 4: 5
## 3rd Qu.:0.01900 5: 5
## Max. :0.02300
dim(pottery)
## [1] 45 10
plot(pottery)
data(planets)
?planets
head(planets)
## mass period eccen
## 1 0.120 4.950 0.00
## 2 0.197 3.971 0.00
## 3 0.210 44.280 0.34
## 4 0.220 75.800 0.28
## 5 0.230 6.403 0.08
## 6 0.250 3.024 0.02
summary(planets)
## mass period eccen
## Min. : 0.050 Min. : 2.985 Min. :0.0000
## 1st Qu.: 0.930 1st Qu.: 44.280 1st Qu.:0.1000
## Median : 1.760 Median : 337.110 Median :0.2700
## Mean : 3.327 Mean : 666.531 Mean :0.2815
## 3rd Qu.: 4.140 3rd Qu.:1089.000 3rd Qu.:0.4100
## Max. :17.500 Max. :5360.000 Max. :0.9270
dim(planets)
## [1] 101 3
plot(planets)
# Classifying Romano-British Pottery (hierarchical-agglomerative)
library(lattice)
pottery_dist <- dist(pottery[, colnames(pottery) != "kiln"])
round(as.matrix(pottery_dist)[1:10, 1:10], 3)
## 1 2 3 4 5 6 7 8 9 10
## 1 0.000 2.925 1.986 2.967 2.502 2.079 3.510 2.268 3.844 4.981
## 2 2.925 0.000 1.349 0.127 0.931 1.965 1.042 1.237 1.142 2.349
## 3 1.986 1.349 0.000 1.372 0.591 0.723 2.045 0.551 2.466 3.686
## 4 2.967 0.127 1.372 0.000 0.960 1.991 1.118 1.286 1.132 2.343
## 5 2.502 0.931 0.591 0.960 0.000 1.074 1.549 0.467 2.029 3.231
## 6 2.079 1.965 0.723 1.991 1.074 0.000 2.503 0.819 3.054 4.265
## 7 3.510 1.042 2.045 1.118 1.549 2.503 0.000 1.736 1.258 2.150
## 8 2.268 1.237 0.551 1.286 0.467 0.819 1.736 0.000 2.284 3.489
## 9 3.844 1.142 2.466 1.132 2.029 3.054 1.258 2.284 0.000 1.250
## 10 4.981 2.349 3.686 2.343 3.231 4.265 2.150 3.489 1.250 0.000
levelplot(as.matrix(pottery_dist), xlab = "Pot Number",
ylab = "Pot Number")
pottery_single <- hclust(pottery_dist, method = "single")
pottery_single
##
## Call:
## hclust(d = pottery_dist, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 45
summary(pottery_single)
## Length Class Mode
## merge 88 -none- numeric
## height 44 -none- numeric
## order 45 -none- numeric
## labels 45 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
names(pottery_single)
## [1] "merge" "height" "order" "labels" "method"
## [6] "call" "dist.method"
pottery_complete <- hclust(pottery_dist, method = "complete")
pottery_average <- hclust(pottery_dist, method = "average")
layout(matrix(1:3, ncol = 3))
plot(pottery_single, main = "Single Linkage",
sub = "", xlab = "")
plot(pottery_complete, main = "Complete Linkage",
sub = "", xlab = "")
plot(pottery_average, main = "Average Linkage",
sub = "", xlab = "")
pottery_cluster <- cutree(pottery_average, h = 4)
pottery_cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
xtabs(~ pottery_cluster + kiln, data = pottery)
## kiln
## pottery_cluster 1 2 3 4 5
## 1 21 0 0 0 0
## 2 0 12 2 0 0
## 3 0 0 0 5 5
library("scatterplot3d")
## Warning: package 'scatterplot3d' was built under R version 3.4.4
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen), type = "h", angle = 55,
pch = 16, y.ticklabs = seq(0, 10, by = 2),
y.margin.add = 0.1, scale.y = 0.7)
rge <- apply(planets, 2, max) - apply(planets, 2, min)
c(-1, 1) %*% apply(planets, 2, range)
## mass period eccen
## [1,] 17.45 5357.015 0.927
planet.dat <- sweep(planets, 2, rge, FUN = "/")
n <- nrow(planet.dat)
wss <- rep(0, 10)
wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var))
for (i in 2:10)
wss[i] <- sum(kmeans(planet.dat,
centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of groups",
ylab = "Within groups sum of squares")
planet_kmeans3 <- kmeans(planet.dat, centers = 3)
planet_kmeans3
## K-means clustering with 3 clusters of sizes 53, 14, 34
##
## Cluster means:
## mass period eccen
## 1 0.09576256 0.07984122 0.1315524
## 2 0.60560786 0.31606632 0.3953614
## 3 0.16777347 0.11500361 0.5343613
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 3 3 1 3 1 1 1 1 1 3 3 3 3 1 3 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 3 3 1 1 3 1 1 3 3 3 3 1 1 1 1 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 1 1 1 1 1 3 3 1 1 3 3 1 3 3
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 2 3 1 3 3 3 3 3 1 1 1 3 2 3 2 3 2
## 91 92 93 94 95 96 97 98 99 100 101
## 2 3 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 1.755694 1.719130 1.787017
## (between_SS / total_SS = 57.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(planet_kmeans3$cluster)
##
## 1 2 3
## 53 14 34
ccent <- function(cl) {
f <- function(i) colMeans(planets[cl == i,])
x <- sapply(sort(unique(cl)), f)
colnames(x) <- sort(unique(cl))
return(x)
}
ccent(planet_kmeans3$cluster)
## 1 2 3
## mass 1.6710566 10.56786 2.9276471
## period 427.7105892 1693.17201 616.0760882
## eccen 0.1219491 0.36650 0.4953529
planet_kmeans5 <- kmeans(planet.dat, centers = 5)
table(planet_kmeans5$cluster)
##
## 1 2 3 4 5
## 33 33 15 14 6
ccent(planet_kmeans5$cluster)
## 1 2 3 4 5
## mass 1.8224242 1.680182 2.167333 10.56786 6.6683333
## period 539.8069697 358.270340 820.628067 1693.17201 278.2126667
## eccen 0.2871818 0.052100 0.542200 0.36650 0.6626667
Estudiar el argumento nstart de la funciĆ³n kmeans que realiza varias inicializaciones y devuelve el mejor modelo en el sentido que tendrĆ” la mĆnima varianza intraclusters.
library("mclust")
## Package 'mclust' version 5.4
## Type 'citation("mclust")' for citing this R package in publications.
planet_mclust <- Mclust(planet.dat)
plot(planet_mclust, planet.dat, what = "BIC", col = "black",
ylab = "-BIC", ylim = c(0, 350))
print(planet_mclust)
## 'Mclust' model object:
## best model: diagonal, varying volume and shape (VVI) with 3 components
clPairs(planet.dat, classification = planet_mclust$classification,
symbols = 1:3)
table(planet_mclust$classification)
##
## 1 2 3
## 14 44 43
ccent(planet_mclust$classification)
## 1 2 3
## mass 0.52085714 1.6759545 5.9307442
## period 5.16207357 272.5978000 1284.9554465
## eccen 0.02385714 0.2884545 0.3583791
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen), type = "h", angle = 55,
scale.y = 0.7, pch = planet_mclust$classification,
y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1,
color = as.numeric(planet_mclust$classification))
Otra posibilidad:
#install.packages("EMCluster")
library(EMCluster)
## Loading required package: MASS
## Loading required package: Matrix
set.seed(1234)
x1 <- da1$da
emobj <- simple.init(x1, nclass = 10)
emobj <- shortemcluster(x1, emobj)
summary(emobj)
## Method:
## n = 500, p = 2, nclass = 10, flag = , total parameters = 59,
## logL = -5827.1582, AIC = 11772.3164, BIC = 12020.9783.
## nc:
## [1] 10
## pi:
## [1] 0.07731 0.05203 0.01943 0.02477 0.19800 0.02424 0.29374 0.12548
## [9] 0.02601 0.15897
ret <- emcluster(x1, emobj, assign.class = TRUE)
summary(ret)
## Method:
## n = 500, p = 2, nclass = 10, flag = , total parameters = 59,
## logL = -5826.9679, AIC = 11771.9358, BIC = 12020.5977.
## nc:
## [1] 40 29 13 12 99 12 136 68 13 78
## pi:
## [1] 0.07732 0.05291 0.02013 0.02406 0.19800 0.02422 0.29371 0.12484
## [9] 0.02601 0.15878
?emcluster
Luxburg - A Tutorial on Spectral Clustering
The goal of spectral clustering is to cluster data that is connected but not necessarily clustered within convex boundaries.
library(mlbench)
set.seed(111)
obj <- mlbench.spirals(100,1,0.025)
my.data <- 4 * obj$x
plot(my.data)
We compute the similarity Matrix with a gaussian kernel:
s <- function(x1, x2, alpha=1) {
exp(- alpha * norm(as.matrix(x1-x2), type="F"))
}
make.similarity <- function(my.data, similarity) {
N <- nrow(my.data)
S <- matrix(rep(NA,N^2), ncol=N)
for(i in 1:N) {
for(j in 1:N) {
S[i,j] <- similarity(my.data[i,], my.data[j,])
}
}
S
}
S <- make.similarity(my.data, s)
S[1:8,1:8]
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00000000 0.064179185 0.74290158 0.63193426 0.098831073 0.094897744
## [2,] 0.06417919 1.000000000 0.06938066 0.04276431 0.214229495 0.275123731
## [3,] 0.74290158 0.069380663 1.00000000 0.61054893 0.089569089 0.088641808
## [4,] 0.63193426 0.042764307 0.61054893 1.00000000 0.062517586 0.059982837
## [5,] 0.09883107 0.214229495 0.08956909 0.06251759 1.000000000 0.776556494
## [6,] 0.09489774 0.275123731 0.08864181 0.05998284 0.776556494 1.000000000
## [7,] 0.56549848 0.048335201 0.66577557 0.71959220 0.059731778 0.059016049
## [8,] 0.03355084 0.008796359 0.04342047 0.04426067 0.005091154 0.005548028
## [,7] [,8]
## [1,] 0.56549848 0.033550836
## [2,] 0.04833520 0.008796359
## [3,] 0.66577557 0.043420466
## [4,] 0.71959220 0.044260673
## [5,] 0.05973178 0.005091154
## [6,] 0.05901605 0.005548028
## [7,] 1.00000000 0.058059785
## [8,] 0.05805979 1.000000000
We compute the Affinity Matrix applying a k nearest neighboor:
make.affinity <- function(S, n.neighboors=2) {
N <- length(S[,1])
if (n.neighboors >= N) { # fully connected
A <- S
} else {
A <- matrix(rep(0,N^2), ncol=N)
for(i in 1:N) { # for each line
# only connect to those points with larger similarity
best.similarities <- sort(S[i,], decreasing=TRUE)[1:n.neighboors]
for (s in best.similarities) {
j <- which(S[i,] == s)
A[i,j] <- S[i,j]
A[j,i] <- S[i,j] # to make an undirected graph, ie, the matrix becomes symmetric
}
}
}
A
}
A <- make.affinity(S, 3) # use 3 neighboors (includes self)
A[1:8,1:8]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1.0000000 0 0.7429016 0.6319343 0.0000000 0.0000000 0.0000000 0
## [2,] 0.0000000 1 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0
## [3,] 0.7429016 0 1.0000000 0.0000000 0.0000000 0.0000000 0.6657756 0
## [4,] 0.6319343 0 0.0000000 1.0000000 0.0000000 0.0000000 0.7195922 0
## [5,] 0.0000000 0 0.0000000 0.0000000 1.0000000 0.7765565 0.0000000 0
## [6,] 0.0000000 0 0.0000000 0.0000000 0.7765565 1.0000000 0.0000000 0
## [7,] 0.0000000 0 0.6657756 0.7195922 0.0000000 0.0000000 1.0000000 0
## [8,] 0.0000000 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 1
With this affinity matrix, clustering is replaced by a graph-partition problem, where connected graph components are interpreted as clusters. The graph must be partitioned such that edges connecting different clusters should have low weigths, and edges within the same cluster must have high values. Spectral clustering tries to construct this type of graph.
There is the need of a degree matrix D where each diagonal value is the degree of the respective vertex and all other positions are zero:
#Degree Matrix
D <- diag(apply(A, 1, sum)) # sum rows
D[1:8,1:8]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 2.374836 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [2,] 0.000000 2.597451 0.000000 0.000000 0.000000 0.000000 0.000000
## [3,] 0.000000 0.000000 2.408677 0.000000 0.000000 0.000000 0.000000
## [4,] 0.000000 0.000000 0.000000 2.351526 0.000000 0.000000 0.000000
## [5,] 0.000000 0.000000 0.000000 0.000000 2.523175 0.000000 0.000000
## [6,] 0.000000 0.000000 0.000000 0.000000 0.000000 2.519936 0.000000
## [7,] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.170424
## [8,] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [,8]
## [1,] 0.000000
## [2,] 0.000000
## [3,] 0.000000
## [4,] 0.000000
## [5,] 0.000000
## [6,] 0.000000
## [7,] 0.000000
## [8,] 2.302241
#Laplacian
U <- D - A
round(U[1:12,1:12],1)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 1.4 0.0 -0.7 -0.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## [2,] 0.0 1.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## [3,] -0.7 0.0 1.4 0.0 0.0 0.0 -0.7 0.0 0.0 0.0 0.0 0.0
## [4,] -0.6 0.0 0.0 1.4 0.0 0.0 -0.7 0.0 0.0 0.0 0.0 0.0
## [5,] 0.0 0.0 0.0 0.0 1.5 -0.8 0.0 0.0 0.0 0.0 0.0 0.0
## [6,] 0.0 0.0 0.0 0.0 -0.8 1.5 0.0 0.0 0.0 0.0 0.0 0.0
## [7,] 0.0 0.0 -0.7 -0.7 0.0 0.0 2.2 0.0 0.0 -0.8 0.0 0.0
## [8,] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.3 0.0 0.0 0.0 0.0
## [9,] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.5 0.0 0.0 0.0
## [10,] 0.0 0.0 0.0 0.0 0.0 0.0 -0.8 0.0 0.0 1.6 -0.8 0.0
## [11,] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.8 1.5 -0.8
## [12,] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.8 1.5
Assuming we want k clusters, the next step is to find the k smallest eigenvectors (ignoring the trivial constant eigenvector).
k <- 2
evL <- eigen(U, symmetric=TRUE)
The ith row of Z defines a transformation for observation xi. Letās check that they are well-separated. So, in this transformed space it becomes easy for a standard k-means clustering to find the appropriate clusters:
Z <- evL$vectors[,(ncol(evL$vectors)-k+1):ncol(evL$vectors)]
plot(Z, col=obj$classes, pch=20)
library(stats)
km <- kmeans(Z, centers=k, nstart=5)
plot(my.data, col=km$cluster)
If we donāt know how much clusters there are, the eigenvalue spectrum has a gap that give us the value of k.
signif(evL$values,2) # eigenvalues are in decreasing order
## [1] 3.3e+00 3.3e+00 3.0e+00 3.0e+00 2.9e+00 2.9e+00 2.8e+00
## [8] 2.8e+00 2.7e+00 2.7e+00 2.7e+00 2.7e+00 2.6e+00 2.6e+00
## [15] 2.6e+00 2.5e+00 2.5e+00 2.5e+00 2.5e+00 2.5e+00 2.4e+00
## [22] 2.4e+00 2.4e+00 2.3e+00 2.3e+00 2.3e+00 2.3e+00 2.3e+00
## [29] 2.2e+00 2.2e+00 2.2e+00 2.1e+00 2.1e+00 2.0e+00 2.0e+00
## [36] 2.0e+00 2.0e+00 1.9e+00 1.9e+00 1.8e+00 1.8e+00 1.7e+00
## [43] 1.7e+00 1.7e+00 1.6e+00 1.6e+00 1.6e+00 1.5e+00 1.5e+00
## [50] 1.4e+00 1.4e+00 1.4e+00 1.3e+00 1.3e+00 1.2e+00 1.2e+00
## [57] 1.1e+00 1.1e+00 1.1e+00 1.0e+00 9.7e-01 9.4e-01 8.8e-01
## [64] 8.6e-01 7.9e-01 7.7e-01 7.1e-01 6.9e-01 6.2e-01 6.1e-01
## [71] 5.5e-01 5.3e-01 4.7e-01 4.6e-01 4.1e-01 4.0e-01 3.4e-01
## [78] 3.3e-01 2.8e-01 2.8e-01 2.3e-01 2.2e-01 1.8e-01 1.8e-01
## [85] 1.4e-01 1.3e-01 1.0e-01 9.9e-02 7.0e-02 6.9e-02 4.4e-02
## [92] 4.4e-02 2.5e-02 2.5e-02 1.1e-02 1.1e-02 2.8e-03 2.7e-03
## [99] 3.2e-16 -3.9e-17
plot(1:10, rev(evL$values)[1:10], log="y")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot
abline(v=2.25, col="red", lty=2) # there are just 2 clusters as expected
Obviamente esto ya estĆ” hecho en R:
library(kernlab)
sc <- specc(my.data, centers=2)
plot(my.data, col=sc, pch=4) # estimated classes (x)
points(my.data, col=obj$classes, pch=5) # true classes (<>)
##Minimo Error Clasificaci??n
library("clue")
## Warning: package 'clue' was built under R version 3.4.4
library(clv)
## Loading required package: cluster
## Loading required package: class
data(iris)
iris.data <- iris[,1:4]
pam.mod <- pam(iris.data,3) # create three clusters
v.pred <- as.integer(pam.mod$clustering) # get cluster ids associated to given data objects
v.real <- as.integer(iris$Species) # get also real cluster ids
CE<-function(K1,K2){
n=length(K1)
M=table(K1,K2)
k=dim(M)[1]
l=dim(M)[2]
m=rep(0,k)
if (k<=l)
{y = solve_LSAP(M, maximum = TRUE)
tr = sum(M[cbind(seq_along(y), y)])
CE = 1-(tr/n)
}
else {
if (l == 1) {
CE = 1-(max(M)/n)
}
else {
if (k>l) {
M2<-t(M)
y = solve_LSAP(M2, maximum = TRUE)
tr = sum(M2[cbind(seq_along(y), y)])
CE = 1-(tr/n)
}}}
names(CE)<-"CE"
CE}
std <- std.ext(v.pred, v.real)
A=as.matrix(c(CE(v.pred,v.real),clv.Rand(std),clv.Jaccard(std),clv.Folkes.Mallows(std)))
rownames(A)=c( "MCE","Rand","Jaccard","FM")
A
## [,1]
## MCE 0.1066667
## Rand 0.8797315
## Jaccard 0.6958588
## FM 0.8208081