背景
聚类分析的目的是将观测数据聚为若干个簇(cluster)。与其他簇的数据相比,簇中的每个数据和本簇的数据更加相似。这里讨论两种方法:层次聚类和k-means聚类算法。
数据
本数据集为1973年欧洲25个国家里9个不同的食品组关于蛋白质消费的小型数据集,目标是根据蛋白质消费的模式将这些国家分组。数据集在这里
实现过程
加载数据
首先加载数据集,并查看数据概况。
protein <- read.table("D:/zmPDSwR-master/Protein/protein.txt", sep = "\t", header = T)
summary(protein)
Country RedMeat WhiteMeat Eggs Milk Fish
Albania : 1 Min. : 4.400 Min. : 1.400 Min. :0.500 Min. : 4.90 Min. : 0.200
Austria : 1 1st Qu.: 7.800 1st Qu.: 4.900 1st Qu.:2.700 1st Qu.:11.10 1st Qu.: 2.100
Belgium : 1 Median : 9.500 Median : 7.800 Median :2.900 Median :17.60 Median : 3.400
Bulgaria : 1 Mean : 9.828 Mean : 7.896 Mean :2.936 Mean :17.11 Mean : 4.284
Czechoslovakia: 1 3rd Qu.:10.600 3rd Qu.:10.800 3rd Qu.:3.700 3rd Qu.:23.30 3rd Qu.: 5.800
Denmark : 1 Max. :18.000 Max. :14.000 Max. :4.700 Max. :33.70 Max. :14.200
(Other) :19
Cereals Starch Nuts Fr.Veg
Min. :18.60 Min. :0.600 Min. :0.700 Min. :1.400
1st Qu.:24.30 1st Qu.:3.100 1st Qu.:1.500 1st Qu.:2.900
Median :28.00 Median :4.700 Median :2.400 Median :3.800
Mean :32.25 Mean :4.276 Mean :3.072 Mean :4.136
3rd Qu.:40.10 3rd Qu.:5.700 3rd Qu.:4.700 3rd Qu.:4.900
Max. :56.70 Max. :6.500 Max. :7.800 Max. :7.900
数据预处理
这里的预处理主要是归一化。
vars.to.use <- colnames(protein)[-1]
pmatrix <- scale(protein[, vars.to.use])
pcenter <- attr(pmatrix, "scaled:center")
pscale <- attr(pmatrix, "scaled:scale")
层次聚类
R语言中是hclust()
函数来进行层次聚类。
d <- dist(pmatrix, method = "euclidean")
pfit <- hclust(d, method = "ward.D")
plot(pfit, labels = protein$Country)
rect.hclust(pfit, k = 5)
提取聚类的结果:
groups <- cutree(pfit, k = 5)
print_clusters <- function(labels, k) {
for (i in 1:k) {
print(paste("cluster", i))
print(protein[labels == i, c("Country", "RedMeat", "Fish", "Fr.Veg")])
}
}
> print_clusters(groups, 5)
[1] "cluster 1"
Country RedMeat Fish Fr.Veg
1 Albania 10.1 0.2 1.7
4 Bulgaria 7.8 1.2 4.2
18 Romania 6.2 1.0 2.8
25 Yugoslavia 4.4 0.6 3.2
[1] "cluster 2"
Country RedMeat Fish Fr.Veg
2 Austria 8.9 2.1 4.3
3 Belgium 13.5 4.5 4.0
9 France 18.0 5.7 6.5
12 Ireland 13.9 2.2 2.9
14 Netherlands 9.5 2.5 3.7
21 Switzerland 13.1 2.3 4.9
22 UK 17.4 4.3 3.3
24 W Germany 11.4 3.4 3.8
[1] "cluster 3"
Country RedMeat Fish Fr.Veg
5 Czechoslovakia 9.7 2.0 4.0
7 E Germany 8.4 5.4 3.6
11 Hungary 5.3 0.3 4.2
16 Poland 6.9 3.0 6.6
23 USSR 9.3 3.0 2.9
[1] "cluster 4"
Country RedMeat Fish Fr.Veg
6 Denmark 10.6 9.9 2.4
8 Finland 9.5 5.8 1.4
15 Norway 9.4 9.7 2.7
20 Sweden 9.9 7.5 2.0
[1] "cluster 5"
Country RedMeat Fish Fr.Veg
10 Greece 10.2 5.9 6.5
13 Italy 9.0 3.4 6.7
17 Portugal 6.2 14.2 7.9
19 Spain 7.1 7.0 7.2
簇的可视化:
library(ggplot2)
princ <- prcomp(pmatrix)
nComp <- 2
project <- predict(princ, newdata = pmatrix)[, 1:nComp]
project.plus <- cbind(as.data.frame(project),
cluster = as.factor(groups),
country = protein$Country)
ggplot(project.plus, aes(x = PC1, y = PC2)) +
geom_point(aes(shape = cluster)) +
geom_text(aes(label = country), hjust = 0, vjust = 1)
对于聚类出的结果是否稳定,可以采用bootstrp重抽样来评价模型的稳定性,在R语言中是用clusterboot()
函数来实现:
library(fpc)
kbest.p <- 5 # 设置期望的簇的数量
> cboot.hclust <- clusterboot(pmatrix, clustermethod = hclustCBI, method = "ward.D", k = 5)
boot 1
boot 2
boot 3
...
boot 100
> summary(cboot.hclust$result)
Length Class Mode
result 7 hclust list
noise 1 -none- logical
nc 1 -none- numeric
clusterlist 5 -none- list
partition 25 -none- numeric
clustermethod 1 -none- character
nccl 1 -none- numeric
> groups <- cboot.hclust$result$partition
> print_clusters(groups, kbest.p)
[1] "cluster 1"
Country RedMeat Fish Fr.Veg
1 Albania 10.1 0.2 1.7
4 Bulgaria 7.8 1.2 4.2
18 Romania 6.2 1.0 2.8
25 Yugoslavia 4.4 0.6 3.2
[1] "cluster 2"
Country RedMeat Fish Fr.Veg
2 Austria 8.9 2.1 4.3
3 Belgium 13.5 4.5 4.0
9 France 18.0 5.7 6.5
12 Ireland 13.9 2.2 2.9
14 Netherlands 9.5 2.5 3.7
21 Switzerland 13.1 2.3 4.9
22 UK 17.4 4.3 3.3
24 W Germany 11.4 3.4 3.8
[1] "cluster 3"
Country RedMeat Fish Fr.Veg
5 Czechoslovakia 9.7 2.0 4.0
7 E Germany 8.4 5.4 3.6
11 Hungary 5.3 0.3 4.2
16 Poland 6.9 3.0 6.6
23 USSR 9.3 3.0 2.9
[1] "cluster 4"
Country RedMeat Fish Fr.Veg
6 Denmark 10.6 9.9 2.4
8 Finland 9.5 5.8 1.4
15 Norway 9.4 9.7 2.7
20 Sweden 9.9 7.5 2.0
[1] "cluster 5"
Country RedMeat Fish Fr.Veg
10 Greece 10.2 5.9 6.5
13 Italy 9.0 3.4 6.7
17 Portugal 6.2 14.2 7.9
19 Spain 7.1 7.0 7.2
> cboot.hclust$bootmean # 簇的稳定性向量
[1] 0.8405000 0.7686746 0.6144325 0.8562976 0.8080000
> cboot.hclust$bootbrd # 每一个簇被消除的次数
[1] 17 14 46 18 21
关于簇数量的选择,一般是要利用领域知识来选,也有各种启发式的方法来可以来帮助选择,下面的R语言实现的就是一种简单的方法是计算不同k值得总的簇内平方和(WSS),找到曲线的“肘弯”。
# 计算2个向量之间的平方距离
sqr_edist <- function(x, y) {
sum((x - y)^2)
}
# 计算单个簇的WSS,表示为一个矩阵(每个点一行)
wss.cluster <- function(clustermat) {
c0 <- apply(clustermat, 2, FUN = mean)
sum(apply(clustermat, 1, FUN = function(row) {sqr_edist(row, c0)}))
}
# 根据数据点集合和簇标签,计算总的WSS
wss.total <- function(dmatrix, labels) {
wsstot <- 0
k <- length(unique(labels))
for(i in 1:k)
wsstot <- wsstot + wss.cluster(subset(dmatrix, labels == i))
wsstot
}
想要发现这个“肘弯”不能靠肉眼,这里介绍的是Calinski-Harabasz指数,意思是簇间方差(本质上是所有簇的中心点到数据集的大中心点的方差)与总的簇内方差(基本上是聚类中簇的平均WSS)之间的比率。
totss <- function(dmatrix) {
grandmean <- apply(dmatrix, 2, FUN = mean)
sum(apply(dmatrix, 1, FUN = function(row){sqr_edist(row, grandmean)}))
}
ch_criterion <- function(dmatrix, kmax, method = "kmeans") {
if(!(method %in% c("kmeans", "hclust"))) {
stop("method must be one of c('kmeans', 'hclust')")
}
npts <- dim(dmatrix)[1]
totss <- totss(dmatrix)
wss <- numeric(kmax)
crit <- numeric(kmax)
wss[1] <- (npts - 1) * sum(apply(dmatrix, 2, var))
for (k in 2:kmax) {
if(method == "kmeans") {
clustering <- kmeans(dmatrix, k, nstart = 10, iter.max = 100)
wss[k] <- clustering$tot.withinss
} else {
d <- dist(dmatrix, method = "euclidean")
pfit <- hclust(d, method = "ward.D")
labels <- cutree(pfit, k = k)
wss[k] <- wss.total(dmatrix, labels)
}
}
bss <- totss - wss
crit.num <- bss / (0 : (kmax - 1))
crit.denom <- wss / (npts - 1:kmax)
list(crit = crit.num / crit.denom, wss = wss, totss = totss)
}
可视化来看一下:
library(reshape2)
clustcrit <- ch_criterion(pmatrix, 10, method = "hclust")
critframe <- data.frame(k = 1:10, ch = scale(clustcrit$crit), wss = scale(clustcrit$wss))
critframe <- melt(critframe, id.vars = c("k"), variable.name = "measure", value.name = "score")
ggplot(critframe, aes(x = k, y = score, color = measure)) +
geom_point(aes(shape = measure)) +
geom_line(aes(linetype = measure)) +
scale_x_continuous(breaks = 1:10, labels = 1:10) +
scale_colour_hue()
k-means聚类
> pclusters <- kmeans(pmatrix, kbest.p, nstart = 100, iter.max = 100)
> summary(pclusters)
Length Class Mode
cluster 25 -none- numeric
centers 45 -none- numeric
totss 1 -none- numeric
withinss 5 -none- numeric
tot.withinss 1 -none- numeric
betweenss 1 -none- numeric
size 5 -none- numeric
iter 1 -none- numeric
ifault 1 -none- numeric
> pclusters$centers
RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr.Veg
1 -0.807569986 -0.8719354 -1.55330561 -1.0783324 -1.0386379 1.7200335 -1.4234267 0.9961313 -0.64360439
2 0.006572897 -0.2290150 0.19147892 1.3458748 1.1582546 -0.8722721 0.1676780 -0.9553392 -1.11480485
3 -0.570049402 0.5803879 -0.08589708 -0.4604938 -0.4537795 0.3181839 0.7857609 -0.2679180 0.06873983
4 1.011180399 0.7421332 0.94084150 0.5700581 -0.2671539 -0.6877583 0.2288743 -0.5083895 0.02161979
5 -0.508801956 -1.1088009 -0.41248496 -0.8320414 0.9819154 0.1300253 -0.1842010 1.3108846 1.62924487
> pclusters$size
[1] 4 4 5 8 4
> groups <- pclusters$cluster
> print_clusters(groups, kbest.p)
[1] "cluster 1"
Country RedMeat Fish Fr.Veg
1 Albania 10.1 0.2 1.7
4 Bulgaria 7.8 1.2 4.2
18 Romania 6.2 1.0 2.8
25 Yugoslavia 4.4 0.6 3.2
[1] "cluster 2"
Country RedMeat Fish Fr.Veg
6 Denmark 10.6 9.9 2.4
8 Finland 9.5 5.8 1.4
15 Norway 9.4 9.7 2.7
20 Sweden 9.9 7.5 2.0
[1] "cluster 3"
Country RedMeat Fish Fr.Veg
5 Czechoslovakia 9.7 2.0 4.0
7 E Germany 8.4 5.4 3.6
11 Hungary 5.3 0.3 4.2
16 Poland 6.9 3.0 6.6
23 USSR 9.3 3.0 2.9
[1] "cluster 4"
Country RedMeat Fish Fr.Veg
2 Austria 8.9 2.1 4.3
3 Belgium 13.5 4.5 4.0
9 France 18.0 5.7 6.5
12 Ireland 13.9 2.2 2.9
14 Netherlands 9.5 2.5 3.7
21 Switzerland 13.1 2.3 4.9
22 UK 17.4 4.3 3.3
24 W Germany 11.4 3.4 3.8
[1] "cluster 5"
Country RedMeat Fish Fr.Veg
10 Greece 10.2 5.9 6.5
13 Italy 9.0 3.4 6.7
17 Portugal 6.2 14.2 7.9
19 Spain 7.1 7.0 7.2
绘制簇的准则:
> clustering.ch <- kmeansruns(pmatrix, krange = 1:10, criterion = "ch")
> clustering.ch$bestk
[1] 2
> clustering.asw <- kmeansruns(pmatrix, krange = 1:10, criterion = "asw")
> clustering.asw$bestk
[1] 3
> clustering.ch$crit
[1] 0.000000 14.094814 11.417985 10.418801 10.011797 9.964967 9.861682 9.412089 9.166676 9.075569
> clustcrit$crit
[1] NaN 12.215107 10.359587 9.690891 10.011797 9.964967 9.506978 9.092065 8.822406 8.695065
> critframe <- data.frame(k = 1:10, ch = scale(clustering.ch$crit), asw = scale(clustering.asw$crit))
> critframe <- melt(critframe, id.vars = c("k"), variable.name = "measure", value.name = "score")
> ggplot(critframe, aes(x = k, y = score, color = measure)) +
geom_point(aes(shape = measure)) +
geom_line(aes(linetype = measure)) +
scale_x_continuous(breaks = 1:10, labels = 1:10) +
scale_colour_hue()
> summary(clustering.ch)
Length Class Mode
cluster 25 -none- numeric
centers 18 -none- numeric
totss 1 -none- numeric
withinss 2 -none- numeric
tot.withinss 1 -none- numeric
betweenss 1 -none- numeric
size 2 -none- numeric
iter 1 -none- numeric
ifault 1 -none- numeric
crit 10 -none- numeric
bestk 1 -none- numeric
这里也用k-means算法来运行一下clusterboot()函数:
kbest.p <- 5
cboot <- clusterboot(pmatrix, clustermethod = kmeansCBI, runs = 100, iter.max = 100, krange = kbest.p, seed = 15555)
boot 1
boot 2
...
boot 100
> groups <- cboot$result$partition
> print_clusters(cboot$result$partition, kbest.p)
[1] "cluster 1"
Country RedMeat Fish Fr.Veg
1 Albania 10.1 0.2 1.7
4 Bulgaria 7.8 1.2 4.2
18 Romania 6.2 1.0 2.8
25 Yugoslavia 4.4 0.6 3.2
[1] "cluster 2"
Country RedMeat Fish Fr.Veg
6 Denmark 10.6 9.9 2.4
8 Finland 9.5 5.8 1.4
15 Norway 9.4 9.7 2.7
20 Sweden 9.9 7.5 2.0
[1] "cluster 3"
Country RedMeat Fish Fr.Veg
5 Czechoslovakia 9.7 2.0 4.0
7 E Germany 8.4 5.4 3.6
11 Hungary 5.3 0.3 4.2
16 Poland 6.9 3.0 6.6
23 USSR 9.3 3.0 2.9
[1] "cluster 4"
Country RedMeat Fish Fr.Veg
2 Austria 8.9 2.1 4.3
3 Belgium 13.5 4.5 4.0
9 France 18.0 5.7 6.5
12 Ireland 13.9 2.2 2.9
14 Netherlands 9.5 2.5 3.7
21 Switzerland 13.1 2.3 4.9
22 UK 17.4 4.3 3.3
24 W Germany 11.4 3.4 3.8
[1] "cluster 5"
Country RedMeat Fish Fr.Veg
10 Greece 10.2 5.9 6.5
13 Italy 9.0 3.4 6.7
17 Portugal 6.2 14.2 7.9
19 Spain 7.1 7.0 7.2
> cboot$bootmean
[1] 0.8670000 0.8420714 0.6147024 0.7647341 0.7508333
> cboot$bootbrd
[1] 15 20 49 17 32
分派点到新的簇
assign_cluster <- function(newpt, centers, xcenter = 0, xscale = 1) {
xpt <- (newpt - xcenter) / xscale
dists <- apply(centers, 1, FUN = function(c0) {sqr_edist(c0, xpt)})
which.min(dists) # 返回最接近的中心点编号
}
rnorm.multidim <- function(n, mean, sd, colstr = "x") {
ndim <- length(mean)
data <- NULL
for(i in 1:ndim) {
col <- rnorm(n, mean = mean[[i]], sd = sd[[i]])
data <- cbind(data, col)
}
cnames <- paste(colstr, 1:ndim, sep = '')
colnames(data) <- cnames
data
}
mean1 <- c(1, 1, 1) # 高斯分布的参数
sd1 <- c(1, 2, 1)
mean2 <- c(10, -3, 5)
sd2 <- c(2, 1, 2)
mean3 <- c(-5, -5, -5)
sd3 <- c(1.5, 2, 1)
clust1 <- rnorm.multidim(100, mean1, sd1) # 创建100个点的数据集,每个点都从上述数据分布中产生
clust2 <- rnorm.multidim(100, mean2, sd2)
clust3 <- rnorm.multidim(100, mean3, sd3)
toydata <- rbind(clust3, rbind(clust1, clust2))
tmatrix <- scale(toydata)
tcenter <- attr(tmatrix, "scaled:center")
tscale <- attr(tmatrix, "scaled:scale")
kbest.t <- 3
tclusters <- kmeans(tmatrix, kbest.t, nstart = 100, iter.max = 100) # 采用k-means算法,用3个簇对数据集进行聚类
> tclusters$size
[1] 101 99 100
unscale <- function(scaledpt, centervec, scalevec) { # “还原”数据点(将它们放回到原始数据集的坐标中)
scaledpt * scalevec + centervec
}
> unscale(tclusters$centers[1, ], tcenter, tscale)
x1 x2 x3
9.630898 -3.084541 4.949446
> mean2
[1] 10 -3 5
> unscale(tclusters$centers[2, ], tcenter, tscale)
x1 x2 x3
-4.833429 -5.018246 -5.064261
> mean3
[1] -5 -5 -5
> unscale(tclusters$centers[3, ], tcenter, tscale)
x1 x2 x3
1.0029535 0.7651268 0.8445897
> mean1
[1] 1 1 1
> assign_cluster(rnorm.multidim(1, mean1, sd1),
tclusters$centers,
tcenter, tscale)
3
3
> assign_cluster(rnorm.multidim(1, mean2, sd1),
tclusters$centers,
tcenter, tscale)
1
1
> assign_cluster(rnorm.multidim(1, mean3, sd1),
tclusters$centers,
tcenter, tscale)
2
2
聚类总结
- 聚类的目的是发现或找出数据子集之间的相似性。
- 在一个好的聚类中,同一个簇中的点,应该比这些点与其他簇中的点更相似(相接近)。
- 聚类时,每一个变量单位是变量的一种度量方式。不同的单位会导致计算的距离不同,进而潜在地导致了不同的聚类结果。
- 理想情况下,你想要的每一个坐标单位代表相同程度的变化。一种近似方法是把所有的列转化成均值为0,标准差为1.0的数据分布,比如
scale()
函数。 - 聚类常常被当作是数据探索的一部分,或者是监督学习方法的前提。
- 像可视化这样的方法一样,聚类具有更多迭代和交互过程,并且和监督方法相比,自动化程度更低。
- 不同的聚类算法会给出不同的结果,你应该考虑不同的聚类方法,并采用不同的簇的数目。
- 有很多估计最佳簇数的启发式算法。再次重申,你应该仔细考虑不同的启发式算法的结果,并探索各种不同的簇数。
参考文献
[1][数据科学 理论、方法与R语言实践]