关注微信公共号:小程在线
关注CSDN博客:程志伟的博客
R版本:v_3.6.1
主要讲述5类聚类:
K-means聚类
K-中心聚类
系谱聚类
密度聚类
EM聚类
5种聚类的应用实例以及详细的参数说明如下:
数据导入数据:
> countries = read.csv('G:\\R语言\\大三下半年\\数据挖掘:R语言实战\\数据挖掘:R语言实战(案例数据集)\\07 聚类分析\\data.csv')
> head(countries)
V1 V2 V3
1 ALGERIA 36.4 14.6
2 CONGO 37.3 8.0
3 EGYPT 42.1 15.3
4 GHANA 55.8 25.6
5 IVORY COAST 56.1 33.1
6 MALAGASY 41.8 15.8
> #重命名
> names(countries) <- c('country','birth','death')
> var <- countries$country
> var <- as.character(var)
> head(var)
[1] "ALGERIA" "CONGO" "EGYPT" "GHANA" "IVORY COAST"
[6] "MALAGASY"
> #将数据的国家名字作为行名字
> for(i in 1:68) row.names(countries)[i] = var[i]
> head(countries)
country birth death
ALGERIA ALGERIA 36.4 14.6
CONGO CONGO 37.3 8.0
EGYPT EGYPT 42.1 15.3
GHANA GHANA 55.8 25.6
IVORY COAST IVORY COAST 56.1 33.1
MALAGASY MALAGASY 41.8 15.8
> #画出所有68个国家与地区的样本点
> plot(countries$birth,countries$death)
> C1 <- which(countries$country=='CHINA')
> #T1 <- which(countries$country='TAIWAN')
> #I1 <- which(countries$country='INDIA')
> #U1 <- which(countries$country='UNITED STATES')
> J1 <- which(countries$country=='JAPAN')
> M <- which.max(countries$country)
> points(countries[c(C1,J1,M),-1],pch=16)
> legend(countries$birth[C1],countries$death[C1],'CHINA',bty='n',xjust=0.5,cex=0.8)
> legend(countries$birth[J1],countries$death[J1],'JAPAN',bty='n',xjust=1,cex=0.8)
> ##############1.K-均值聚类######################
> fit_km1 <- kmeans(countries[,-1],centers = 3)
> #
> print(fit_km1)
K-means clustering with 3 clusters of sizes 15, 17, 36
Cluster means:
birth death
1 33.99333 8.860000
2 45.85294 14.305882
3 19.54722 9.172222
Clustering vector:
ALGERIA CONGO EGYPT GHANA IVORY COAST
1 1 2 2 2
MALAGASY MOROCCO TUNISIA CAMBODIA CEYLON
2 2 2 2 1
CHINA TAIWAN HONG KONG INDIA INDONESIA
1 1 1 3 1
IRAQ JAPAN JORDAN KOREA MALAYSIA
3 3 2 3 1
MONGOLIA PHILLLIPINES SYRIA THAILAND VIETNAM
1 1 3 1 3
CANADA COSTA RICA DOMINICAN R GUATEMALA HONDURAS
3 2 1 2 2
MEXICO NICARAGUA PANAMA UNITED STATES ARGENTINA
2 2 1 3 3
BOLIVIA BRAZIL CHILE COLOMBIA ECUADOR
3 2 1 2 2
PERU URUGUAY VENEZUELA AUSTRIA BElGIUM
1 3 2 3 3
BRITAIN BULGARIA CZECHOSLOVAKIA DENMARK FINLAND
3 3 3 3 3
E.GERMANY W.GERMANY GREECE HUNGARY IRELAND
3 3 3 3 3
ITALY NETHERLANDS NORWAY POLAND PORTUGAL
3 3 3 3 3
ROMANIA SPAIN SWEDEN SWITZERLAND U.S.S.R.
3 3 3 3 3
YUGOSLAVIA AUSTRALIA NEW ZEALAND
3 3 3
Within cluster sum of squares by cluster:
[1] 290.5053 1126.4718 640.1819
(between_SS / total_SS = 81.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
#查看中心点
> fit_km1$centers
birth death
1 33.99333 8.860000
2 45.85294 14.305882
3 19.54722 9.172222
#总平方和,组内平方和,组间平方和
> fit_km1$totss;fit_km1$tot.withinss;fit_km1$betweenss
[1] 10818.94
[1] 2057.159
[1] 8761.782
> plot(countries[,-1],pch=(fit_km1$centers-1))
> points(fit_km1$centers,pch=8)
> legend(fit_km1$centers[1,1],fit_km1$centers[1,2],'Center_1',bty='n',xjust=1,yjust = 0,cex=0.8)
> legend(fit_km1$centers[2,1],fit_km1$centers[2,2],'Center_2',bty='n',xjust=0,yjust = 0,cex=0.8)
> legend(fit_km1$centers[3,1],fit_km1$centers[3,2],'Center_3',bty='n',xjust=0.5,,cex=0.8)
#选择最优类别数,可以看出在10类的时候趋于稳定
> result <- rep(0,67)
> for(k in 1:67)
+ {
+ fit_km = kmeans(countries[,-1],centers = k)
+ result[k] = fit_km$betweenss/fit_km$totss
+ }
> round(result,2)
[1] 0.00 0.72 0.81 0.85 0.86 0.92 0.94 0.95 0.95 0.96 0.94 0.95 0.97 0.97 0.95
[16] 0.95 0.98 0.98 0.98 0.99 0.99 0.98 0.99 0.99 0.98 0.99 0.99 0.99 0.99 0.99
[31] 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 1.00 1.00 1.00 0.99 1.00
[46] 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
[61] 1.00 1.00 1.00 1.00 1.00 1.00 1.00
> #当K=10时,查看聚类
> fit_km2 <- kmeans(countries[,-1],centers = 10)
> cluster_china <-fit_km2$cluster[which(countries$country=='CHINA')]
> which(countries$country=='CHINA')
[1] 11
> which(fit_km2$cluster==cluster_china)
ALGERIA CHINA CHILE
1 11 38
> ####################2.K-中心聚类####################
> library(cluster)
> fit_pam <- pam(countries[,-1],3)
> print(fit_pam)
Medoids:
ID birth death
DOMINICAN R 28 33.0 8.4
COLOMBIA 39 44.0 11.7
SWITZERLAND 64 18.9 9.6
Clustering vector:
ALGERIA CONGO EGYPT GHANA IVORY COAST
1 1 2 2 2
MALAGASY MOROCCO TUNISIA CAMBODIA CEYLON
2 2 2 2 1
CHINA TAIWAN HONG KONG INDIA INDONESIA
1 1 1 3 1
IRAQ JAPAN JORDAN KOREA MALAYSIA
3 3 2 3 1
MONGOLIA PHILLLIPINES SYRIA THAILAND VIETNAM
2 1 1 1 3
CANADA COSTA RICA DOMINICAN R GUATEMALA HONDURAS
3 2 1 2 2
MEXICO NICARAGUA PANAMA UNITED STATES ARGENTINA
2 2 2 3 3
BOLIVIA BRAZIL CHILE COLOMBIA ECUADOR
3 2 1 2 2
PERU URUGUAY VENEZUELA AUSTRIA BElGIUM
1 3 2 3 3
BRITAIN BULGARIA CZECHOSLOVAKIA DENMARK FINLAND
3 3 3 3 3
E.GERMANY W.GERMANY GREECE HUNGARY IRELAND
3 3 3 3 3
ITALY NETHERLANDS NORWAY POLAND PORTUGAL
3 3 3 3 3
ROMANIA SPAIN SWEDEN SWITZERLAND U.S.S.R.
3 3 3 3 3
YUGOSLAVIA AUSTRALIA NEW ZEALAND
3 3 3
Objective function:
build swap
4.751737 4.378433
Available components:
[1] "medoids" "id.med" "clustering" "objective" "isolation"
[6] "clusinfo" "silinfo" "diss" "call" "data"
> #查看该聚类的数据集
> head(fit_pam$data)
birth death
ALGERIA 36.4 14.6
CONGO 37.3 8.0
EGYPT 42.1 15.3
GHANA 55.8 25.6
IVORY COAST 56.1 33.1
MALAGASY 41.8 15.8
> #查看该类结果的函数设置
> fit_pam$call
pam(x = countries[, -1], k = 3)
#keep.data=F时,无法获取聚类的数据集信息
#ciuster.only=F时,只显示各样本的类别
> which(fit_km1$cluster!=fit_pam$clustering)
MONGOLIA SYRIA PANAMA
21 23 33
> ########################3.系谱聚类#####################
> fit_hc <- hclust(dist(countries[,-1]))
> print(fit_hc)
Call:
hclust(d = dist(countries[, -1]))
Cluster method : complete
Distance : euclidean
Number of objects: 68
> plot(fit_hc)
> #利用剪枝,控制K的类别
> group_k3 <- cutree(fit_hc,k=3)
> group_k3
ALGERIA CONGO EGYPT GHANA IVORY COAST
1 1 1 2 2
MALAGASY MOROCCO TUNISIA CAMBODIA CEYLON
1 1 1 1 1
CHINA TAIWAN HONG KONG INDIA INDONESIA
1 1 1 3 3
IRAQ JAPAN JORDAN KOREA MALAYSIA
3 3 1 3 1
MONGOLIA PHILLLIPINES SYRIA THAILAND VIETNAM
1 3 3 1 3
CANADA COSTA RICA DOMINICAN R GUATEMALA HONDURAS
3 1 1 1 1
MEXICO NICARAGUA PANAMA UNITED STATES ARGENTINA
1 1 1 3 3
BOLIVIA BRAZIL CHILE COLOMBIA ECUADOR
3 1 1 1 1
PERU URUGUAY VENEZUELA AUSTRIA BElGIUM
3 3 1 3 3
BRITAIN BULGARIA CZECHOSLOVAKIA DENMARK FINLAND
3 3 3 3 3
E.GERMANY W.GERMANY GREECE HUNGARY IRELAND
3 3 3 3 3
ITALY NETHERLANDS NORWAY POLAND PORTUGAL
3 3 3 3 3
ROMANIA SPAIN SWEDEN SWITZERLAND U.S.S.R.
3 3 3 3 3
YUGOSLAVIA AUSTRALIA NEW ZEALAND
3 3 3
> table(group_k3)
group_k3
1 2 3
27 2 39
> #利用H参数控制高度
> group_h18 <- cutree(fit_hc,h=18)
> group_h18
ALGERIA CONGO EGYPT GHANA IVORY COAST
1 1 2 3 3
MALAGASY MOROCCO TUNISIA CAMBODIA CEYLON
2 2 2 2 1
CHINA TAIWAN HONG KONG INDIA INDONESIA
1 1 1 4 4
IRAQ JAPAN JORDAN KOREA MALAYSIA
4 4 2 4 1
MONGOLIA PHILLLIPINES SYRIA THAILAND VIETNAM
2 4 4 1 4
CANADA COSTA RICA DOMINICAN R GUATEMALA HONDURAS
4 2 1 2 2
MEXICO NICARAGUA PANAMA UNITED STATES ARGENTINA
2 2 2 4 4
BOLIVIA BRAZIL CHILE COLOMBIA ECUADOR
4 2 1 2 2
PERU URUGUAY VENEZUELA AUSTRIA BElGIUM
4 4 2 4 4
BRITAIN BULGARIA CZECHOSLOVAKIA DENMARK FINLAND
4 4 4 4 4
E.GERMANY W.GERMANY GREECE HUNGARY IRELAND
4 4 4 4 4
ITALY NETHERLANDS NORWAY POLAND PORTUGAL
4 4 4 4 4
ROMANIA SPAIN SWEDEN SWITZERLAND U.S.S.R.
4 4 4 4 4
YUGOSLAVIA AUSTRALIA NEW ZEALAND
4 4 4
> table(group_h18)
group_h18
1 2 3 4
10 17 2 39
> plot(fit_hc)
> rect.hclust(fit_hc,k=4,border = 'light grey')
> rect.hclust(fit_hc,k=3,border = 'dark grey')
> rect.hclust(fit_hc,k=7,which=c(2,6),border = 'light grey')
> ########################4.密度聚类#########################
> install.packages('fpc')
> library(fpc)
Warning message:
程辑包‘fpc’是用R版本3.6.2 来建造的
> ds1 <- dbscan(countries[,-1],eps=1,MinPts = 5)
> ds2 <- dbscan(countries[,-1],eps=4,MinPts = 5)
> ds3 <- dbscan(countries[,-1],eps=4,MinPts = 2)
> ds4 <- dbscan(countries[,-1],eps=8,MinPts = 5)
#当eps=1,MinPts = 5时,聚类分为2类,其中1类6个样本,互相密度可达,seed行,
#border对应3,类别密度边缘构成的类别,border对应的59,表示噪声
#当eps=4,MinPts = 5时,聚类分为4类,噪音5个
#当eps=4,MinPts = 2时,聚类为3,噪音3
#当eps=8,MinPts = 5时,聚类为1,噪音2
> ds1;ds2;ds3;ds4
dbscan Pts=68 MinPts=5 eps=1
0 1
border 59 3
seed 0 6
total 59 9
dbscan Pts=68 MinPts=5 eps=4
0 1 2
border 5 7 1
seed 0 18 37
total 5 25 38
dbscan Pts=68 MinPts=2 eps=4
0 1 2 3
border 3 0 0 0
seed 0 25 2 38
total 3 25 2 38
dbscan Pts=68 MinPts=5 eps=8
0 1
border 2 0
seed 0 66
total 2 66
#根据上面发现,半径参数与阈值的取值差距越大,聚类数越少,噪音少
> par(mfcol=c(2,2))
> plot(ds1,countries[,-1],main='1:eps=1,MinPts = 5')
> plot(ds3,countries[,-1],main='3:eps=4,MinPts = 2')
> plot(ds2,countries[,-1],main='2:eps=4,MinPts = 5')
> plot(ds4,countries[,-1],main='4:eps=8,MinPts = 5')
> d <- dist(countries[,-1])
> max(d);min(d)
[1] 49.56259
[1] 0.2236068
> library(ggplot2)
Warning message:
程辑包‘ggplot2’是用R版本3.6.2 来建造的
> interval <- cut_interval(d,30)
> table(interval)
interval
[0.224,1.87] (1.87,3.51] (3.51,5.16] (5.16,6.8] (6.8,8.45] (8.45,10.1]
78 156 222 201 151 121
(10.1,11.7] (11.7,13.4] (13.4,15] (15,16.7] (16.7,18.3] (18.3,20]
141 100 93 104 104 89
(20,21.6] (21.6,23.2] (23.2,24.9] (24.9,26.5] (26.5,28.2] (28.2,29.8]
101 97 101 100 83 75
(29.8,31.5] (31.5,33.1] (33.1,34.8] (34.8,36.4] (36.4,38.1] (38.1,39.7]
38 30 12 8 8 12
(39.7,41.3] (41.3,43] (43,44.6] (44.6,46.3] (46.3,47.9] (47.9,49.6]
11 14 13 8 5 2
> #样本点最多的区间
> which.max(table(interval))
(3.51,5.16]
3
> ######################5.期望最大化聚类 #################
> library(mclust)
__ ___________ __ _____________
/ |/ / ____/ / / / / / ___/_ __/
/ /|_/ / / / / / / / /\__ \ / /
/ / / / /___/ /___/ /_/ /___/ // /
/_/ /_/\____/_____/\____//____//_/ version 5.4.5
Type 'citation("mclust")' for citing this R package in publications.
Warning message:
程辑包‘mclust’是用R版本3.6.2 来建造的
> fit_em <- Mclust(countries[,-1])
fitting ...
|=====================================================================| 100%
> summary(fit_em)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust EII (spherical, equal volume) model with 4 components:
log-likelihood n df BIC ICL
-418.415 68 12 -887.464 -893.5937
Clustering table:
1 2 3 4
2 13 17 36
> #获取更加详细信息
> summary(fit_em,parameters=TRUE)
Error: unexpected input in "summary(fit_em?
> #获取更加详细信息
> summary(fit_em,parameters=TRUE)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust EII (spherical, equal volume) model with 4 components:
log-likelihood n df BIC ICL
-418.415 68 12 -887.464 -893.5937
Clustering table:
1 2 3 4
2 13 17 36
Mixing probabilities:
1 2 3 4
0.02941271 0.18048749 0.25049059 0.53960920
Means:
[,1] [,2] [,3] [,4]
birth 55.94969 33.550421 43.78269 19.721824
death 29.34963 8.512713 12.08308 9.192534
Variances:
[,,1]
birth death
birth 10.10353 0.00000
death 0.00000 10.10353
[,,2]
birth death
birth 10.10353 0.00000
death 0.00000 10.10353
[,,3]
birth death
birth 10.10353 0.00000
death 0.00000 10.10353
[,,4]
birth death
birth 10.10353 0.00000
death 0.00000 10.10353
> plot(fit_em)
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection: 1
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection: 2
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection: 3
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection: 4
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection:
>
> countries_BIC <- mclustBIC(countries[,-1])
fitting ...
|=====================================================================| 100%
> countries_BICsum <- summary(countries_BIC,data = countries[,-1])
> countries_BICsum
Best BIC values:
EII,4 EEI,4 EVI,4
BIC -887.464 -891.670811 -894.177259
BIC diff 0.000 -4.206773 -6.713222
Classification table for model (EII,4):
1 2 3 4
2 13 17 36
> countries_BIC
Bayesian Information Criterion (BIC):
EII VII EEI VEI EVI VVI EEE
1 -993.8000 -993.8000 -949.3886 -949.3886 -949.3886 -949.3886 -938.6563
2 -924.5847 -902.9238 -927.0827 -901.0607 -914.1093 -905.0255 -921.6755
3 -895.1280 -895.5875 -894.5426 -897.8874 -900.1559 -906.3466 -898.5011
4 -887.4640 -896.9525 -891.6708 -907.0904 -894.1773 -911.6143 -894.8700
5 -896.0308 NA -903.7036 -915.4377 -901.1471 -923.4156 -903.5466
6 -899.0789 NA -900.1866 -926.1524 -909.2644 -942.9946 -903.8949
7 -902.4795 NA -906.3870 -937.7140 -918.1703 -954.7813 -910.5878
8 -914.3185 NA -918.4934 -952.6815 -930.7168 -969.3971 -922.7029
9 -924.6331 NA -930.0508 -955.7891 -945.8924 -986.2344 -929.3172
EVE VEE VVE EEV VEV EVV VVV
1 -938.6563 -938.6563 -938.6563 -938.6563 -938.6563 -938.6563 -938.6563
2 -912.1381 -903.8662 -897.6158 -905.8647 -901.3006 -909.9233 -905.3002
3 -904.2435 -901.9792 -906.2489 -902.8644 -904.1626 -905.0875 -910.1495
4 NA -910.7905 -914.9819 -899.6887 -916.3686 NA -924.4019
5 NA -923.3082 -931.4274 -907.0621 -932.5586 -926.6488 -946.4810
6 NA -934.2484 -949.0195 -919.1423 -945.0698 NA -959.5636
7 NA -947.0690 -965.0627 -934.9488 -957.8668 NA -983.8418
8 NA -949.0828 -977.9138 -938.3300 -972.1705 NA -1000.5820
9 NA -972.0222 -999.4991 -981.5454 -987.6825 NA -1018.6391
Top 3 models based on the BIC criterion:
EII,4 EEI,4 EVI,4
-887.4640 -891.6708 -894.1773
|
请发表评论