FCM <- function(x, K, mybeta = 2, nstart = 1, iter_max = 100, eps = 1e-06) {
    ## FCM
    ## INPUTS
    ## x: input matrix n*d, n d-dim samples
    ## K: number of desired clusters
    ## Optional:
    ## mybeta: beta, the exponent for u (default 2)
    ## nstart: how many random initializations should be tried (default 1)
    ## iter_max: the maximum number of iterations allowed (default 100)
    ##
    ## OUTPUTS
    ## u: the fuzzy membership matrix, size n*K
    ## g: matrix of size K*d holding the cluster centers
    ## J: final value of the objective function
    ## histJ: all objective function values over the iterations
    ## modified time: 2015-02-07
    FCM_onetime <- function(x, init_centers, mybeta = 2, iter_max = 100, eps = 1e-06) {
        n = dim(x)[1]
        d = dim(x)[2]
        g = init_centers
        K = dim(g)[1]
        histJ = c()
        pasfini = 1
        Jold = Inf
        # squared Euclidean distance from every sample to every center
        D = matrix(0, n, K)
        for (j in 1:K) {
            D[, j] = rowSums(sweep(x, 2, g[j, ], "-")^2)
        }
        iter = 1
        while (pasfini) {
            # membership update: u_ij proportional to (1/D_ij)^(1/(beta-1)),
            # then normalized so each row of u sums to 1
            s = (1/(D + eps))^(1/(mybeta - 1))
            u = s/(s %*% matrix(1, K, K))
            # center update: weighted mean of the samples with weights u^beta
            t1 = t(u^mybeta) %*% x
            t2 = t(u^mybeta) %*% matrix(1, n, d)
            g = t1/t2
            D = matrix(0, n, K)
            for (j in 1:K) {
                D[, j] = rowSums(sweep(x, 2, g[j, ], "-")^2)
            }
            J = sum(u^mybeta * D)
            # stop when the objective changes by less than 0.001 or iter_max is reached
            pasfini = abs(J - Jold) > 0.001 && (iter < iter_max)
            Jold = J
            histJ = c(histJ, J)
            iter = iter + 1
        }
        cluster_id = apply(u, 1, which.max)
        re = list(u, J, histJ, g, cluster_id)
        names(re) = c("u", "J", "histJ", "g", "cluster_id")
        return(re)
    }
    x = as.matrix(x)
    seeds = 1:nrow(x)
    id = sample(seeds, K)
    g = as.matrix(x[id, ])
    re_best = FCM_onetime(x = x, init_centers = g, mybeta = mybeta, iter_max = iter_max, eps = eps)
    # additional random restarts: keep the run with the smallest objective
    if (nstart > 1) {
        i = 2
        while (i <= nstart) {
            init_centers_id = sample(seeds, K)
            init_centers = as.matrix(x[init_centers_id, ])
            run = FCM_onetime(x, init_centers = init_centers, mybeta = mybeta, iter_max = iter_max, eps = eps)
            if (run$J <= re_best$J) {
                re_best = run
            }
            i = i + 1
        }
    }
    return(re_best)
}
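A minimal usage sketch (my own addition, not part of the original post): the function only needs a numeric matrix, so the built-in iris measurements make a quick test, with kmeans used for a rough comparison of the hard assignments.

# Usage sketch for FCM(), assuming the function above has been sourced.
set.seed(1)
xm  = as.matrix(iris[, 1:4])                   # 150 samples, 4 numeric features
fit = FCM(xm, K = 3, mybeta = 2, nstart = 5)
round(head(fit$u), 3)                          # fuzzy memberships; each row sums to 1
fit$g                                          # 3 x 4 matrix of cluster centers
table(fit$cluster_id, kmeans(xm, 3)$cluster)   # hard labels vs. kmeans (label ids are arbitrary)
plot(fit$histJ, type = "b", xlab = "iteration", ylab = "objective J")

Both this function and the step-by-step implementation below alternate the standard fuzzy c-means updates. In textbook form (a sketch; m is the fuzzifier, called mybeta above), the objective and the two closed-form updates are roughly:

J_m = \sum_{i=1}^{n} \sum_{j=1}^{K} u_{ij}^{m} \,\lVert x_i - c_j \rVert^2 ,
\qquad \text{subject to } \sum_{j=1}^{K} u_{ij} = 1,\; u_{ij} \ge 0,

c_j = \frac{\sum_{i=1}^{n} u_{ij}^{m} x_i}{\sum_{i=1}^{n} u_{ij}^{m}},
\qquad
u_{ij} = \frac{\lVert x_i - c_j \rVert^{-2/(m-1)}}{\sum_{l=1}^{K} \lVert x_i - c_l \rVert^{-2/(m-1)}} .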
# The fuzzy c-means objective and its update formulas are roughly as sketched above.
# The main code follows (kmeans is used for comparison, and an SVM classifier is trained as an additional check).

# set the pseudo-random seed
set.seed(100)

# generate sample data: nclass Gaussian clusters with random covariance matrices
simple.data = function(n = 200, nclass = 2)
{
    require(clusterGeneration)
    require(mvtnorm)
    # centers of the Gaussians
    xpos = seq(-nclass*2, nclass*2, length = nclass)
    ypos = runif(nclass, min = -2*nclass, max = 2*nclass)
    func = function(i, xpos, ypos, n)
    {
        # create a random covariance matrix
        cov = genPositiveDefMat(2, covMethod = "eigen", rangeVar = c(1, 10),
                                lambdaLow = 1, ratioLambda = 10)
        # draw n samples from this Gaussian
        data = rmvnorm(n = n, mean = c(xpos[i], ypos[i]), sigma = cov$Sigma)
        # keep the result for this class
        list(means = cbind(xpos[i], ypos[i]), covars = cov$Sigma,
             data = data, class = rep(i*1.0, n))
    }
    # do.call(rbind, ...) merges the per-class lists into one structure
    strL = do.call(rbind, lapply(1:nclass, func, xpos, ypos, n))
    data = list()
    data$means  = do.call(rbind, strL[, 1])
    data$covars = as.list(strL[, 2])
    data$data   = do.call(rbind, strL[, 3])
    data$class  = do.call(c, strL[, 4])
    # return the data
    data
}

# initial random membership matrix u; k = number of clusters, nr = number of points
random.uij = function(k, nr)
{
    u = matrix(data = round(runif(k*nr, 10, 20)), nrow = k, ncol = nr,
               dimnames = list(paste('u', 1:k, sep = ""), paste('x', 1:nr, sep = '')))
    tempu = function(x)
    {
        ret = round(x/sum(x), 4)
        # make sure every column sums to 1
        ret[1] = 1 - sum(ret[-1])
        ret
    }
    apply(u, 2, tempu)
}

# distance from every sample point to every center
dist_cc_dd = function(cc, dd)
{
    # cc: matrix of centers (one row per center); dd: sample points stored column-wise
    temp = function(cc, dd)
    {
        # distance from one center to every point
        temp1 = function(index)
        {
            sqrt(sum(index^2))
        }
        # the result is stored column-wise; it is transposed to row-wise below
        apply(dd - cc, 2, temp1)
    }
    # transpose so that rows correspond to centers
    t(apply(cc, 1, temp, dd))
}

# one iteration of fuzzy c-means
fuzzy.cmeans = function(data, u, m = 3)
{
    # simple guard; not strictly necessary
    if (is.array(data) || is.matrix(data)) {
        data = as.data.frame(data)
    }
    # nr = nrow(data)
    # nc = ncol(data)
    # while (J > lim && step < steps)
    # {
    #     step = step + 1
    # u_ij raised to the power m
    um = u^m
    rowsum = apply(um, 1, sum)
    # compute the centers c_i
    cc = as.matrix(um/rowsum) %*% as.matrix(data)
    # rownames(cc) = paste('c', 1:k, sep = '')
    # colnames(cc) = paste('x', 1:nc, sep = '')
    # compute the objective J
    distance = dist_cc_dd(cc, t(data))
    J = sum(distance^2 * um)
    # cc_temp = matrix(rep(cc, each = nr), ncol = 2)
    # dd_temp = NULL
    # lapply(1:k, function(i){dd_temp <<- rbind(dd_temp, data)})
    # dist = apply((dd_temp - cc_temp)^2, 1, sum)
    # um_temp = as.vector(t(um))
    # J = um_temp %*% dist
    # exponent used in the membership update; requires m != 1
    pw = -2/(m - 1)
    # update the memberships according to the formula
    tmp = distance^pw
    colsum = apply(tmp, 2, sum)
    mat = rep(1, nrow(cc)) %*% t(colsum)
    # compute u_ij; each column of u now sums to 1
    u = tmp/mat
    # }
    # u
    # return the result of this iteration
    list(U = u, C = cc, J = J)
}

# initial parameters
n = 100
k = 4
dat = simple.data(n, k)
nr = nrow(dat$data)
m = 3
limit = 1e-4
max_itr = 50
# randomly initialize u_ij
u = random.uij(k, nr)
results = list()
data = dat$data
# iterate until the objective converges
for (i in 1:max_itr)
{
    results[[i]] = fuzzy.cmeans(dat$data, u, m)
    if (i != 1 && abs(results[[i]]$J - results[[i-1]]$J) < limit) {
        break
    }
    u = results[[i]]$U
}

# scatter plot of the simulated data
require(ggplot2)
data = as.data.frame(dat$data, stringsAsFactors = FALSE)
data = cbind(data, dat$class)
nc = ncol(data)
colnames(data) = paste('x', 1:nc, sep = '')
# par(mar = rep(2, 4))
p = ggplot(data, aes(x = x1, y = x2, color = factor(x3)))
p + geom_point() + xlab('x axis') + ylab('y axis') + ggtitle('scatter points')
# plot(dat$data, col = factor(dat$class))
# points(results[[i]]$C, pch = 19, col = 1:unique(dat$class))
# Sys.sleep(1)

# error rate of the clustering against the original classes
tclass = apply(results[[i]]$U, 2, function(x) { which(x == max(x)) })
# manually align cluster labels with the original class labels (specific to this seed)
tclass[tclass == 2] = 5
tclass[tclass == 3] = 6
tclass[tclass == 4] = 7
tclass[tclass == 5] = 4
tclass[tclass == 6] = 2
tclass[tclass == 7] = 3
freq = table(dat$class, tclass)
(sum(freq) - sum(diag(freq))) / sum(freq)
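The hard-coded label swaps above only hold for this particular seed. A more general, if still greedy, alternative (my own sketch; align.labels is a hypothetical helper, not from the original post) maps every cluster to its majority class in the confusion table:

# Hedged sketch: map each cluster id to its majority true class. Greedy, so in
# pathological cases two clusters may be mapped onto the same class.
align.labels = function(cluster, truth)
{
    conf = table(cluster, truth)
    map  = apply(conf, 1, which.max)                       # majority class index per cluster
    as.integer(colnames(conf)[map[as.character(cluster)]])
}
tclass2 = align.labels(apply(results[[i]]$U, 2, which.max), dat$class)
mean(tclass2 != dat$class)                                 # error rate without manual swaps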
# train an SVM model on the same data as a reference
svm_test = function()
{
    library(e1071)
    svm.fit = svm(dat$data, dat$class)
    r.fit = predict(svm.fit, dat$data)
    diff.class = round(as.numeric(r.fit)) - as.numeric(dat$class)
    i.misclass = which(abs(diff.class) > 0)
    n.misclass = length(i.misclass)
    f.misclass = n.misclass/length(dat$class)
    f.misclass
}

# cluster the same data with kmeans
kmeans_test = function()
{
    k.fit = kmeans(x = dat$data, 4)
    tclass = k.fit$cluster
    # manually align the kmeans labels with the original class labels (specific to this seed)
    tclass[tclass == 2] = 5
    tclass[tclass == 3] = 6
    tclass[tclass == 4] = 7
    tclass[tclass == 5] = 3
    tclass[tclass == 6] = 4
    tclass[tclass == 7] = 2
    freq = table(dat$class, tclass)
    (sum(freq) - sum(diag(freq))) / sum(freq)
}

# kmeans vs. fuzzy c-means
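Neither helper above is actually called in the post. A short driver (my own sketch, assuming dat, m, and the FCM() function from the top of the post are still in the workspace) ties the pieces together:

# Hedged usage sketch: run both comparison helpers and the standalone FCM() function
# on the same simulated data.
svm_test()                                      # SVM error rate on the training data
kmeans_test()                                   # kmeans error rate after manual label alignment
fit = FCM(dat$data, K = 4, mybeta = m, nstart = 3)
table(dat$class, fit$cluster_id)                # confusion table; cluster ids are an arbitrary permutation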