python EM算法4(身高体重数据集)

# 1. Data preprocessing
import numpy as np


def loadData(filename):
    """Load a tab-separated numeric dataset.

    Args:
        filename: path to a text file with one sample per line, fields
            separated by tabs.

    Returns:
        list[list[float]]: one parsed row per line.
    """
    dataSet = []
    # Fix: the original opened the file and called fr.close() manually,
    # leaking the handle if float() raised; the context manager always closes.
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip('\n').split('\t')
            fltLine = list(map(float, curLine))
            dataSet.append(fltLine)
    return dataSet


def Gaussian(data, mean, cov):
    """Multivariate Gaussian probability density N(data | mean, cov).

    Args:
        data: 1-D sample vector of length dim.
        mean: 1-D mean vector of length dim.
        cov:  (dim, dim) covariance matrix.

    Returns:
        float: the density value at `data`.
    """
    dim = np.shape(cov)[0]        # dimensionality
    covdet = np.linalg.det(cov)   # |cov|
    if covdet == 0:
        # Singular covariance: add a small ridge so the determinant and
        # inverse exist.
        covdet = np.linalg.det(cov + np.eye(dim) * 0.01)
        covinv = np.linalg.inv(cov + np.eye(dim) * 0.01)
    else:
        covinv = np.linalg.inv(cov)  # inverse of cov
    m = data - mean
    z = -0.5 * np.dot(np.dot(m, covinv), m)  # exponent of the density
    return 1.0 / (np.power(np.power(2 * np.pi, dim) * abs(covdet), 0.5)) * np.exp(z)

# 2. Initial cluster centres
def GetInitialMeans(data, K, criterion):
    """Draw K random initial means and resample until they are separated.

    Each mean's j-th coordinate is uniform in [min, max] of data column j.
    Resampling repeats until every pair of means is at least `criterion`
    apart (as judged by isdistance).
    """
    dim = data.shape[1]
    # Per-dimension [min, max] bounds of the dataset.
    minmax = np.array([np.array([min(data[:, i]), max(data[:, i])])
                       for i in range(dim)])
    means = [[] for _ in range(K)]
    while True:
        for i in range(K):
            means[i] = np.array([
                np.random.random() * (minmax[j][1] - minmax[j][0]) + minmax[j][0]
                for j in range(dim)
            ])
        if isdistance(means, criterion):
            return means


def isdistance(means, criterion=0.03):
    """Return True when every pair of means is at least `criterion` apart."""
    K = len(means)
    for i in range(K):
        for j in range(i + 1, K):
            if criterion > np.linalg.norm(means[i] - means[j]):
                return False
    return True

# 3. GMM main routine
def GMM(data, K, ITER):
    """Fit a K-component Gaussian mixture to `data` with EM.

    Args:
        data: (N, dim) array of samples.
        K:    number of mixture components.
        ITER: maximum number of EM iterations (safety cap).

    Returns:
        (phi, means, convs): component weights, means and covariances.
    """
    N = data.shape[0]
    dim = data.shape[1]
    means = Kmeans(data, K)   # initialise component means with K-means
    # Each component starts from the covariance of the whole dataset.
    convs = [0] * K
    for i in range(K):
        convs[i] = np.cov(data.T)
    phi = [1.0 / K] * K                        # mixing weights
    omega = [np.zeros(K) for i in range(N)]    # responsibilities
    loglikelyhood = 0
    oldloglikelyhood = 1
    # Fix: the original decremented ITER but never tested it (the
    # `while ITER` guard was commented out), so a non-converging run could
    # loop forever.  Stop on convergence OR when the budget is exhausted.
    while np.abs(loglikelyhood - oldloglikelyhood) > 0.00001 and ITER > 0:
        oldloglikelyhood = loglikelyhood
        # E step: omega[i][k] = P(component k | sample i).
        for i in range(N):
            res = [phi[k] * Gaussian(data[i], means[k], convs[k]) for k in range(K)]
            sumres = np.sum(res)
            for k in range(K):
                omega[i][k] = res[k] / sumres
        # M step: re-estimate weights, means and covariances.
        for k in range(K):
            # Nk: effective number of samples assigned to component k.
            Nk = np.sum([omega[n][k] for n in range(N)])
            phi[k] = 1.0 * Nk / N
            means[k] = (1.0 / Nk) * np.sum([omega[n][k] * data[n] for n in range(N)], axis=0)
            xdiffs = data - means[k]
            convs[k] = (1.0 / Nk) * np.sum(
                [omega[n][k] * xdiffs[n].reshape(dim, 1) * xdiffs[n] for n in range(N)], axis=0)
        # Log-likelihood of the data under the current parameters.
        loglikelyhood = np.sum(
            [np.log(np.sum([phi[k] * Gaussian(data[n], means[k], convs[k]) for k in range(K)]))
             for n in range(N)])
        ITER -= 1
    return phi, means, convs

# K-means, used inside GMM to pick reasonable initial component means.
import copy


def Kmeans(data, K):
    """Plain K-means clustering; returns the K cluster means.

    Args:
        data: (N, dim) array of samples.
        K:    number of clusters.
    """
    N = data.shape[0]    # number of samples
    dim = data.shape[1]  # sample dimensionality
    means = GetInitialMeans(data, K, 0.03)
    means_old = [np.zeros(dim) for k in range(K)]
    # Iterate until the means stop moving (total displacement below 1e-4).
    while np.sum([np.linalg.norm(means_old[k] - means[k]) for k in range(K)]) > 0.0001:
        means_old = copy.deepcopy(means)
        # NOTE(review): counts start at 1 (not 0), which avoids a
        # divide-by-zero for empty clusters but slightly shrinks each mean
        # toward the origin; kept as-is to preserve behaviour.
        numlog = [1] * K
        sumlog = [np.zeros(dim) for k in range(K)]
        # Assignment step: attach each sample to its nearest mean.
        for i in range(N):
            dislog = [np.linalg.norm(data[i] - means[k]) for k in range(K)]
            tok = dislog.index(np.min(dislog))
            numlog[tok] += 1       # one more sample in this cluster
            sumlog[tok] += data[i]  # accumulate its coordinates
        # Update step: recompute each mean.
        for k in range(K):
            means[k] = 1.0 / numlog[k] * sumlog[k]
    return means


def computeOmega(X, mu, sigma, phi, multiGaussian):
    """Responsibility matrix: gamma[i, j] = P(component j | sample i).

    Args:
        X:     (n_samples, dim) data array.
        mu:    list of component means.
        sigma: list of component covariances.
        phi:   list of component weights.
        multiGaussian: density function f(x, mean, cov) -> float.
    """
    n_samples = X.shape[0]
    n_clusters = len(phi)
    gamma = np.zeros((n_samples, n_clusters))
    for i in range(n_samples):
        g = np.array([phi[j] * multiGaussian(X[i], mu[j], sigma[j])
                      for j in range(n_clusters)])
        # Fix: normalise by the row sum computed once, instead of calling
        # np.sum(g) again for every cluster (was O(K^2) per sample).
        gamma[i] = g / np.sum(g)
    return gamma


def predict(data, m, c, p):
    """Hard cluster assignment: index of the most responsible component."""
    pred = computeOmega(np.array(data), m, c, p, Gaussian)
    cluster_results = np.argmax(pred, axis=1)
    return cluster_results

# 4. Run on the height/weight dataset
with open('d:/dataset1.txt', 'r') as f:
    d = [line.strip('\n').split('\t') for line in f.readlines()]
d1 = np.array(d)
# Every column except the last is a numeric feature; the last is the label.
d2 = [list(map(float, row)) for row in d1[:, :-1]]
data = np.array(d2)
t = d1[:, -1]                    # ground-truth labels
p2, m2, c2 = GMM(data, 2, 50)    # fit a 2-component mixture
pt2 = predict(data, m2, c2, p2)  # hard cluster assignments
pt2

结果如下: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

p2,m2,c2的值分别为: p2 [0.7822324209405439, 0.21776757905945612] m2 [array([170.5300851 , 59.69283016]), array([175.14653505, 73.79246884])] c2 [array([[63.72099898, 54.66242696], [54.66242696, 73.46041058]]), array([[ 21.00207568, 14.73278643], [ 14.73278643, 140.29202475]])]

# Error count and error rate against the ground-truth labels.
t = d1[:, -1]
# Map string labels onto the cluster ids ('f' -> 1, 'm' -> 0).
# NOTE(review): t is a string array here, so assigning 1/0 stores '1'/'0';
# the map(int, ...) below converts them back to integers.
t[t == 'f'] = 1
t[t == 'm'] = 0
t = list(map(int, t))
t = np.array(t)
# Fix: the original looped `for i in t==pt2: if i==False: c+=1` — comparing
# to False with == is an anti-pattern; a vectorized mismatch count is both
# clearer and runs at C speed.
c = int(np.sum(t != pt2))
print('错误数为:', c)
print('错误率为:', round(c / len(t), 3))

结果为:

错误数为: 176

错误率为: 0.389

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值