目录
***训练部分***
1.工具
# Tooling: pandas/numpy for data handling, scikit-learn for clustering.
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.cluster import MiniBatchKMeans  # fixed: was "from sklearn,cluster" (comma typo)
from sklearn import metrics
2.读取数据
# Load the Otto training data.
dpath = './data/'  # fixed: closing quote was missing
train = pd.read_csv(dpath + 'Otto_train.csv')
print(train.head())
3.准备数据
# Split features and target; drop the non-feature columns.
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)
# Kept so each sample's cluster assignment can be saved alongside its id.
train_id = train['id']
# Normalize each sample in place so every row has unit L2 norm.
# fixed: norm was "12" (digit one, digit two); scikit-learn expects "l2".
normalize(X_train, norm="l2", copy=False)
4.KMeans聚类
#一个参数点(聚类数目为K)的模型
def K_cluster_analysis(K, X):
    """Fit MiniBatchKMeans with K clusters on X and return the CH score.

    Parameters:
        K: number of clusters to fit.
        X: feature matrix (n_samples x n_features).

    Returns:
        The Calinski-Harabasz score of the resulting clustering
        (higher means better-separated clusters).
    """
    print("K-means begin with cluster:{}".format(K))
    # Fit K-means on the training data.
    mb_kmeans = MiniBatchKMeans(n_clusters=K)
    y_pred = mb_kmeans.fit_predict(X)
    # Evaluating the choice of K:
    # the training data here has labels, so a supervised clustering metric
    # could be used instead, e.g.:
    # v_score = metrics.v_measure_score(y_val, y_val_pred)
    # Unsupervised alternatives: the Silhouette Coefficient and the
    # Calinski-Harabasz Index.
    # fixed: "metics" typo and the old "calinski_harabaz_score" spelling;
    # scikit-learn's current API is calinski_harabasz_score.
    CH_score = metrics.calinski_harabasz_score(X, y_pred)
    # The Silhouette Coefficient is too slow to compute on large samples:
    # si_score = metrics.silhouette_score(X, y_pred)
    print('CH_score:{}'.format(CH_score))
    return CH_score
# Hyperparameter (number of clusters) search range.
Ks = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
CH_scores = []
for K in Ks:
    ch = K_cluster_analysis(K, X_train)
    CH_scores.append(ch)
# Plot the CH score for each K to find the best model/parameter
# (the highest score wins).
import matplotlib.pyplot as plt
plt.plot(Ks, np.array(CH_scores), 'b-', label='CH_score')
plt.show()
# Best hyperparameter.
# fixed: original passed len(CH_score) (undefined name) into
# np.unravel_index; a plain argmax over the 1-D score list is all we need.
Best_K = Ks[int(np.argmax(CH_scores))]
print(Best_K)
输出结果:
从图中看出k=8时,CH分数最高。
5.用最佳的K再次聚类,得到聚类结果
# Re-cluster with the best K found above and keep the resulting labels.
mb_kmeans = MiniBatchKMeans(n_clusters=Best_K)
y_pred = mb_kmeans.fit_predict(X_train)
print(y_pred)
# Save the cluster assignment of every sample next to its id and target.
feat_names_Kmeans = 'Kmeans_' + str(Best_K)
cluster_col = pd.Series(name=feat_names_Kmeans, data=y_pred)
y = pd.Series(data=y_train, name='target')
train_kmeans = pd.concat([train_id, cluster_col, y], axis=1)
train_kmeans.to_csv(dpath + 'Otto_FE_train_KMeans.csv', index=False, header=True)
6.保存KMeans模型,用于后续对测试数据的聚类
import pickle

# Persist the fitted KMeans model so the test data can be clustered later.
# Use a context manager so the file handle is closed even on error.
with open('mb_kmeans.pkl', 'wb') as f:
    pickle.dump(mb_kmeans, f)
***测试部分***
读取数据
# Load the Otto test data from the same data directory.
dpath = './data/'
test = pd.read_csv(dpath + 'Otto_test.csv')
print(test.head())
准备数据
#暂存id,用于保存特征变换后的结果并用于结果提交
# Stash the ids for saving the transformed features and for submission.
test_id = test['id']
X_test = test.drop(['id'], axis=1)
# Normalize each sample in place so every row has unit L2 norm,
# matching the preprocessing applied to the training data.
normalize(X_test, norm='l2', copy=False)
KMeans聚类
import pickle

# Reload the KMeans model fitted on the training data.
# Use a context manager so the file handle is always closed.
with open('mb_kmeans.pkl', 'rb') as f:
    mb_kmeans = pickle.load(f)
# Assign each test sample to its nearest training cluster.
# (The original comment said "dimensionality reduction on train and test";
# this is cluster-label prediction on the test set.)
y_test_pred = mb_kmeans.predict(X_test)
保存结果,KMeans聚类后的表示可作为特征提取的一部分
# Build the feature name the same way as for the training data.
# fixed: original stringified mb_kmeans.cluster_centers_ (the raw centroid
# array) into the column name and had an unbalanced paren; the training
# file used the cluster count, so use n_clusters (== Best_K) here.
feat_names_Kmeans = 'Kmeans_' + str(mb_kmeans.n_clusters)
# fixed: variable-name mismatch (feat_name_Kmeans vs feat_names_Kmeans).
test_kmeans = pd.concat([test_id,
                         pd.Series(name=feat_names_Kmeans, data=y_test_pred)],
                        axis=1)
# fixed: write without the DataFrame index, matching the training output.
test_kmeans.to_csv(dpath + 'Otto_FE_test_KMeans.csv', index=False)