# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:42:16 2018
@author: wenyun.wxw
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import copy
from sklearn.preprocessing import StandardScaler
b1 = []
with open('1.csv', 'r') as f:
    for line in f:
        b1.append(line.strip('\n').strip(',').split(','))
#bb1=df[:100]
# Parse all columns: CSV fields are read in as str, so cast to float where possible
def _to_num(x):
    try:
        return float(x)
    except ValueError:
        return x
data = [[_to_num(x) for x in row] for row in b1]
#data=np.array(data)
df = pd.DataFrame(data)
# load the column names
colname = pd.read_table('column_name.txt', header=None)
df.columns = colname[0]
des = df.describe()  # quick look at the sample data
#-------------- decorrelation -----------------
# subset of the data: the behavioral columns
df1 = pd.DataFrame(df[df.columns[2:62]], dtype=float)
# Pearson correlation matrix
cor = df1.corr()
des1 = df1.describe()
df2 = copy.copy(df1)  # data after dropping the highly correlated variables
corind = []
del_col = []  # names of the dropped variables
for i in range(len(cor)):
    for j in range(i + 1, len(cor)):
        # cor's rows/columns correspond to df1's columns, not df's
        if cor.iloc[i, j] > 0.7 and df1.columns[j] in df2.columns:
            corind.append([df1.columns[i], df1.columns[j]])
            df2 = df2.drop(columns=df1.columns[j])
            del_col.append(df1.columns[j])
# 9 variables dropped in total
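# Sanity check (a sketch, not part of the original run): after the drops,
# no remaining pair should exceed the 0.7 threshold.
cor2 = df2.corr()
upper = cor2.where(np.triu(np.ones(cor2.shape, dtype=bool), k=1))
print('max remaining pairwise correlation:', upper.max().max())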
#-------------- outlier removal -----------------
# drop rows below the 0.1% quantile or above the 99.9% quantile
df3 = copy.copy(df2)
q1 = df2.quantile(0.001)
q99 = df2.quantile(0.999)
for k in df2.columns:
    # filter df3 cumulatively; filtering df2 each pass would keep only the last column's mask
    df3 = df3[(df3[k] >= q1[k]) & (df3[k] < q99[k])]
# the lower bound needs the equality, otherwise most zero values get dropped
#df2=df2.dropna(axis=0,how='any')  # drop rows containing NaN
des3 = df3.describe()
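# Alternative sketch (an assumption, not what the run above does): winsorize
# instead of dropping rows, which keeps the sample size intact.
df3_clipped = df2.clip(lower=q1, upper=q99, axis=1)
print('rows dropped by quantile filtering:', len(df2) - len(df3))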
#--------------- standardization ----------------
scaler = StandardScaler().fit(df3)
#scaler.mean_  # per-feature means
#scaler.var_   # per-feature variances
# transform() reuses the fit above; keep df3's index and columns for the later joins
df4 = pd.DataFrame(scaler.transform(df3), index=df3.index, columns=df3.columns)
des4 = df4.describe()
#---------------- k-means clustering ---------------
#------- choosing K ------
# cap K at 20
kmax = 20
# Elbow method: plot SSE against K; the best K sits at the bend
SSE = []  # sum of squared errors for each K
for k in range(1, kmax + 1):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(df4)
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(range(1, kmax + 1), SSE, 'o-')
plt.grid()
plt.xticks(np.arange(1, kmax + 1, 1))
plt.show()
# not convincing: no visible elbow
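# Sketch (an addition, not in the original run): when the elbow is hard to
# eyeball, the largest second difference of the SSE curve is a crude numeric
# stand-in for the bend.
d2 = np.diff(SSE, 2)  # discrete curvature of the SSE curve
print('elbow candidate k =', int(np.argmax(d2)) + 2)  # +2: two diffs, and k starts at 1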
# Silhouette method: the K with the largest silhouette score is best
from sklearn.metrics import silhouette_score
Scores = []  # silhouette score for each K
for k in range(2, kmax + 1):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(df4)
    Scores.append(silhouette_score(df4, estimator.labels_, metric='euclidean'))
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.plot(range(2, kmax + 1), Scores, 'o-')
plt.grid()
plt.xticks(np.arange(1, kmax + 1, 1))
plt.show()
# k=12 scores highest
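# Sketch: read the winner off the scores instead of the plot.
best_k = int(np.argmax(Scores)) + 2  # Scores starts at k=2
print('best k by silhouette:', best_k)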
#------------------- clustering with the chosen K -------------
num = 12
clf = KMeans(n_clusters=num, init='k-means++')
model = clf.fit(df4)
# cluster centers
centers = clf.cluster_centers_
# k-means SSE
sse_final = clf.inertia_
# assignment: the cluster each sample belongs to
label = clf.labels_
#----------------- per-cluster profiles ----------------
# Rebuild the retained rows from the full frame (this keeps the trailing
# columns 62:71 that were excluded from clustering) and attach the labels
df5 = df.loc[df3.index].copy()
df5.insert(0, 'label', label)
# per-cluster means of the numeric columns
df5_mean = df5.groupby('label').mean(numeric_only=True)
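# Sketch of the section's stated goal: each cluster's share of the sample.
share = df5['label'].value_counts(normalize=True).sort_index()
print(share)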
#--------------- cluster plot -------------
# visualize the clustering with t-SNE (a tool for viewing high-dimensional data)
from sklearn.manifold import TSNE
tsne = TSNE(learning_rate=100)
# project the data down to two dimensions
embedding = tsne.fit_transform(df4)
df_tsne = pd.DataFrame(embedding, index=df4.index)
# a different color and marker for each cluster
colors = ['r', 'g', 'b', 'm', 'k']  # red, green, blue, magenta, black
markers = ['o', 'x', '+', '^', 'v']  # circle, x, +, triangle up, triangle down
styles = [c + m for c in colors for m in markers]  # 5*5 = 25 combinations
for i in range(num):
    d = df_tsne[model.labels_ == i]
    plt.plot(d[0], d[1], styles[i], label=str(i))
plt.legend()
plt.show()
# the cluster plot is unreadable
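# Sketch (an alternative, not the original figure): a single scatter colored
# by label tends to read better than twelve overlapping marker styles.
plt.scatter(df_tsne[0], df_tsne[1], c=model.labels_, cmap='tab20', s=5)
plt.colorbar(label='cluster')
plt.show()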
#--------------------------------------- other clustering methods ---------------------
#------- Affinity Propagation: picks K automatically ----
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(df4)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
# it chose 282 clusters
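# Sketch (an assumption about tuning, not the original run): AP defaults
# `preference` to the median pairwise similarity (negative squared euclidean
# distance); pushing it toward the minimum yields fewer clusters. Note the
# full similarity matrix is O(n^2) in memory.
from sklearn.metrics.pairwise import euclidean_distances
sim = -euclidean_distances(df4, squared=True)
af2 = AffinityPropagation(preference=sim.min()).fit(df4)
print('clusters at minimum preference:', len(af2.cluster_centers_indices_))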
#---------------- MeanShift -----------
from sklearn.cluster import MeanShift, estimate_bandwidth
# Compute clustering with MeanShift
ms = MeanShift(bin_seeding=True)
ms.fit(df4)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
# it chose 46 clusters
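# Sketch: estimate_bandwidth (imported above but never used) sets an explicit
# kernel width; a larger quantile means a wider kernel and fewer clusters.
bandwidth = estimate_bandwidth(df4, quantile=0.2, n_samples=500)
ms2 = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(df4)
print('clusters with estimated bandwidth:', len(np.unique(ms2.labels_)))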
#-------------- spectral clustering -----------
from sklearn.cluster import SpectralClustering
result = []
for k in range(2, kmax + 1):
    y_pred = SpectralClustering(n_clusters=k).fit_predict(df4)
    score = metrics.calinski_harabasz_score(df4, y_pred)
    result.append([k, score])
    print("Calinski-Harabasz Score with n_clusters=", k, "score:", score)
# extremely slow to run
y_pred = SpectralClustering(n_clusters=8).fit_predict(df4)
print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(df4, y_pred))
# k=9 scored best
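# Sketch: spectral clustering builds an n x n affinity matrix, which explains
# the slowness; fitting a subsample is a common workaround (9 clusters here,
# per the conclusion above).
sub = df4.sample(n=min(5000, len(df4)), random_state=0)
y_sub = SpectralClustering(n_clusters=9).fit_predict(sub)
print("subsample Calinski-Harabasz Score:", metrics.calinski_harabasz_score(sub, y_sub))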
#--------------- hierarchical clustering -------------
from scipy.cluster import hierarchy
import scipy.spatial.distance as dist
# pairwise distance matrix
distMatrix = dist.pdist(df4)
# run the hierarchical clustering
Z = hierarchy.linkage(distMatrix, method='complete')
# draw the hierarchy as a dendrogram
hierarchy.dendrogram(Z)
# flatten the linkage matrix Z into cluster assignments
cluster = hierarchy.fcluster(Z, 1, 'inconsistent')
num_clusters = cluster.max()
# 473 clusters
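# Sketch: the 'inconsistent' criterion left 473 clusters; 'maxclust' cuts the
# same tree into a fixed count instead (12 here, mirroring the k-means run).
cluster12 = hierarchy.fcluster(Z, t=12, criterion='maxclust')
print('clusters:', cluster12.max())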
#------------- DBSCAN ---------------
from sklearn.cluster import DBSCAN
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(df4)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# 0 clusters: eps=0.3 is too small to separate anything here
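# Sketch: a k-distance plot is the usual way to pick eps; sort each point's
# distance to its 10th nearest neighbor (matching min_samples) and look for
# the knee.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=10).fit(df4)
dists, _ = nn.kneighbors(df4)
plt.plot(np.sort(dists[:, -1]))
plt.ylabel('distance to 10th nearest neighbor')
plt.show()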