#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 29 22:37:01 2018
@author: vicky
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy import stats
b1=[]
with open('1.csv', 'r', errors='ignore', encoding='gbk') as f:
    for line in f:
        b1.append(line.strip('\n').strip(',').split(','))
b1=b1[:20000]
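#A minimal alternative sketch (assumptions: '1.csv' has no header row, is comma-
#separated throughout, and decodes cleanly as gbk): pandas can load the same
#first 20000 rows in one call.
df_raw = pd.read_csv('1.csv', header=None, encoding='gbk', nrows=20000)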
#All columns: cast numeric fields to float, keep non-numeric fields as str
data=[]
for i in range(len(b1)):
    data.append([float(x) if x.lstrip('-').replace('.','',1).isdigit() else x for x in b1[i]])
#data=np.array(data)
df=pd.DataFrame(data)
#Load the column names and attach them to the DataFrame
colname=pd.read_table('column_name.txt',header=None)
df.columns=colname[0].tolist()
#Convert the categorical attribute columns to dummy variables:
#9 discrete factors expand into 39 dummy columns
df_factor=pd.get_dummies(df[df.columns[62:]])
df_fac_column=df_factor.columns.tolist()
#To avoid perfect collinearity, drop one dummy column from each discrete factor
d=df_factor.columns
df_factor2=df_factor.drop(df_factor.columns[[12,15,18,22,23,28,35,38]],axis=1)
des_fac2=df_factor2.describe() #overview of the dummy-coded sample
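#Sketch of an equivalent shortcut (assumption: dropping the *first* level of each
#factor is as acceptable as dropping a hand-picked one): get_dummies can do the
#collinearity drop itself.
df_factor2_alt=pd.get_dummies(df[df.columns[62:]], drop_first=True)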
#--------------De-correlation-----------------
#Behavioural columns only
df1=pd.DataFrame(df[df.columns[2:62]],dtype=float)
#df1.astype(float)
#Pearson correlation coefficients
cor=df1.corr()
des1=df1.describe()
df2=copy.copy(df1) #data after dropping highly correlated variables
corind=[] #pairs of highly correlated variables found
del_col=[] #names of the dropped variables
for i in range(len(cor)):
    for j in range(i+1,len(cor)):
        if cor.iloc[i,j]>0.7 and df1.columns[j] in df2.columns:
            corind.append([df1.columns[i],df1.columns[j]])
            #df2=df2.drop(columns=('wake_days',))
            df2=df2.drop(df1.columns[j],axis=1)
            del_col.append(df1.columns[j])
#9 variables dropped in total
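#Sketch of a vectorised equivalent (assumption: the same one-sided 0.7 threshold,
#i.e. strong negative correlations are deliberately kept)
upper=cor.where(np.triu(np.ones(cor.shape,dtype=bool),k=1))
del_col_alt=[c for c in upper.columns if (upper[c]>0.7).any()]
#df1.drop(columns=del_col_alt) should reproduce df2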
#--------------Outlier handling-----------------
#Behavioural columns only
#Trim values below the 0.1% quantile and above the 99.9% quantile
df3=copy.copy(df2)
q1=df2.quantile(0.001)
q99=df2.quantile(0.999)
for k in df2.columns:
    df3=df3[(df3[k]>q1[k]) & (df3[k]<q99[k])]
#Note: the strict lower bound (> rather than >=) drops most of the zero values
#df2=df2.dropna(axis=0,how='any') #drop rows containing NaN
des3=df3.describe()
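#Sketch of the same trim done in one step (assumption: identical strict bounds
#on every behavioural column)
mask=((df2>q1) & (df2<q99)).all(axis=1)
df3_alt=df2[mask]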
#---------------Standardisation----------------
scaler = StandardScaler().fit(df3)
#scaler.mean_ #sample means
#scaler.var_  #sample variances
df4=pd.DataFrame(scaler.transform(df3),columns=df3.columns,index=df3.index)
des4=df4.describe()
#----------------K-means clustering---------------
#-------Choosing K------
#Scan K up to kmax
kmax=15
#Elbow method: plot SSE against K and look for the kink
SSE = [] #sum of squared errors for each K
for k in range(1,kmax):
    estimator = KMeans(n_clusters=k,init='k-means++') #build the clusterer
    estimator.fit(df_factor)
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(range(1,kmax),SSE,'o-')
plt.grid()
plt.xticks(np.arange(1, kmax, 1))
plt.show()
#Not informative here: no clear elbow is visible
#Silhouette method: the K with the largest silhouette coefficient is best
Scores = [] #silhouette scores
for k in range(2,kmax):
    estimator = KMeans(n_clusters=k, init='k-means++') #build the clusterer
    estimator.fit(df_factor)
    Scores.append(silhouette_score(df_factor,estimator.labels_,metric='euclidean'))
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.plot(range(2,kmax),Scores,'o-')
plt.grid()
plt.xticks(np.arange(1, kmax, 1))
plt.show()
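#Sketch: read the best K off the silhouette curve programmatically
#(the scan starts at k=2, hence the offset)
best_k=int(np.argmax(Scores))+2
print('best k by silhouette:',best_k)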
#-------------------Clustering with the chosen K-------------------
#Cluster on the combined behavioural + attribute data:
#the de-correlated behavioural columns joined with the factor dummies
df5=pd.concat([df2,df_factor],axis=1,join='inner')
num=6 #the K selected above
clf = KMeans(n_clusters=num,init='k-means++')
model = clf.fit(df5)
#Cluster centres
centers=clf.cluster_centers_
#K-means SSE
sse=clf.inertia_
#Cluster assignment of each sample
label=clf.labels_
#-----------------Per-cluster profiles----------------
df5.insert(0,'label',label)
#Per-cluster statistics from the assignment
df5_mean = df5.groupby(label).mean() #mean of each variable within each cluster
df5_prop=df5.groupby(label).sum()/len(df5) #share of each attribute dummy within each cluster
#ddd=df5.groupby(label)
clu_size=df5.groupby(label).size() #cluster sizes
clu_prop=df5.groupby(label).size()/len(df5) #cluster proportions
df5_per=np.zeros((num,len(df5.columns))) #percentile of each cluster mean within the whole sample
for j in range(len(df5.columns)):
    for i in range(num):
        df5_per[i][j]=stats.percentileofscore(df5[df5.columns[j]], df5_mean.iloc[i,j]) #percentile of the cluster mean in the full column
df5_per=pd.DataFrame(df5_per)
df5_per.columns=df5_mean.columns
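#Sketch of the same percentile table built column by column in one expression
df5_per_alt=pd.DataFrame(
    {c: [stats.percentileofscore(df5[c], m) for m in df5_mean[c]] for c in df5_mean.columns}
)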
#---------------Cluster visualisation-------------
#Project the clustered data to 2D with t-SNE and colour by cluster
from sklearn.manifold import TSNE
tsne = TSNE(learning_rate=100)
#Embed the clustered data (excluding the label column added above)
embedding = tsne.fit_transform(df5.drop(columns='label'))
df_tsne=pd.DataFrame(embedding, index=df5.index)
#Plot each cluster with its own colour and marker
colors = ['r','g', 'b', 'm', 'k'] #red, green, blue, magenta, black
markers = ['o', 'x', '+', '^', 'v'] #circle, x, +, triangle up, triangle down
styles = [x+y for x in colors for y in markers] #5*5=25 distinct combinations
for i in range(num):
    d = df_tsne[model.labels_==i]
    plt.plot(d[0],d[1],styles[i],label=str(i))
plt.legend()
plt.show()
#The cluster plot is hard to read
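#Sketch: the same embedding as a single scatter call, coloured by cluster label
plt.scatter(df_tsne[0], df_tsne[1], c=model.labels_, s=5, cmap='tab10')
plt.colorbar(label='cluster')
plt.show()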
#---------------------------------------Other clustering methods---------------------
#-------Affinity Propagation: picks the number of clusters automatically----
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(df4)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
#Automatically produced 282 clusters
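#Sketch (assumptions: the default euclidean affinity, and -5000 as an illustrative
#value only): a more negative `preference` penalises creating exemplars, so
#re-fitting with a lower value should yield far fewer clusters
af2 = AffinityPropagation(preference=-5000).fit(df4)
print('clusters with preference=-5000:', len(af2.cluster_centers_indices_))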
#----------------MeanShift-----------
from sklearn.cluster import MeanShift, estimate_bandwidth
# Compute clustering with MeanShift
ms = MeanShift(bin_seeding=True)
ms.fit(df4)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
#Automatically produced 46 clusters
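#Sketch: estimate_bandwidth sets the kernel width explicitly; a larger quantile
#gives a wider kernel and fewer clusters (quantile=0.2 is an illustrative guess)
bw = estimate_bandwidth(df4, quantile=0.2, n_samples=500)
ms2 = MeanShift(bandwidth=bw, bin_seeding=True).fit(df4)
print('clusters with estimated bandwidth:', len(np.unique(ms2.labels_)))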
#--------------Spectral clustering-----------
from sklearn.cluster import SpectralClustering
result=[]
for k in range(2,kmax):
    y_pred = SpectralClustering(n_clusters=k).fit_predict(df4)
    score = metrics.calinski_harabasz_score(df4, y_pred)
    result.append([k,score])
    print("Calinski-Harabasz Score with n_clusters=", k, "score:", score)
#Extremely slow to run
y_pred = SpectralClustering(n_clusters=8).fit_predict(df4)
print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(df4, y_pred))
#k=9 scored best
#---------------Hierarchical clustering-------------
from scipy.cluster import hierarchy
import scipy.spatial.distance as dist
#Pairwise distance matrix between samples
distMatrix = dist.pdist(df4)
#Run the agglomerative clustering
Z = hierarchy.linkage(distMatrix, method = 'complete')
#Show the result as a dendrogram
hierarchy.dendrogram(Z)
#Cut the linkage matrix Z into flat clusters
cluster= hierarchy.fcluster(Z, 1, 'inconsistent')
num_clusters = cluster.max()
#473 clusters
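#Sketch: the 'maxclust' criterion cuts the same tree into a fixed number of
#clusters instead of relying on the inconsistency threshold
cluster_k = hierarchy.fcluster(Z, t=num, criterion='maxclust')
print('cluster sizes:', np.bincount(cluster_k)[1:])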
#-------------DBSCAN---------------
from sklearn.cluster import DBSCAN
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(df4)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
#With eps=0.3 everything is labelled as noise: 0 clusters found
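#Sketch: a k-distance curve is a common way to pick eps; the "knee" of the
#sorted 10th-neighbour distances suggests a workable value (min_samples=10 above)
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=10).fit(df4)
dist10, _ = nn.kneighbors(df4)
plt.plot(np.sort(dist10[:, -1]))
plt.xlabel('points sorted by distance')
plt.ylabel('distance to 10th nearest neighbour')
plt.show()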