#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 15 23:00:00 2018
@author: vicky
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy import stats
from sklearn.decomposition import PCA
from pandas.core.frame import DataFrame
# Load the raw incident data and the externally curated list of 39 column
# names, then keep only those columns.
df = pd.read_excel('/Users/vicky/Desktop/建模/附件1.xlsx')
# Column whitelist (39 columns) prepared for question 1.
colname = pd.read_excel('/Users/vicky/Desktop/建模/第一问/colname.xlsx')
colname = np.array(colname)
# -------- drop every column that is not in the whitelist
df1 = copy.copy(df)
for col in df.columns:
    # `col in colname` relies on numpy's element-wise comparison of the
    # 2-D name array against a scalar string.
    if col not in colname:
        df1 = df1.drop(col, axis=1)
#-------缺失值处理:离散值众数填充,连续值取0
col1=['propvalue','ransomamt','ransomamtus','ransompaid','ransompaidus'] #<10w 10w-100w 100w-1000w >1000w
col2=['nperps','nkill','nwound','nhostkid']#<100 100-1000 >1000
col3=['nhostkidus','nperpcap'] #<10 10-100 >100
col4=['nhours']#<6 6-12 >12
col5=['ndays','nreleased']#<100 100-500 >500
col7=col1+col2+col3+col4+col5#连续列
col6=[]#离散列
for x in colname:
if x not in col7:
col6.append(x[0])
df2=copy.copy(df1)
#负数取0,负数=未知
df2[df2 < 0] = 0
#连续值取0
for c in col7:
df2[c]=df2[c].fillna(0)
#离散值众数填充
for c in col6:
df2[c]=df2[c].fillna(int(df2[c].mode()))
df2.isnull().sum()
# -------- Discretize continuous columns into tertiles (0/1/2) by the
# percentile method. NOTE: zeros are excluded when computing the percentiles
# (0 means "unknown/none" after the fill step above).
df3 = copy.copy(df2)
for c in col7:
    # Tertile cut points over strictly positive values only; quantile()
    # ignores the NaNs produced by the df3 > 0 mask.
    q1 = df3[df3 > 0].quantile(1 / 3)[c]
    q2 = df3[df3 > 0].quantile(2 / 3)[c]
    # Vectorized replacement of the original per-row `.ix` loop — `.ix`
    # was removed in pandas 1.0. Semantics preserved exactly:
    #   value < q1 -> 0, q1 <= value < q2 -> 1, otherwise -> 2
    # (NaN cut points make both conditions False, i.e. everything -> 2,
    # matching the old if/elif/else behavior).
    orig_dtype = df3[c].dtype
    df3[c] = np.select([df3[c] < q1, df3[c] < q2], [0, 1], default=2).astype(orig_dtype)
    df3[c].value_counts()  # per-bin counts (inspection only)
#df3= pd.read_csv('/Users/vicky/Desktop/建模/第一问/df3.csv')
df3.to_csv('/Users/vicky/Desktop/建模/第一问/df3.csv', sep=',', header=True, index=False)
# ------------ PCA dimensionality reduction
# Retain enough principal components to explain 99% of the variance.
pca = PCA(n_components=0.99)
pca.fit(df3)
print(pca.explained_variance_ratio_)  # variance contribution per component
print(pca.n_components_)  # number of components retained (= 7)
df4 = pca.transform(df3)
# Scree plot: explained variance by component index.
plt.figure()
plt.plot(pca.explained_variance_, 'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance_', fontsize=16)
plt.show()
# --------------- Standardization ----------------
# Single fit_transform — the original fitted the scaler twice
# (`.fit(df4)` followed by `.fit_transform(df4)`), which is redundant.
df5 = pd.DataFrame(StandardScaler().fit_transform(df4))
# ---------------- K-means clustering ---------------
num = 5  # parameter k (number of clusters)
clf = KMeans(n_clusters=num, init='k-means++', random_state=1234)
model = clf.fit(df5)
# cluster centers
centers = clf.cluster_centers_
print(centers)
# K-means SSE (within-cluster sum of squares)
print(clf.inertia_)
# assignment result: the cluster each sample belongs to
label = list(clf.labels_)
set1 = {k: label.count(k) for k in set(label)}  # cluster -> sample count
print(set1)
# ----------------- Per-cluster profiles ----------------
df6 = copy.copy(df1)
df6.insert(0, 'label', label)
df6.to_csv('/Users/vicky/Desktop/建模/df6.csv', sep=',', header=True, index=False)
# Explicit .copy() so that insert() works on an independent frame rather
# than a slice of df2 (avoids SettingWithCopyWarning).
con = df2[col7].copy()  # continuous columns
con.insert(0, 'label', label)
dis = df2[col6].copy()  # discrete columns
dis.insert(0, 'label', label)
# Per cluster: mean of continuous variables, mode/counts of discrete ones.
m = con.groupby(label).mean()  # mean of each continuous variable per cluster
# Expand the 25 discrete columns into 537 one-hot columns.
fac = DataFrame()
for c in col6:
    fac = pd.concat([fac, pd.get_dummies(dis[c], prefix=[c])], axis=1)
fac.insert(0, 'label', label)
s = fac.groupby(label).sum()  # per-cluster counts of each discrete level
# Total sample count per cluster, derived from the dummy columns of the
# first factor (its level counts sum to the cluster size).
num_factor = sum(fac.apply(sum)[1:3])
prop = s / num_factor
# --------------- Find the points nearest the center of the most dangerous
# cluster and export the labelled results.
zx = centers[4, :]  # cluster with label 4 causes the most harm ('最高' level)
# Euclidean distance of every sample to that center, vectorized
# (identical values to the original per-row loop, much faster).
juli = list(np.sqrt(((df5 - zx) ** 2).sum(axis=1)))
# Map cluster label -> danger-level name.
rel = {0: '较高', 1: '中等', 2: '最低', 3: '较低', 4: '最高'}
level = [rel[lab] for lab in label]
# Full result: original data + level / label / distance columns.
result = copy.copy(df)
result.insert(0, 'juli', juli)
result.insert(0, 'label', label)
result.insert(0, 'level', level)
result.to_csv('/Users/vicky/Desktop/建模/第一问/结果.csv', sep=',', header=True, index=False, encoding="utf_8_sig")
result.to_excel('/Users/vicky/Desktop/建模/第一问/结果.xlsx', header=True, index=False)
# Compact result keyed by event id only.
result3 = copy.copy(DataFrame(df.eventid))
result3.insert(0, 'juli', juli)
result3.insert(0, 'label', label)
result3.insert(0, 'level', level)
result3.to_csv('/Users/vicky/Desktop/建模/第一问/危险等级分类.csv', sep=',', header=True, index=False, encoding="utf_8_sig")
# NOTE(review): the original ended with a second, identical write of
# 结果.xlsx (copy-paste duplicate) — removed as redundant.