第六章
对数据进行拉格朗日插值:
import pandas as pd
from scipy.interpolate import lagrange
# Load the sheet whose missing cells are filled in by the interpolation below.
# NOTE(review): the original call was cut off right after the path (the
# parenthesis was never closed). header=None is assumed here, i.e. a sheet
# without a header row -- confirm against the actual data file.
miss = pd.read_excel('/home/yao/data/chapter6/demo/data/missing_data.xls',
                     header=None)
def p(s, n, k=5):
    """Estimate the value of ``s`` at label ``n`` by Lagrange interpolation.

    The polynomial is fitted through the non-null values found among the
    ``k`` labels before and the ``k`` labels after ``n``.

    Args:
        s: pandas Series indexed by integers (the labels are used as the
            x-coordinates of the interpolation).
        n: integer label of the value to estimate.
        k: number of neighbours taken on each side of ``n``.

    Returns:
        The interpolated value, or NaN when no neighbour in the window holds
        a value.
    """
    window = list(range(n - k, n)) + list(range(n + 1, n + 1 + k))
    # reindex() yields NaN for labels that do not exist (window reaching past
    # either end of the series) instead of raising KeyError like s[window].
    y = s.reindex(window)
    y = y[y.notnull()]
    if y.empty:
        # lagrange() on zero points would silently evaluate to 0.
        return float('nan')
    return lagrange(list(y.index), list(y))(n)
# Fill every missing cell of `miss`, column by column, with the value
# interpolated by p() from the neighbouring rows of the same column.
# Row labels are handed to p() as x-coordinates, so this assumes integer
# row labels (e.g. the default RangeIndex).
for i in miss.columns:
    for j in miss.index[miss[i].isnull()]:
        # .loc writes into the frame itself; the chained form miss[i][j] = ...
        # may assign to a temporary copy and leave `miss` unchanged.
        miss.loc[j, i] = p(miss[i], j)
使用随机函数将数据划分为训练数据和测试数据
# Load the modelling table and split its rows at random into 80% training
# data and 20% test data.
model = pd.read_excel('/home/yao/data/chapter6/model.xls', header=0)
from random import shuffle
# DataFrame.as_matrix() no longer exists in current pandas; .values works on
# both old and new versions.
model = model.values
# random.shuffle() swaps elements via `x[i], x[j] = x[j], x[i]`; on a 2-D
# ndarray the two sides are views, so rows get duplicated and others lost.
# Shuffle a list of row positions instead and reorder the array with it.
order = list(range(len(model)))
shuffle(order)
model = model[order]
n_train = int(len(model) * 0.8)
train = model[:n_train, :]
test = model[n_train:, :]
# cm_plot helper (the notes describe it as living in its own cm_plot module)
def cm_plot(y, yp):
    """Draw the confusion matrix of true labels ``y`` versus predictions ``yp``.

    Args:
        y: true labels.
        yp: predicted labels.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Greens)
    plt.colorbar()
    # matshow() draws cm[row, col] at x=col, y=row, so the annotation for
    # cm[row, col] has to be placed at xy=(col, row).
    for row in range(len(cm)):
        for col in range(len(cm)):
            plt.annotate(cm[row, col], xy=(col, row),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt
# Build and train the neural-network classifier (called the "LM" network in
# these notes): 3 inputs -> 10 ReLU units -> 1 sigmoid output.
from keras.models import Sequential
from keras.layers import Dense, Activation
net = Sequential()
# The layer width is passed positionally: the old `output_dim` keyword was
# renamed to `units` in later Keras releases.
net.add(Dense(10, input_dim=3))
net.add(Activation('relu'))
net.add(Dense(1))
net.add(Activation('sigmoid'))
net.compile(loss='binary_crossentropy', optimizer='adam')
# `nb_epoch` was renamed to `epochs`.
net.fit(train[:, :3], train[:, 3], epochs=100, batch_size=1)
# predict_classes() has been removed from Keras; for a single sigmoid output
# the class is the probability thresholded at 0.5.
predict_result = (net.predict(train[:, :3]) > 0.5).astype('int32').reshape(len(train))
# Import only the helper that is used instead of a star import.
from cm_plot import cm_plot
cm_plot(train[:, 3], predict_result).show()
# ROC curve of the neural network on the test rows.
# roc_curve must be imported before it is used here.
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
predict_result = net.predict(test[:, :3]).reshape(len(test))
fpr, tpr, thresholds = roc_curve(test[:, 3], predict_result, pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='Roc of lm')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1.05)
plt.ylim(0, 1.05)
# Without a legend the curve label passed to plot() is never displayed.
plt.legend(loc=4)
plt.show()
# Build the CART decision-tree model, then plot its confusion matrix on the
# training rows and its ROC curve on the test rows.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(train[:, :3], train[:, 3])
try:
    import joblib
except ImportError:
    # Older scikit-learn releases shipped joblib as sklearn.externals.joblib.
    from sklearn.externals import joblib
# To persist the fitted tree: joblib.dump(tree, treefile)
# (left disabled; `treefile` is not defined in the visible part of these notes)
from cm_plot import cm_plot
cm_plot(train[:, 3], tree.predict(train[:, :3])).show()
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Column 1 of predict_proba() is the probability of the second class, which
# is used as the score for pos_label=1.
fpr, tpr, thresholds = roc_curve(test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='Roc of cart', color='green')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1.05)
plt.ylim(0, 1.05)
# Without a legend the curve label passed to plot() is never displayed.
plt.legend(loc=4)
plt.show()
第七章: 航空公司客户价值分析
# Data exploration: summarise every column of the airline table and record
# how many values each column is missing.
import pandas as pd
air = pd.read_csv('/home/yao/data/chapter7/air_data.csv')
# Bare expression: its result is only visible in an interactive session.
air.describe().transpose()
explore = air.describe(include='all', percentiles=[]).transpose()
# Missing values per column = number of rows - number of non-null values.
explore['null'] = air.shape[0] - explore['count']
explore.to_csv('/home/yao/data/chapter7/explore.csv')
# Data preprocessing.
# Step 1: keep only rows where both SUM_YR_1 and SUM_YR_2 are present.
data = air[air['SUM_YR_1'].notnull() & air['SUM_YR_2'].notnull()]
# Step 2: keep rows where SUM_YR_1 or SUM_YR_2 is non-zero, or where both
# SEG_KM_SUM and avg_discount are zero.
index1 = data['SUM_YR_1'] != 0
index2 = data['SUM_YR_2'] != 0
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so
# the unparenthesised form evaluates `0 & data['avg_discount']` first.
index3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
data2 = data[index1 | index2 | index3]
# Data standardisation (z-score): subtract each column's mean and divide by
# that column's standard deviation.
zs = pd.read_excel('/home/yao/data/chapter7/zscoredata.xls')
zsn = zs.sub(zs.mean(axis=0)).div(zs.std(axis=0))
# K-means clustering of the standardised data into 5 clusters.
from sklearn.cluster import KMeans
# fit() returns the estimator itself, so construction and fitting can be chained.
kmodel = KMeans(n_clusters=5).fit(zsn)
# The two bare expressions below only display something in an interactive session.
kmodel.cluster_centers_  # cluster centres
kmodel.labels_  # cluster assigned to each sample
# Radar chart: one polygon per cluster centre, one axis per feature.
import numpy as np
import matplotlib.pyplot as plt
labels = zsn.columns  # feature names (the attribute is `columns`, not `column`)
k = len(labels)  # number of radar axes; must equal the width of cluster_centers_
plot_data = kmodel.cluster_centers_
color = ['b', 'g', 'r', 'c', 'y']
angles = np.linspace(0, 2 * np.pi, k, endpoint=False)
# Close each polygon by repeating its first point; the angles must be closed
# the same way, otherwise x and y passed to plot() differ in length.
plot_data = np.concatenate((plot_data, plot_data[:, [0]]), axis=1)
closed_angles = np.concatenate((angles, angles[:1]))
fig = plt.figure()
ax = fig.add_subplot(111, polar=True)
for i in range(len(plot_data)):
    # Cycle through the palette so more clusters than colours cannot raise IndexError.
    ax.plot(closed_angles, plot_data[i], 'o-', color=color[i % len(color)],
            label=u'客户群' + str(i), linewidth=2)
ax.set_rgrids(np.arange(0.01, 3.5, 0.5), np.arange(-1, 2.5, 0.5), fontproperties="SimHei")
# The open set of angles is used here: one tick per feature label.
ax.set_thetagrids(angles * 180 / np.pi, labels, fontproperties='SimHei')
# Without a legend the per-cluster labels passed to plot() are never displayed.
plt.legend(loc=4)
plt.show()