0. Libraries used
pyspark, jupyter, numpy, pandas, matplotlib
1. 特征提取
# Fix truncated import: the original line read "port numpy as np".
import numpy as np
def extract_features(field, categoryMap, featureEnd):
    """Build one feature vector for a parsed TSV row.

    The category column (field[3]) is one-hot encoded via *categoryMap*
    (category name -> index), and the numeric columns field[4:featureEnd]
    are appended, with the '?' missing-value marker treated as 0.
    """
    one_hot = np.zeros(len(categoryMap))
    one_hot[categoryMap[field[3]]] = 1
    # Same conversion covt_float performs, inlined here: '?' -> 0, else float.
    numeric = [0 if value == '?' else float(value) for value in field[4:featureEnd]]
    return np.concatenate((one_hot, numeric))
def covt_float(x):
    """Convert *x* to float, mapping the '?' missing-value marker to 0."""
    if x == '?':
        return 0
    return float(x)
def extract_label(field):
    """Return the last column of the row (the class label) as a float."""
    return float(field[-1])
2. 数据准备
from pyspark.mllib.regression import LabeledPoint
def PrepateData(sc):
    """Load data/train.tsv, build LabeledPoints and split them 80/10/10.

    Side effect: sets the module-level ``Path`` (also used by PredicateData)
    depending on whether *sc* runs locally or against a cluster.

    Returns ``(trainData, validationData, testData, categoryMap)`` where
    categoryMap maps the category name (column 3) to its one-hot index.
    """
    global Path
    if sc.master[0:5] == 'local':
        Path = 'file:/opt/data_all/'
    else:
        # BUGFIX: the original string had no trailing '/', so the file path
        # became 'hdfs://localhost:9000/user/hduserdata/train.tsv'.
        Path = 'hdfs://localhost:9000/user/hduser/'
    print('start import data ...')
    rawDataWithHeader = sc.textFile(Path + 'data/train.tsv')
    header = rawDataWithHeader.first()
    # Drop the header row, strip quote characters, split into columns.
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print('get total line : ', lines.count())
    # Assign each distinct category (column 3) a stable one-hot index.
    categoriyMap = lines.map(lambda field: field[3]).distinct() \
        .zipWithIndex().collectAsMap()
    # len(r) - 1 excludes the final column, which is the label.
    labelPointRDD = lines.map(lambda r: LabeledPoint(
        extract_label(r),
        extract_features(r, categoriyMap, len(r) - 1)))
    (trainData, validateionData, testData) = labelPointRDD.randomSplit([8, 1, 1])
    print("{} {} {}".format(trainData.count(), validateionData.count(),
                            testData.count()))
    return (trainData, validateionData, testData, categoriyMap)
3. 数据持久化 & 训练
# Prepare the splits once and cache them: each RDD is iterated several times
# during the training/evaluation runs below.
(train,vali,test,cate) = PrepateData(sc)
train.persist()
vali.persist()
test.persist()
from pyspark.mllib.tree import DecisionTree
# Baseline binary classifier. categoricalFeaturesInfo is empty because the
# category column was already one-hot encoded inside extract_features.
model = DecisionTree.trainClassifier(train,numClasses =2 , categoricalFeaturesInfo ={},impurity = 'entropy',maxDepth = 5, maxBins = 5)
4. 预测
def PredicateData(sc, model, categoriyMap):
    """Print an ephemeral/evergreen prediction for the first 10 test rows.

    Reads data/test.tsv (which has no label column, hence featureEnd=len(r))
    and relies on the module-level ``Path`` set earlier by PrepateData.
    """
    raw_with_header = sc.textFile(Path + 'data/test.tsv')
    first_line = raw_with_header.first()
    body = raw_with_header.filter(lambda x: x != first_line)
    unquoted = body.map(lambda x: x.replace("\"", ""))
    rows = unquoted.map(lambda x: x.split("\t"))
    print('get total line : ', rows.count())
    # Pair each site URL (column 0) with its feature vector.
    url_and_features = rows.map(
        lambda r: (r[0], extract_features(r, categoriyMap, len(r))))
    label_desc = {
        0: 'ephemeral',
        1: 'evergreen'
    }
    for url, features in url_and_features.take(10):
        outcome = model.predict(features)
        print('site {} :{}'.format(url, label_desc[outcome]))
# Show predictions for the first 10 rows of the test set with the trained model.
PredicateData(sc,model,cate)
5. 评估
from pyspark.mllib.evaluation import BinaryClassificationMetrics
def evaluate(model, validtationData):
    """Return the AUC of *model* on *validtationData* (RDD of LabeledPoint).

    BUGFIX: the original body read the global ``vali`` instead of the
    ``validtationData`` parameter, so every call evaluated the same RDD no
    matter which data set was passed in (breaking trainEvaluateModel too).
    """
    score = model.predict(validtationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validtationData.map(lambda p: p.label))
    metric = BinaryClassificationMetrics(scoreAndLabels)
    auc = metric.areaUnderROC
    print('auc={}'.format(auc))
    return auc
# AUC of the baseline entropy/depth-5 tree on the validation split.
auc = evaluate(model,vali)
6. 训练-评估
from time import time
def trainEvaluateModel(trainData,valiData,impurityParm,maxDepthParm,maxBinParam):
startTime = time()
model = DecisionTree.trainClassifier(trainData, numClasses = 2,categoricalFeaturesInfo = {},
impurity = impurityParm,
maxDepth=maxDepthParm,
maxBins = maxBinParam)
auc = evaluate(model,valiData)
duration = time()- startTime
print ("evaluate reuslt: {} {}".format(duration,auc))
return (auc,duration,impurityParm, maxDepthParm,maxBinParam,model)
# Single baseline run: entropy impurity, maxDepth=5, maxBins=5.
(aux,du, impurity,depth,bing,model) = trainEvaluateModel(train,vali,'entropy',5,5)
# Hyper-parameter grid: both impurity measures at one depth/bin setting,
# so exactly 2 runs in total.
impurityList = ['gini','entropy']
maxDepthList = [10]
maxBinList = [10]
# print('depth {}'.format(maxDepthList))
# One (auc, duration, impurity, maxDepth, maxBins, model) tuple per combination.
metircs = [trainEvaluateModel(train,vali,impurity,maxDepth,maxBins)
for impurity in impurityList
for maxDepth in maxDepthList
for maxBins in maxBinList]
7.0 数据可视化
import pandas as pd
# One row per grid run; impurity is the only parameter that varies, so it
# doubles as the row index (lengths match: 2 runs, 2 impurity values).
indexlist = impurityList
df = pd.DataFrame(metircs, index = indexlist,
columns=['auc','duration','impurity','maxdepth','maxBins','model'])
# Notebook-style display of the results table.
df
import matplotlib.pyplot as plt
def showChart(df, evalParam, barData, lineData, yMin, yMax):
    """Bar-plot df[barData] per row, with df[lineData] on a twin y-axis.

    *evalParam* labels the x-axis and the chart title; yMin/yMax clamp the
    bar axis so small AUC differences stay visible.
    """
    bar_axis = df[barData].plot(
        kind='bar', title=evalParam,
        figsize=(10, 6), legend=True, fontsize=12)
    bar_axis.set_xlabel(evalParam, fontsize=12)
    bar_axis.set_ylim(yMin, yMax)
    bar_axis.set_ylabel(barData, fontsize=12)
    # Second y-axis sharing the same x positions for the line series.
    line_axis = bar_axis.twinx()
    line_axis.plot(df[lineData].values, linestyle='-', marker='o',
                   linewidth=2.0, color='r')
    plt.show()
# Compare the impurity settings: AUC as bars, training duration as the line.
showChart(df,'impurity','auc','duration',0.5,0.7)