决策树是在数据分类问题中的一种常用且经典的机器学习方法,在本例里使用决策树模型来分析StumbleUpon数据集,来预测网页是暂时的(ephemeral)还是长青的(evergreen),并且调校参数找出最佳的参数组合,来提高预测准确度。
像这类经典的二分类问题,在python中调包来做的话会非常容易,且分析手段也有很多。但是现在的练习任务是使用Spark来对这类问题进行处理,因此,下面将开始介绍使用Spark进行二分类问题的过程。
第一步:分析数据的特性
我们在本例中要使用的数据来自于Kaggle官网的数据,这份数据的维度很大,有些数据的列是没有意义的,在选择时候要规避。数据虽然说维度很大,但是很多列都涉及到了一些文本的内容,因此在这个问题中不考虑维度灾难,进行初步的筛选之后就进行建模了。
第二步:启动Ipython Notebook
cd ~/pythonwork/ipynotebook
PYSPARK_DRIVER_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook" MASTER=local[*] pyspark
第三步:进行数据准备
原始数据需要变换成训练模型所需的数据格式LabeledPoint,并以随机方式按照8:1:1比例把数据分割为3个部分trainData、validationData和testData。
from pyspark.mllib.regression import LabeledPoint
def SetPath(sc):
    """Set the global data path prefix based on the Spark master URL.

    Local mode reads data from the local filesystem; any other master
    (i.e. a cluster) reads from HDFS.  Also returns the chosen path so
    callers and tests can use it directly (original returned None;
    returning the value is backward-compatible).
    """
    global Path
    if sc.master[0:5] == "local":
        # Running locally: data lives on the local filesystem.
        Path = "file:/home/jorlinlee/pythonsparkexample/PythonProject/"
    else:
        # Running on a cluster: data lives on HDFS.
        Path = "hdfs://master:9000/user/jorlinlee/"
    return Path
def get_mapping(rdd, idx):
    """Build a value -> index dictionary from column *idx* of *rdd*."""
    column = rdd.map(lambda fields: fields[idx])
    return column.distinct().zipWithIndex().collectAsMap()
def extract_label(record):
    """Return the last field of *record* converted to a float label."""
    return float(record[-1])
def extract_features(field, categoriesMap, featureEnd):
    """Build the feature vector for one parsed record.

    Parameters:
        field: list of string fields for one record; column 3 is the
            category name, columns 4..featureEnd-1 are numeric features.
        categoriesMap: category name -> one-hot index (built on training data).
        featureEnd: index one past the last numeric column to use.

    Returns a numpy array: one-hot-encoded category followed by the
    numeric columns, with '?' treated as 0 by convert_float.
    """
    # One-hot encode the category found in column 3.
    categoryIdx = categoriesMap[field[3]]
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoryIdx] = 1
    # FIX: the original comprehension shadowed the parameter name
    # ('for field in field[4:featureEnd]') — it worked only because the
    # iterable is evaluated before the loop variable binds; use a
    # distinct name to remove the trap.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))
def convert_float(x):
    """Parse *x* as a float, mapping the missing-value marker '?' to 0."""
    if x == "?":
        return 0
    return float(x)
def PrepareData(sc):
    """Load train.tsv and build LabeledPoint RDDs.

    Returns (trainData, validationData, testData, categoriesMap): the
    data split randomly 8:1:1, plus the category-name -> index map
    needed later to featurize the test set.
    """
    print("Data loading...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    # Drop the header row, strip quote characters, then split on tabs.
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    lines = rawData.map(lambda x: x.replace("\"", "")).map(lambda x: x.split("\t"))
    print("The number of data" + str(lines.count()))
    # Map each distinct category name (column 3) to a one-hot index.
    categoriesMap = lines.map(lambda fields: fields[3]) \
                         .distinct().zipWithIndex().collectAsMap()
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r),
                               extract_features(r, categoriesMap, len(r) - 1)))
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("The number of trainData:" + str(trainData.count()) +
          " The number of validationData:" + str(validationData.count()) +
          " The number of testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
第四步:对模型进行训练
def PredictData(sc, model, categoriesMap):
    """Load test.tsv and print model predictions for the first 10 pages."""
    print("Data loading.....")
    rawDataWithHeader = sc.textFile(Path + "data/test.tsv")
    header = rawDataWithHeader.first()
    # Same cleaning pipeline as training: drop header, strip quotes, split.
    cleaned = rawDataWithHeader.filter(lambda x: x != header) \
                               .map(lambda x: x.replace("\"", ""))
    lines = cleaned.map(lambda x: x.split("\t"))
    print("The number of data:" + str(lines.count()))
    # Keep the URL (column 0) alongside the extracted feature vector.
    dataRDD = lines.map(
        lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {0: "ephemeral", 1: "evergreen"}
    for data in dataRDD.take(10):
        predictResult = model.predict(data[1])
        print("The web: " + str(data[0]) + "\n" + "Predict:" + str(predictResult) + "Illustration" + DescDict[predictResult] + "\n")
第五步:评估模型的准确率(使用AUC)
def evaluateModel(model, validationData):
    """Return the AUC of *model* on *validationData* (a LabeledPoint RDD)."""
    scores = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # BinaryClassificationMetrics expects an RDD of (score, label) pairs.
    metrics = BinaryClassificationMetrics(scores.zip(labels))
    return metrics.areaUnderROC
第六步:评估并确定模型的参数(需要确定impurityParm、maxDepthParm和maxBinsParm)
def trainEvaluateModel(trainData, validationData,
                       impurityParm, maxDepthParm, maxBinsParm):
    """Train one decision tree, measure its AUC and training time.

    Returns (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model)
    so callers can sort candidate parameter combinations by AUC.
    """
    startTime = time()
    model = DecisionTree.trainClassifier(
        trainData,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity=impurityParm,
        maxDepth=maxDepthParm,
        maxBins=maxBinsParm)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Parameter" +
          " impurity=" + str(impurityParm) +
          " maxDepth=" + str(maxDepthParm) +
          " maxBins=" + str(maxBinsParm) +
          " Time=" + str(duration) +
          " AUC = " + str(AUC))
    return (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model)
def evalParameter(trainData, validationData, evalparm,
                  impurityList, maxDepthList, maxBinsList):
    """Train one model per parameter combination and chart AUC/duration.

    evalparm names which of the three lists is being swept ('impurity',
    'maxDepth' or 'maxBins'); the other two lists should contain exactly
    one value so the chart's x-axis matches the swept list.

    Raises:
        ValueError: for an unknown evalparm.  (The original code had no
        else branch, so IndexList stayed unbound and the function died
        later with a confusing NameError.)
    """
    metrics = [trainEvaluateModel(trainData, validationData,
                                  impurity, maxDepth, maxBins)
               for impurity in impurityList
               for maxDepth in maxDepthList
               for maxBins in maxBinsList]
    if evalparm == "impurity":
        IndexList = impurityList[:]
    elif evalparm == "maxDepth":
        IndexList = maxDepthList[:]
    elif evalparm == "maxBins":
        IndexList = maxBinsList[:]
    else:
        raise ValueError(
            "evalparm must be 'impurity', 'maxDepth' or 'maxBins', got "
            + repr(evalparm))
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['AUC', 'duration', 'impurity',
                               'maxDepth', 'maxBins', 'model'])
    showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)
使用可视化方法来作为参数选择的参考
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Plot *barData* as bars (left axis) and *lineData* as a red line
    on a twinned right axis, then display the figure."""
    ax = df[barData].plot(kind='bar', title=evalparm, figsize=(10, 6),
                          legend=True, fontsize=12)
    ax.set_xlabel(evalparm, fontsize=12)
    ax.set_ylabel(barData, fontsize=12)
    ax.set_ylim([yMin, yMax])
    twin = ax.twinx()
    twin.plot(df[[lineData]].values, linestyle='-', marker='o',
              linewidth=2.0, color='r')
    plt.show()
找出准确率最高的参数组合
def evalAllParameter(trainData, validationData,
impurityList, maxDepthList, maxBinsList):
metrics = [trainEvaluateModel(trainData, validationData,
impurity,maxDepth, maxBins )
for impurity in impurityList
for maxDepth in maxDepthList
for maxBins in maxBinsList ]
Smetrics = sorted(metrics, key=lambda k: k[0], reverse=True)
bestParameter=Smetrics[0]
print("The best parameter:impurity:" + str(bestParameter[2]) +
" ,maxDepth:" &#