
# -*- coding: utf-8 -*-
# Dataset used to predict whether a web page is short-lived or stays popular
# for a long time. It is stored on HDFS under /user/yy/stumbleupon/.
# NOTE(review): the original comment's mention of the dataset's origin is
# truncated; judging by the path it is presumably the StumbleUpon data -- confirm.
rawData = sc.textFile("hdfs:///user/yy/stumbleupon/train_noheader.tsv")
# Split every tab-separated line into a list of raw string fields.
records = rawData.map(lambda line: line.split('\t'))
# 每一条样本是一个网页的信息

from pyspark.mllib.regression import LabeledPoint # 标注点类型,(label, feature)通常feature是Vectors.dense类型
from pyspark.mllib.linalg import Vectors

# Demo only: a benefit of DenseVector is that an RDD holding such vectors
# can be summed directly, giving a column-by-column total.
a = sc.parallelize([
    Vectors.dense(1, 2),
    Vectors.dense(3, 4),
])
a.sum()  # sums the vectors of the RDD column by column

# def records_processing(record):
#     trimmed = e: e.replaceAll('\"', ''))
#     label = int(trimmed[record.size - 1])
#     features = trimmed[4 : record.size - 1].map(lambda d: 0.0 if d == '?' else float(d))
#     return LabeledPoint(label, Vectors.dense(features))

sizeAndData = record : (len(record), map(lambda e: e.replace('\"', ''), record)))
labelAndFeature = (size, data): (int(data[size-1]), map(lambda d: 0.0 if d == '?' else float(d), data[4: size-1])))
data = (label, feature): LabeledPoint(label, Vectors.dense(map(lambda d: 0.0 if d < 0 else d, feature))))

LabeledPoint(0.0, [0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])



from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.tree import DecisionTree
# from pyspark.mllib.tree.configuration import Algo 
# from pyspark.mllib.tree.impurity import Entropy

# Number of iterations passed to the SGD-based trainers below.
numIterations = 10
# Maximum tree depth setting for the decision tree.
maxTreeDepth = 5
# First labeled sample, kept for single-point prediction checks.
dataPoint = data.first()

LabeledPoint(0.0, [0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

# 训练逻辑回归模型
lrModel = LogisticRegressionWithSGD.train(data, numIterations)

# 使用逻辑归回模型
prediction = lrModel.predict(dataPoint.features)
print "预测值:%d, 真实值:%d" % (prediction, dataPoint.label)

# 看看逻辑回归整体的预测结果
predictionsAndRealities = point: lrModel.predict(point.features) == point.label)

# 计算逻辑回归的准确率
lrTotalCorrect = point :
                         1 if lrModel.predict(point.features) == point.label else 0
lrAccuracy = float(lrTotalCorrect) / data.count()
print "逻辑回归准确率:", lrAccuracy

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
from pyspark.mllib.evaluation import BinaryClassificationMetrics
scoreAndLabels = point: (float(lrModel.predict(point.features)), point.label))

metrics = BinaryClassificationMetrics(scoreAndLabels)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (metrics.areaUnderPR * 100, metrics.areaUnderROC * 100)

# Output:
# 预测值:1, 真实值:0
# 逻辑回归准确率: 0.514672075727
# Area under PR: 75.6759%, Area under ROC: 50.1418%

# 训练SVM模型
svmModel = SVMWithSGD.train(data, numIterations)

# 使用SVM模型
prediction = svmModel.predict(dataPoint.features)
print "预测值:%d, 真实值:%d" % (prediction, dataPoint.label)

# 看看SVM整体的预测结果
predictionsAndRealities = point: svmModel.predict(point.features) == point.label)

# 计算SVM的准确率
svmTotalCorrect = point :
                         1 if svmModel.predict(point.features) == point.label else 0
svmAccuracy = float(svmTotalCorrect) / data.count()
print "SVM准确率:", svmAccuracy

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
scoreAndLabels = point: (float(svmModel.predict(point.features)), point.label))

metrics = BinaryClassificationMetrics(scoreAndLabels)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (metrics.areaUnderPR * 100, metrics.areaUnderROC * 100)

# Output:
# 预测值:1, 真实值:0
# SVM准确率: 0.514672075727
# Area under PR: 75.6759%, Area under ROC: 50.1418%

# 训练朴素贝叶斯模型
nbModel = NaiveBayes.train(data)

# 使用朴素贝叶斯模型
prediction = nbModel.predict(dataPoint.features)
print "预测值:%d, 真实值:%d" % (prediction, dataPoint.label)

# 看看朴素贝叶斯整体的预测结果
predictionsAndRealities = point: nbModel.predict(point.features) == point.label)

# 计算朴素贝叶斯的准确率
nbTotalCorrect = point :
                         1 if nbModel.predict(point.features) == point.label else 0
nbAccuracy = float(nbTotalCorrect) / data.count()
print "朴素贝叶斯准确率:", nbAccuracy

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
from pyspark.mllib.evaluation import BinaryClassificationMetrics
scoreAndLabels = point: (float(nbModel.predict(point.features)), point.label))

metrics = BinaryClassificationMetrics(scoreAndLabels)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (metrics.areaUnderPR * 100, metrics.areaUnderROC * 100)

# Output:
# 预测值:1, 真实值:0
# 朴素贝叶斯准确率: 0.580392156863
# Area under PR: 68.0851%, Area under ROC: 58.3559%

# 训练决策树模型
dtModel = DecisionTree.trainClassifier(data, 2, {})

# 使用决策树模型
prediction = dtModel.predict(dataPoint.features)
print "预测值:%d, 真实值:%d" % (prediction, dataPoint.label)

# 看看决策树整体的预测结果, 决策树模型只能在主节点上使用
predictions = dtModel.predict( point: point.features))
results = point: point.label))

# 计算决策树的准确率
dtTotalCorrect = results.filter(lambda (real, pred): pred == real).count()
dtAccuracy = float(dtTotalCorrect) / data.count()
print "决策树准确率:", dtAccuracy

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
metrics = BinaryClassificationMetrics(results)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (metrics.areaUnderPR * 100, metrics.areaUnderROC * 100)

# Output:
# 预测值:0, 真实值:0
# 决策树准确率: 0.648275862069
# Area under PR: 74.2894%, Area under ROC: 64.8916%

# 上面的结果跟随机差不多,原因是我们仅仅只是把数据送入模型
# 改进模型性能以及参数调优

# from pyspark.mllib.linalg.distributed import RowMatrix

matrix = point: point.features)
print "矩阵的第一行:\n", matrix.first()

result = matrix.stats()
print "矩阵各列的平均值:\n", result.mean()
print "矩阵各列的最小值:\n", result.min()
print "矩阵各列的最大值:\n", result.max()
print "矩阵各列的标准差:\n", result.stdev()
print "矩阵各列的方差:\n", result.variance()




# Output (as captured; the printed labels and some of the arrays appear to be
# missing from this export):
# [ 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0.04556422 0. 0.
# 0. 0. 0. 0. 1. 0. 0.
# 0. ]
#
# [ 9.99426000e-01 3.63000000e+02 1.00000000e+00 1.00000000e+00
# 9.80392157e-01 9.80392157e-01 2.10000000e+01 2.50000000e-01
# 0.00000000e+00 4.44444444e-01 1.00000000e+00 7.16883117e-01
# 1.13333333e+02 1.00000000e+00 1.00000000e+00 1.00000000e+02
# 1.00000000e+00 2.07952000e+05 4.99700000e+03 2.20000000e+01
# 1.00000000e+00 1.00000000e+00]
#
# [ 3.31251568e-01 8.61920979e+00 2.03119645e-01 1.46732834e-01
# 9.59717238e-02 7.26238264e-02 5.70392733e+00 7.42829375e-03
# 0.00000000e+00 4.14428371e-02 1.44152088e-01 5.24830153e-02
# 1.87878356e+00 4.86464752e-01 4.73603607e-01 2.03917219e+01
# 1.95386810e-01 8.87483231e+03 1.79454063e+02 3.23289275e+00
# 1.83273269e-01 7.92255262e-02]


from pyspark.mllib.feature import StandardScaler

# 第一个True表示均值正则化(每个值减去均值),第二个True表示正则化标准差(每个值除以标准差进行缩放)
scaler = StandardScaler(withMean=True, withStd=True).fit(matrix)
scaledData = scaler.transform(matrix)
scaledData = lp: lp.label).zip(scaledData).map(lambda (label, feature): LabeledPoint(label, feature))

print "标准化前:\n", data.first().features
print "标准化后:\n", scaledData.first().features

import numpy as np
vector = np.array(matrix.first())
mean = np.array(result.mean())
stdev = np.array(result.stdev())
print "手动标准化:\n", (vector - mean) / stdev




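# Output (as captured; only one array survives in this export -- the nan entry
# suggests it is the manual standardization, where a column's stdev is 0):
# [ 1.13772426 -0.08194111 1.02520913 -0.05586734 -0.46892496 -0.35432928
# -0.31755669 -0.12316245 nan 0.82887822 -0.1472789 0.22965535
# -0.19434481 0.79029149 0.71724323 -0.29801697 -0.20347634 -0.03296944
# -0.04878443 0.94013354 -0.10870584 -0.27883964]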

# 用标准化后的数据重新训练模型,不训练决策树和朴素贝叶斯,因为他俩不受特征标准化的影响

# 训练逻辑回归模型
lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)

# 看看逻辑回归整体的预测结果
lrTotalCorrectScaled = scaledData.filter(lambda point: lrModelScaled.predict(point.features) == point.label).count()
print "预测正确的样本个数", lrTotalCorrectScaled

# 计算逻辑回归的准确率
lrAccuracyScaled = float(lrTotalCorrectScaled) / data.count()
print "逻辑回归准确率:", lrAccuracyScaled

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
lrPredictionsVsTrue = point: (float(lrModelScaled.predict(point.features)), point.label))
lrMetricsScaled = BinaryClassificationMetrics(lrPredictionsVsTrue)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (lrMetricsScaled.areaUnderPR * 100, lrMetricsScaled.areaUnderROC * 100)

# Output:
# 预测正确的样本个数 4609
# 逻辑回归准确率: 0.623258958756
# Area under PR: 73.0204%, Area under ROC: 62.2292%

# 考虑网页的类别特征,比如这个网页是属于sports类还是business类
categories = dict( record: record[3].replace('\"', '')).distinct().zipWithIndex().collect())
print categories
numCategories = len(categories)
print numCategories

categories = sc.broadcast(categories)
numCategories = sc.broadcast(numCategories)

# Output:
# {u'gaming': 7, u'recreation': 0, u'business': 1, u'computer_internet': 2, u'unknown': 8, u'culture_politics': 3, u'science_technology': 9, u'law_crime': 4, u'sports': 10, u'religion': 11, u'weather': 12, u'health': 5, u'?': 6, u'arts_entertainment': 13}

def buildCategoriesVector(data):
    """Return the 1-of-k category encoding followed by the remaining fields.

    ``data`` is one record as a list of fields. Field 3 is looked up in the
    broadcast ``categories`` dictionary to find which slot of the
    ``numCategories.value``-long indicator list is set to 1; fields 4 up to
    (but not including) the last one are appended unchanged.
    """
    oneHot = [0] * numCategories.value
    hotSlot = categories.value[data[3]]
    oneHot[hotSlot] = 1
    remainingFields = data[4:-1]
    return oneHot + remainingFields

sizeAndData = record : (len(record), map(lambda e: e.replace('\"', ''), record)))
labelAndFeature = (size, data): (int(data[-1]), map(lambda d: 0.0 if d == '?' else float(d), buildCategoriesVector(data))))
dataCategories = (label, feature): LabeledPoint(label, Vectors.dense(map(lambda d: 0.0 if d < 0 else d, feature))))
dataPointCats = dataCategories.first()
print labelAndFeature.first()
print dataPointCats 

(0, [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.789131, 2.055555556, 0.676470588, 0.205882353, 0.047058824, 0.023529412, 0.443783175, 0.0, 0.0, 0.09077381, 0.0, 0.245831182, 0.003883495, 1.0, 1.0, 24.0, 0.0, 5424.0, 170.0, 8.0, 0.152941176, 0.079129575])


# 标准化转换
labels = lp: lp.label)
featuresMatrix = lp: lp.features)
scalerCats = StandardScaler(withMean=True, withStd=True).fit(featuresMatrix)
scaledDataCats = scalerCats.transform(featuresMatrix).zip(labels).map(lambda (features, label): LabeledPoint(label, features))

LabeledPoint(0.0, [-0.446421204794,2.72073665645,-0.204182210579,-0.220526884579,-0.0648775723926,-0.270999069693,-0.680752790425,-0.101894690972,-0.028494000387,-0.201654052319,-0.232727977095,-0.0991499193088,-0.0232621058984,-0.381813223243,1.1376473365,-0.0819355716929,1.02513981289,-0.0558635644254,-0.468893253129,-0.354305326308,-0.317535217236,-0.123154125351,0.0,0.828822173315,-0.147268943346,0.229639823578,-0.194331667814,0.790238049918,0.717194729453,-0.297996816496,-0.20346257793,-0.0329672096969,-0.0487811297558,0.940069975117,-0.108698488525,-0.278820782314])

lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)

# 看看逻辑回归整体的预测结果
lrTotalCorrectScaledCats = scaledDataCats.filter(lambda point: lrModelScaledCats.predict(point.features) == point.label).count()
print "预测正确的样本个数", lrTotalCorrectScaledCats

# 计算逻辑回归的准确率
lrAccuracyScaledCats = float(lrTotalCorrectScaledCats) / scaledDataCats.count()
print "逻辑回归准确率:", lrAccuracyScaledCats

# 计算PR(准确率-召回率)曲线下面积, 以及ROC(受试者工作特征曲线, 真阳性率-假阳性率)曲线下面积即AUC
lrPredictionsVsTrue = point: (float(lrModelScaledCats.predict(point.features)), point.label))
lrMetricsScaledCats = BinaryClassificationMetrics(lrPredictionsVsTrue)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (lrMetricsScaledCats.areaUnderPR * 100, lrMetricsScaledCats.areaUnderROC * 100)

# Output:
# 预测正确的样本个数 4928
# 逻辑回归准确率: 0.666396213658
# Area under PR: 75.8535%, Area under ROC: 66.6127%

# Use only the 1-of-k encoded category feature.
def onlyCategoriesVector(data):
    """Return the 1-of-k indicator list for the category of one record.

    Field 3 of ``data`` is looked up in the broadcast ``categories``
    dictionary; the matching slot of a ``numCategories.value``-long list of
    zeros is set to 1.
    """
    oneHot = [0] * numCategories.value
    hotSlot = categories.value[data[3]]
    oneHot[hotSlot] = 1
    return oneHot

sizeAndData = record : (len(record), map(lambda e: e.replace('\"', ''), record)))
labelAndFeature = (size, data): \
                                  (int(data[-1]), map(lambda d: 0.0 if d == '?' else float(d), onlyCategoriesVector(data))))
dataNB = (label, feature): \
                             LabeledPoint(label, Vectors.dense(map(lambda d: 0.0 if d < 0 else d, feature))))

# 训练朴素贝叶斯模型
nbModelCats = NaiveBayes.train(dataNB)

# 计算朴素贝叶斯的准确率
nbTotalCorrectCats = point :
                         1 if nbModelCats.predict(point.features) == point.label else 0
nbAccuracyCats = float(nbTotalCorrectCats) / dataNB.count()
print "朴素贝叶斯准确率:", nbAccuracyCats

nbPredictionsVsTrueCats = point: (float(nbModelCats.predict(point.features)), point.label))
nbMetricsCats = BinaryClassificationMetrics(nbPredictionsVsTrueCats)
print 'Area under PR: %2.4f%%, Area under ROC: %2.4f%%' % \
    (nbMetricsCats.areaUnderPR * 100, nbMetricsCats.areaUnderROC * 100)

# Output:
# 朴素贝叶斯准确率: 0.609601081812
# Area under PR: 74.0522%, Area under ROC: 60.5138%

