Reading the Data
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext()
spark = SparkSession(sc)
# Read the tab-separated text file into a DataFrame
row_df = spark.read.format("csv").option("header", "true").option("delimiter", "\t").load("train.tsv")
print(row_df.count())  # number of rows
row_df.printSchema()   # column names and types
7395
root
|-- url: string (nullable = true)
|-- urlid: string (nullable = true)
|-- boilerplate: string (nullable = true)
|-- alchemy_category: string (nullable = true)
|-- alchemy_category_score: string (nullable = true)
|-- avglinksize: string (nullable = true)
|-- commonlinkratio_1: string (nullable = true)
|-- commonlinkratio_2: string (nullable = true)
|-- commonlinkratio_3: string (nullable = true)
|-- commonlinkratio_4: string (nullable = true)
|-- compression_ratio: string (nullable = true)
|-- embed_ratio: string (nullable = true)
|-- framebased: string (nullable = true)
|-- frameTagRatio: string (nullable = true)
|-- hasDomainLink: string (nullable = true)
|-- html_ratio: string (nullable = true)
|-- image_ratio: string (nullable = true)
|-- is_news: string (nullable = true)
|-- lengthyLinkDomain: string (nullable = true)
|-- linkwordscore: string (nullable = true)
|-- news_front_page: string (nullable = true)
|-- non_markup_alphanum_characters: string (nullable = true)
|-- numberOfLinks: string (nullable = true)
|-- numwords_in_url: string (nullable = true)
|-- parametrizedLinkRatio: string (nullable = true)
|-- spelling_errors_ratio: string (nullable = true)
|-- label: string (nullable = true)
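Note that every column was read as a string, and the raw file marks missing values with "?". A quick check (a sketch on the same DataFrame) shows how common the placeholder is in alchemy_category, which the next section cleans up:
from pyspark.sql.functions import col
# Count rows whose alchemy_category is the "?" placeholder
row_df.filter(col("alchemy_category") == "?").count()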
Data Transformation
from pyspark.sql.functions import udf, col

def replace_question(x):
    return "0" if x == "?" else x

replace_question = udf(replace_question)
# Replace "?" with "0", then cast every column from alchemy_category_score onward to double
df = row_df.select(['url', 'alchemy_category'] +
                   [replace_question(col(column)).cast("double").alias(column)
                    for column in row_df.columns[4:]])
df.printSchema()  # column types after conversion
root
|-- url: string (nullable = true)
|-- alchemy_category: string (nullable = true)
|-- alchemy_category_score: double (nullable = true)
|-- avglinksize: double (nullable = true)
|-- commonlinkratio_1: double (nullable = true)
|-- commonlinkratio_2: double (nullable = true)
|-- commonlinkratio_3: double (nullable = true)
|-- commonlinkratio_4: double (nullable = true)
|-- compression_ratio: double (nullable = true)
|-- embed_ratio: double (nullable = true)
|-- framebased: double (nullable = true)
|-- frameTagRatio: double (nullable = true)
|-- hasDomainLink: double (nullable = true)
|-- html_ratio: double (nullable = true)
|-- image_ratio: double (nullable = true)
|-- is_news: double (nullable = true)
|-- lengthyLinkDomain: double (nullable = true)
|-- linkwordscore: double (nullable = true)
|-- news_front_page: double (nullable = true)
|-- non_markup_alphanum_characters: double (nullable = true)
|-- numberOfLinks: double (nullable = true)
|-- numwords_in_url: double (nullable = true)
|-- parametrizedLinkRatio: double (nullable = true)
|-- spelling_errors_ratio: double (nullable = true)
|-- label: double (nullable = true)
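As a side note, the same replacement can be done without a Python UDF using the built-in when/otherwise column expressions, which avoids Python serialization overhead. A minimal sketch producing the same result (df_alt is an illustrative name):
from pyspark.sql.functions import when, col
# Replace "?" with "0" using built-in expressions, then cast to double
df_alt = row_df.select(['url', 'alchemy_category'] +
                       [when(col(c) == "?", "0").otherwise(col(c)).cast("double").alias(c)
                        for c in row_df.columns[4:]])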
Splitting into Training and Test Sets
# Split 70/30 and cache both DataFrames
train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()
DataFrame[url: string, alchemy_category: string, alchemy_category_score: double, avglinksize: double, commonlinkratio_1: double, commonlinkratio_2: double, commonlinkratio_3: double, commonlinkratio_4: double, compression_ratio: double, embed_ratio: double, framebased: double, frameTagRatio: double, hasDomainLink: double, html_ratio: double, image_ratio: double, is_news: double, lengthyLinkDomain: double, linkwordscore: double, news_front_page: double, non_markup_alphanum_characters: double, numberOfLinks: double, numwords_in_url: double, parametrizedLinkRatio: double, spelling_errors_ratio: double, label: double]
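randomSplit draws a different split on every run; if reproducibility matters, pass a seed (the value below is arbitrary):
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)  # seed value is illustrative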
Encoding alchemy_category with StringIndexer
from pyspark.ml.feature import StringIndexer
categoryIndexer = StringIndexer(inputCol='alchemy_category', outputCol='alchemy_category_Index')
categoryTransformer = categoryIndexer.fit(df)
category_train = categoryTransformer.transform(train_df)
# Show the first five encoded rows
category_train.select("alchemy_category", "alchemy_category_Index").show(5)
+------------------+----------------------+
| alchemy_category|alchemy_category_Index|
+------------------+----------------------+
|arts_entertainment| 2.0|
| ?| 0.0|
| ?| 0.0|
| business| 3.0|
|arts_entertainment| 2.0|
+------------------+----------------------+
only showing top 5 rows
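StringIndexer assigns indices by descending frequency, so index 0 here is the most common category, "?". The learned mapping can be inspected on the fitted model:
# Category names in index order: labels[0] maps to index 0.0, and so on
print(categoryTransformer.labels)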
One-Hot Encoding with OneHotEncoder
from pyspark.ml.feature import OneHotEncoder
onehot = OneHotEncoder(dropLast=False, inputCol="alchemy_category_Index", outputCol="alchemy_category_vector")
# In Spark 2.x OneHotEncoder is a Transformer; in Spark 3.x it is an Estimator,
# so call onehot.fit(category_train).transform(category_train) instead
df2 = onehot.transform(category_train)
# Show the first five rows
df2.select("alchemy_category", "alchemy_category_Index", "alchemy_category_vector").show(5)
+------------------+----------------------+-----------------------+
| alchemy_category|alchemy_category_Index|alchemy_category_vector|
+------------------+----------------------+-----------------------+
|arts_entertainment| 2.0| (14,[2],[1.0])|
| ?| 0.0| (14,[0],[1.0])|
| ?| 0.0| (14,[0],[1.0])|
| business| 3.0| (14,[3],[1.0])|
|arts_entertainment| 2.0| (14,[2],[1.0])|
+------------------+----------------------+-----------------------+
only showing top 5 rows
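The vector column is stored sparsely: (14,[2],[1.0]) denotes a 14-dimensional vector with 1.0 at index 2 and zeros elsewhere. The same encoding can be reproduced by hand:
from pyspark.ml.linalg import SparseVector
v = SparseVector(14, [2], [1.0])  # same encoding as the first row above
print(v.toArray())                # dense view: 1.0 at position 2, zeros elsewhere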
Assembling the Features
from pyspark.ml.feature import VectorAssembler
# Combine the one-hot vector with the numeric columns (label excluded)
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df3 = assembler.transform(df2)
print(df3.columns)  # all columns after assembly
['url', 'alchemy_category', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'alchemy_category_Index', 'alchemy_category_vector', 'features']
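The resulting features column concatenates the 14 one-hot slots with the 22 numeric columns (the [4:-1] slice excludes label), giving a 36-dimensional vector. One row can be inspected like so:
print(df3.select("features").first()[0])  # a single assembled feature vector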
Prediction with a Decision Tree in a Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
stringIndex = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")
onehoter = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_vector")
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, dt])
pipeline.getStages()  # inspect the pipeline stages
[StringIndexer_460bba4dba684346139d,
OneHotEncoder_43f8be793b2bfc1bda6c,
VectorAssembler_4117bdc62478e1e7c978,
DecisionTreeClassifier_4bbe9381cabc88a82edc]
Training the Model
pipelineModel = pipeline.fit(train_df)
pipelineModel.stages[3]  # the fitted decision tree model
# Make predictions on the test set
predicted = pipelineModel.transform(test_df)
# Inspect predicted classes and probabilities
predicted.select("rawPrediction", "probability", "prediction", "label").take(5)
[Row(rawPrediction=DenseVector([3.0, 45.0]), probability=DenseVector([0.0625, 0.9375]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([103.0, 111.0]), probability=DenseVector([0.4813, 0.5187]), prediction=1.0, label=0.0),
Row(rawPrediction=DenseVector([27.0, 156.0]), probability=DenseVector([0.1475, 0.8525]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([168.0, 274.0]), probability=DenseVector([0.3801, 0.6199]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([24.0, 92.0]), probability=DenseVector([0.2069, 0.7931]), prediction=1.0, label=1.0)]
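For a decision tree, rawPrediction holds the per-class instance counts in the leaf a row lands in, and probability is those counts normalized (e.g. [3.0, 45.0] gives 45/48 = 0.9375). The trained tree itself can be printed for inspection; the full dump is long at maxDepth=10, so this sketch truncates it:
dtModel = pipelineModel.stages[3]   # the fitted DecisionTreeClassificationModel
print(dtModel.toDebugString[:500])  # first 500 characters of the tree structure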
Evaluating the Model with AUC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predicted)
auc
0.6445965905518819
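The same evaluator class also supports area under the precision-recall curve; switching the metric is a one-line change:
pr_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                             labelCol="label", metricName="areaUnderPR")
pr_evaluator.evaluate(predicted)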
Finding the Best Model Parameters with TrainValidationSplit
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
paramGrid = (ParamGridBuilder()
             .addGrid(dt.impurity, ["gini", "entropy"])
             .addGrid(dt.maxDepth, [5, 10, 15])
             .addGrid(dt.maxBins, [10, 15, 20])
             .build())
# Search for the best parameters on a held-out 20% of the training set
tsv = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)
tsv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, tsv])  # dt replaced by tsv
tsv_pipelineModel = tsv_pipeline.fit(train_df)
bestModel = tsv_pipelineModel.stages[3].bestModel  # extract the best model
bestModel
tsv_prediction = tsv_pipelineModel.transform(test_df)
tsv_auc = evaluator.evaluate(tsv_prediction)
tsv_auc
0.6469125299137065
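To see which grid combination won, the best model's hyperparameters can be read back. A sketch, assuming Spark 3.x where tree models expose param getters (on Spark 2.x, go through bestModel._java_obj instead):
# Winning hyperparameters of the best decision tree (Spark 3.x getters)
print(bestModel.getImpurity(), bestModel.getMaxDepth(), bestModel.getMaxBins())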
Using Cross-Validation
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, cv])
cv_pipelineModel = cv_pipeline.fit(train_df)
bestModel = cv_pipelineModel.stages[3].bestModel
bestModel
cv_prediction = cv_pipelineModel.transform(test_df)
cv_auc = evaluator.evaluate(cv_prediction)
cv_auc
0.6469125299137065
Prediction with a Random Forest
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
rfpipeline = Pipeline(stages=[stringIndex, onehoter, assembler, rf])
rfpipelineModel = rfpipeline.fit(train_df)
rf_prediction = rfpipelineModel.transform(test_df)
evaluator.evaluate(rf_prediction)
0.7437341450211594
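Even untuned, the random forest clearly beats the tuned decision tree. The same grid-search machinery applies to it as well; a sketch with illustrative parameter values:
rf_paramGrid = (ParamGridBuilder()
                .addGrid(rf.numTrees, [20, 50, 100])
                .addGrid(rf.maxDepth, [5, 10, 15])
                .build())
rf_cv = CrossValidator(estimator=rf, evaluator=evaluator,
                       estimatorParamMaps=rf_paramGrid, numFolds=3)
rf_cv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, rf_cv])
rf_cv_pipelineModel = rf_cv_pipeline.fit(train_df)
evaluator.evaluate(rf_cv_pipelineModel.transform(test_df))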
The training data train.tsv can be downloaded from: https://share.weiyun.com/qGM0bxwT