Reading the Data
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext()
spark = SparkSession(sc)
# Read the tab-separated text file into a DataFrame
row_df = spark.read.format("csv").option("header", "true").option("delimiter", "\t").load("train.tsv")
print(row_df.count())  # number of rows
row_df.printSchema()   # column names and types
7395
root
|-- url: string (nullable = true)
|-- urlid: string (nullable = true)
|-- boilerplate: string (nullable = true)
|-- alchemy_category: string (nullable = true)
|-- alchemy_category_score: string (nullable = true)
|-- avglinksize: string (nullable = true)
|-- commonlinkratio_1: string (nullable = true)
|-- commonlinkratio_2: string (nullable = true)
|-- commonlinkratio_3: string (nullable = true)
|-- commonlinkratio_4: string (nullable = true)
|-- compression_ratio: string (nullable = true)
|-- embed_ratio: string (nullable = true)
|-- framebased: string (nullable = true)
|-- frameTagRatio: string (nullable = true)
|-- hasDomainLink: string (nullable = true)
|-- html_ratio: string (nullable = true)
|-- image_ratio: string (nullable = true)
|-- is_news: string (nullable = true)
|-- lengthyLinkDomain: string (nullable = true)
|-- linkwordscore: string (nullable = true)
|-- news_front_page: string (nullable = true)
|-- non_markup_alphanum_characters: string (nullable = true)
|-- numberOfLinks: string (nullable = true)
|-- numwords_in_url: string (nullable = true)
|-- parametrizedLinkRatio: string (nullable = true)
|-- spelling_errors_ratio: string (nullable = true)
|-- label: string (nullable = true)
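Note that every column was read as a string, and the raw file marks missing values with "?". A quick check (a sketch on the same DataFrame) shows how common the placeholder is in alchemy_category, which the next section cleans up:
from pyspark.sql.functions import col
# Count rows whose alchemy_category is the "?" placeholder
row_df.filter(col("alchemy_category") == "?").count()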
Data Transformation
from pyspark.sql.functions import udf, col

def replace_question(x):
    return "0" if x == "?" else x

replace_question = udf(replace_question)
# Replace "?" with "0", then cast every column from alchemy_category_score onward to double
df = row_df.select(['url', 'alchemy_category'] +
                   [replace_question(col(column)).cast("double").alias(column)
                    for column in row_df.columns[4:]])
df.printSchema()  # column types after conversion
root
|-- url: string (nullable = true)
|-- alchemy_category: string (nullable = true)
|-- alchemy_category_score: double (nullable = true)
|-- avglinksize: double (nullable = true)
|-- commonlinkratio_1: double (nullable = true)
|-- commonlinkratio_2: double (nullable = true)
|-- commonlinkratio_3: double (nullable = true)
|-- commonlinkratio_4: double (nullable = true)
|-- compression_ratio: double (nullable = true)
|-- embed_ratio: double (nullable = true)
|-- framebased: double (nullable = true)
|-- frameTagRatio: double (nullable = true)
|-- hasDomainLink: double (nullable = true)
|-- html_ratio: double (nullable = true)
|-- image_ratio: double (nullable = true)
|-- is_news: double (nullable = true)
|-- lengthyLinkDomain: double (nullable = true)
|-- linkwordscore: double (nullable = true)
|-- news_front_page: double (nullable = true)
|-- non_markup_alphanum_characters: double (nullable = true)
|-- numberOfLinks: double (nullable = true)
|-- numwords_in_url: double (nullable = true)
|-- parametrizedLinkRatio: double (nullable = true)
|-- spelling_errors_ratio: double (nullable = true)
|-- label: double (nullable = true)
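As a side note, the same replacement can be done without a Python UDF using the built-in when/otherwise column expressions, which avoids Python serialization overhead. A minimal sketch producing the same result (df_alt is an illustrative name):
from pyspark.sql.functions import when, col
# Replace "?" with "0" using built-in expressions, then cast to double
df_alt = row_df.select(['url', 'alchemy_category'] +
                       [when(col(c) == "?", "0").otherwise(col(c)).cast("double").alias(c)
                        for c in row_df.columns[4:]])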
Splitting into Training and Test Sets
# Split 70/30 and cache both DataFrames
train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()
DataFrame[url: string, alchemy_category: string, alchemy_category_score: double, avglinksize: double, commonlinkratio_1: double, commonlinkratio_2: double, commonlinkratio_3: double, commonlinkratio_4: double, compression_ratio: double, embed_ratio: double, framebased: double, frameTagRatio: double, hasDomainLink: double, html_ratio: double, image_ratio: double, is_news: double, lengthyLinkDomain: double, linkwordscore: double, news_front_page: double, non_markup_alphanum_characters: double, numberOfLinks: double, numwords_in_url: double, parametrizedLinkRatio: double, spelling_errors_ratio: double, label: double]
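randomSplit draws a different split on every run; if reproducibility matters, pass a seed (the value below is arbitrary):
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)  # seed value is illustrative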
Encoding alchemy_category with StringIndexer
from pyspark.ml.feature import StringIndexer
categoryIndexer = StringIndexer(inputCol='alchemy_category', outputCol='alchemy_category_Index')
categoryTransformer = categoryIndexer.fit(df)
category_train = categoryTransformer.transform(train_df)
# Show the first five encoded rows
category_train.select("alchemy_category", "alchemy_category_Index").show(5)
+------------------+----------------------+
| alchemy_category|alchemy_category_Index|
+------------------+----------------------+
|arts_entertainment| 2.0|
| ?| 0.0|
| ?| 0.0|
| business| 3.0|
|arts_entertainment| 2.0|
+------------------+----------------------+
only showing top 5 rows
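StringIndexer assigns indices by descending frequency, so index 0 here is the most common category, "?". The learned mapping can be inspected on the fitted model:
# Category names in index order: labels[0] maps to index 0.0, and so on
print(categoryTransformer.labels)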
One-Hot Encoding with OneHotEncoder
from pyspark.ml.feature import OneHotEncoder
onehot = OneHotEncoder(dropLast=False, inputCol="alchemy_category_Index", outputCol="alchemy_category_vector")
# In Spark 2.x OneHotEncoder is a Transformer; in Spark 3.x it is an Estimator,
# so call onehot.fit(category_train).transform(category_train) instead
df2 = onehot.transform(category_train)
# Show the first five rows
df2.select("alchemy_category", "alchemy_category_Index", "alchemy_category_vector").show(5)
+------------------+----------------------+-----------------------+
| alchemy_category|alchemy_category_Index|alchemy_category_vector|
+------------------+----------------------+-----------------------+
|arts_entertainment| 2.0| (14,[2],[1.0])|
| ?| 0.0| (14,[0],[1.0])|
| ?| 0.0| (14,[0],[1.0])|
| business| 3.0| (14,[3],[1.0])|
|arts_entertainment| 2.0| (14,[2],[1.0])|
+------------------+----------------------+-----------------------+
only showing top 5 rows
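The vector column is stored sparsely: (14,[2],[1.0]) denotes a 14-dimensional vector with 1.0 at index 2 and zeros elsewhere. The same encoding can be reproduced by hand:
from pyspark.ml.linalg import SparseVector
v = SparseVector(14, [2], [1.0])  # same encoding as the first row above
print(v.toArray())                # dense view: 1.0 at position 2, zeros elsewhere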
Assembling the Features
from pyspark.ml.feature import VectorAssembler
# Combine the one-hot vector with the numeric columns (label excluded)
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df3 = assembler.transform(df2)
print(df3.columns)  # all columns after assembly
['url', 'alchemy_category', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'alchemy_category_Index', 'alchemy_category_vector', 'features']
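The resulting features column concatenates the 14 one-hot slots with the 22 numeric columns (the [4:-1] slice excludes label), giving a 36-dimensional vector. One row can be inspected like so:
print(df3.select("features").first()[0])  # a single assembled feature vector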
Prediction with a Decision Tree in a Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
stringIndex = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")
onehoter = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_vector")
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, dt])
pipeline.getStages()  # inspect the pipeline stages
[StringIndexer_460bba4dba684346139d,
OneHotEncoder_43f8be793b2bfc1bda6c,
VectorAssembler_4117bdc62478e1e7c978,
DecisionTreeClassifier_4bbe9381cabc88a82edc]
Training the Model
pipelineModel = pipeline.fit(train_df)
pipelineModel.stages[3]  # the fitted decision tree model
# Make predictions on the test set
predicted = pipelineModel.transform(test_df)
# Inspect predicted classes and probabilities
predicted.select("rawPrediction", "probability", "prediction", "label").take(5)
[Row(rawPrediction=DenseVector([3.0, 45.0]), probability=DenseVector([0.0625, 0.9375]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([103.0, 111.0]), probability=DenseVector([0.4813, 0.5187]), prediction=1.0, label=0.0),
Row(rawPrediction=DenseVector([27.0, 156.0]), probability=DenseVector([0.1475, 0.8525]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([168.0, 274.0]), probability=DenseVector([0.3801, 0.6199]), prediction=1.0, label=1.0),
Row(rawPrediction=DenseVector([24.0, 92.0]), probability=DenseVector([0.2069, 0.7931]), prediction=1.0, label=1.0)]
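For a decision tree, rawPrediction holds the per-class instance counts in the leaf a row lands in, and probability is those counts normalized (e.g. [3.0, 45.0] gives 45/48 = 0.9375). The trained tree itself can be printed for inspection; the full dump is long at maxDepth=10, so this sketch truncates it:
dtModel = pipelineModel.stages[3]   # the fitted DecisionTreeClassificationModel
print(dtModel.toDebugString[:500])  # first 500 characters of the tree structure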
Evaluating the Model with AUC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predicted)
auc
0.6445965905518819
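The same evaluator class also supports area under the precision-recall curve; switching the metric is a one-line change:
pr_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                             labelCol="label", metricName="areaUnderPR")
pr_evaluator.evaluate(predicted)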
Finding the Best Model Parameters with TrainValidationSplit
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
paramGrid = (ParamGridBuilder()
             .addGrid(dt.impurity, ["gini", "entropy"])
             .addGrid(dt.maxDepth, [5, 10, 15])
             .addGrid(dt.maxBins, [10, 15, 20])
             .build())
# Search for the best parameters on a held-out 20% of the training set
tsv = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)
tsv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, tsv])  # dt replaced by tsv
tsv_pipelineModel = tsv_pipeline.fit(train_df)
bestModel = tsv_pipelineModel.stages[3].bestModel  # extract the best model
bestModel
tsv_prediction = tsv_pipelineModel.transform(test_df)
tsv_auc = evaluator.evaluate(tsv_prediction)
tsv_auc
0.6469125299137065
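To see which grid combination won, the best model's hyperparameters can be read back. A sketch, assuming Spark 3.x where tree models expose param getters (on Spark 2.x, go through bestModel._java_obj instead):
# Winning hyperparameters of the best decision tree (Spark 3.x getters)
print(bestModel.getImpurity(), bestModel.getMaxDepth(), bestModel.getMaxBins())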
Using Cross-Validation
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, cv])
cv_pipelineModel = cv_pipeline.fit(train_df)
bestModel = cv_pipelineModel.stages[3].bestModel
bestModel
cv_prediction = cv_pipelineModel.transform(test_df)
cv_auc = evaluator.evaluate(cv_prediction)
cv_auc
0.6469125299137065
Prediction with a Random Forest
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
rfpipeline = Pipeline(stages=[stringIndex, onehoter, assembler, rf])
rfpipelineModel = rfpipeline.fit(train_df)
rf_prediction = rfpipelineModel.transform(test_df)
evaluator.evaluate(rf_prediction)
0.7437341450211594
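Even untuned, the random forest clearly beats the tuned decision tree. The same grid-search machinery applies to it as well; a sketch with illustrative parameter values:
rf_paramGrid = (ParamGridBuilder()
                .addGrid(rf.numTrees, [20, 50, 100])
                .addGrid(rf.maxDepth, [5, 10, 15])
                .build())
rf_cv = CrossValidator(estimator=rf, evaluator=evaluator,
                       estimatorParamMaps=rf_paramGrid, numFolds=3)
rf_cv_pipeline = Pipeline(stages=[stringIndex, onehoter, assembler, rf_cv])
rf_cv_pipelineModel = rf_cv_pipeline.fit(train_df)
evaluator.evaluate(rf_cv_pipelineModel.transform(test_df))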
The training data train.tsv can be downloaded from: https://share.weiyun.com/qGM0bxwT