Implementing LR (Logistic Regression) with PySpark

1. Import packages, configure the Spark client, and load data

# -*- coding: utf-8 -*-

# 1.1 With the next two lines, the script can be run directly via `python aa.py`
import findspark
findspark.init()

import datetime
import os
import sys  # needed by logging.StreamHandler(sys.stdout) in section 10
import logging
import numpy as np
import pandas as pd
from pyhive import hive

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.functions import current_date,datediff,udf
from pyspark.sql.types import StructField, StringType, FloatType, StructType, IntegerType, LongType

from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.ml.feature import VectorAssembler, StringIndexer, QuantileDiscretizer, RFormula
from pyspark.ml.feature import MaxAbsScaler, StandardScaler, ChiSqSelector, OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline,PipelineModel


if __name__ == "__main__":
    # 1.2 Configure the Spark client
    spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .master("local[*]") \
        .appName("test_lr") \
        .config('spark.driver.maxResultSize', '10g') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '3g') \
        .getOrCreate()

    sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    
    # 1.3 Load data
    df = spark.read.options(inferSchema=True, header=True, delimiter='\t').csv('file:///data/kouhj/test_pysprk/data.csv')
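    # Optional sanity check (an addition, not part of the original flow): confirm the inferred schema
    # and peek at a few rows; the rest of the walkthrough assumes columns such as gender, is_new_user
    # and today_active exist in data.csv.
    df.printSchema()
    df.show(5, truncate=False)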

2. Type conversion and one-hot encoding

    # 2.1 String indexing: convert the categorical column to numeric indices
    stringIndexer = StringIndexer(inputCol="gender", outputCol="gender_string")
    model = stringIndexer.fit(df)
    df = model.transform(df)  
 
    # 2.2 One-hot encode the indexed column
    ohe = OneHotEncoder(inputCol="gender_string", outputCol="gender_Vector")
    ohe.setDropLast(False)
    # ohe1 = OneHotEncoder().setInputCol("gender_string").setOutputCol("gender_Vector").setDropLast(False)
    ohe_model = ohe.fit(df)       # fit() returns a OneHotEncoderModel
    df = ohe_model.transform(df)  # the fitted model (not the estimator) performs the transform
    df.show(10)

3. Train/test split

    # 3. Split the dataset
    trainingData, testData = df.randomSplit([0.8, 0.2], seed=1234)

4. Build the model pipeline

    # 4. Build the model pipeline
    featuresArray = ['is_new_user', 'gender_Vector']  # assemble the one-hot vector, not the raw string column
    assembler = VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
    scaler = StandardScaler(inputCol="features", outputCol="features_scaled", withMean=True, withStd=True)
    # Logistic regression; this class supports both multinomial (softmax) and binomial logistic regression
    lr = LogisticRegression(maxIter=100, elasticNetParam=0.80, regParam=0.001, labelCol="today_active", featuresCol="features_scaled",
        predictionCol="prediction")
    # lr = LogisticRegression().setLabelCol("today_active").setFeaturesCol("features")
    lrPipeline = Pipeline().setStages([assembler, scaler, lr])

5. Model training and prediction

    # 5. Train the logistic regression pipeline
    lrpipeline_model = lrPipeline.fit(trainingData)
    trainingPredictions = lrpipeline_model.transform(trainingData)
    # trainingPredictions.select("uid", "features", "features_scaled", "is_agree", "prediction", "probability").show(10, False)

6. Inspect model parameters

    # 6. Inspect model parameters
    print('LR model:', lrpipeline_model.stages[-1])
    # LR coefficients
    coefficients = DenseVector(lrpipeline_model.stages[-1].coefficients).values
    # LR intercept
    intercept = lrpipeline_model.stages[-1].intercept
    print('Scaler:', lrpipeline_model.stages[-2])
    # StandardScaler mean
    mean = DenseVector(lrpipeline_model.stages[-2].mean).values
    # StandardScaler standard deviation
    scale = DenseVector(lrpipeline_model.stages[-2].std).values
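    # A small sketch (an addition; it assumes the assembled feature order follows featuresArray, with
    # the one-hot column expanded into one slot per category) pairing each feature slot with its
    # learned weight and the scaler statistics extracted above.
    print('assembler inputs:', lrpipeline_model.stages[0].getInputCols())
    for idx, (w, m, s) in enumerate(zip(coefficients, mean, scale)):
        print('slot %d: coefficient=%.6f, mean=%.6f, std=%.6f' % (idx, w, m, s))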

7. Reproduce the score with the LR formula
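
The loop below simply replays in NumPy what the fitted pipeline does for a single sample x: it standardizes each feature with the scaler's mean and std, then applies the logistic function to the linear score. As a sketch of the intended computation for the binomial LR trained above:

    p(y=1 | x) = 1 / (1 + exp(-(w · ((x - mean) / std) + b)))

where w are the LR coefficients and b is the intercept extracted in section 6.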

    # 7. Reproduce the score manually with the LR formula
    threshold = 0.8
    scale = [0.000001 if x <= 0.0000001 else x for x in scale]  # guard against division by a (near) zero std
    training_df_pd = assembler.transform(trainingData.limit(10)).toPandas()
    for i in range(10):
        print(training_df_pd['features'][i])
        training_df_pd_des = DenseVector(training_df_pd['features'][i])
        print(training_df_pd_des)
        y_pre = 1.0 / (1.0 + np.exp(-(np.dot(((training_df_pd_des - mean) / scale), coefficients) + intercept)))
        if y_pre >= threshold:
            print("1")
        else:
            print("0")
        print(round(y_pre, 16))

8. Model evaluation

    # 8. Model evaluation
    secondelement = udf(lambda v: float(v[1]), FloatType())  # extract P(label=1) from the probability vector
    trainingPredictions.select(['today_active', 'prediction', secondelement('probability').alias('prob')]).show(10, False)

    evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC")\
        .setRawPredictionCol("rawPrediction").setLabelCol("today_active")
    # CV = CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setNumFolds(3)
    AUC = evaluator.evaluate(trainingPredictions)
    print("The Area Under ROC of LogisticRegression:", AUC)

    evaluatorX = MulticlassClassificationEvaluator().setMetricName("accuracy").setLabelCol("today_active")
    # metricName="weightedPrecision"
    # metricName="weightedRecall"
    ACC = evaluatorX.evaluate(trainingPredictions)
    print("The Accuracy of LogisticRegression:", ACC)

    # pmmlBuilder = PMMLBuilder(sc, trainingData, lrpipeline_model)
    # pmmlBuilder.buildFile("Logistic.pmml")
    # spark.stop()  # stop the session only at the very end, after the model is saved and the grid search has run

9. Save and load the model

    # 9.1 Save the model
    lrpipeline_model.write().overwrite().save(path+"model/XXModel")
    # 9.2 Load the model
    lrpipeline_model = PipelineModel.load(path+"model/XXModel")
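    # A minimal usage sketch (an addition): the reloaded PipelineModel contains the whole chain
    # (assembler -> scaler -> LR), so data with the raw feature columns can be scored directly.
    reloaded_predictions = lrpipeline_model.transform(testData)
    reloaded_predictions.select("prediction", "probability").show(5, False)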

10. Set up log handlers

    # 10.1 Get a logger instance; an empty name returns the root logger
    logger = logging.getLogger("LRModel")
    logger.setLevel(logging.INFO)  # minimum output level; the default is WARN

    # 10.2 File handler
    file_handler = logging.FileHandler("log/LRModel.log")

    # 10.3 Output format for the log records
    formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
    file_handler.setFormatter(formatter)

    # 10.4 Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.formatter = formatter  # the formatter attribute can also be assigned directly

    # 10.5 Attach the handlers to the logger; custom handlers can route logs elsewhere
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # 10.6 Write a log record
    logger.info("AUC (The Area Under ROC of LogisticRegression): {}".format(AUC))

    # 10.7 Remove the file handler
    logger.removeHandler(file_handler)

11. Grid search

    # 11. Grid search
    ParamGrid = ParamGridBuilder().addGrid(lr.maxIter, [12, 18]).addGrid(lr.regParam, [0.001, 0.005]).build()
    CV = CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setEstimatorParamMaps(ParamGrid).setNumFolds(3)
    model = CV.fit(trainingData)
    # Parameters of the best model
    bestModel = model.bestModel
    lrModel = bestModel.stages[-1]  # the LR stage is the last stage of the pipeline (assembler, scaler, lr)
    print("best regParam : ", lrModel.explainParam(lrModel.regParam))
    print("best maxIter : ", lrModel.explainParam(lrModel.maxIter))

Appendix: miscellaneous snippets

# 1. Load data from Hive
sparkDF = hive_context.sql(sql_str)
# Compute the registration-date difference in days
sparkDF = sparkDF.withColumn('new_reg_dt', datediff(current_date(), sparkDF.reg_dt).alias('diff'))

# 2. Type conversion
lis = sparkDF.columns
lis.remove('pt')
for col in lis:
    sparkDF = sparkDF.withColumn(col, sparkDF[col].cast(FloatType()))
sparkDF = sparkDF.fillna(0)

# 3. Training/test data and prediction data (split by date partition)
predict_data = sparkDF.filter(sparkDF['pt'] == pre_date)
train_test_data = sparkDF.filter(sparkDF['pt'].isin(train_date_list))

# 4. Predict with the logistic regression model
predictions = model.transform(predict_data)
pre = predictions.select("uid", "today_active", "prediction", "probability", "pt").collect()

# 5. Logistic regression (without a pipeline)
log_reg = LogisticRegression(labelCol='today_active').fit(training_df)
train_results = log_reg.evaluate(training_df).predictions
print('{}{}'.format('Training accuracy: ', log_reg.evaluate(training_df).accuracy))

# 6. Discretize continuous features into quantile buckets
bucketizer1 = QuantileDiscretizer(numBuckets=5,
    inputCol='age', outputCol='age_bucket',  # the output column must differ from the input column
    relativeError=0.01, handleInvalid='error')
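# Usage sketch (assumption: sparkDF loaded above has a numeric 'age' column): QuantileDiscretizer is
# an Estimator, so fit() learns the bucket boundaries and the returned Bucketizer adds the new column.
sparkDF = bucketizer1.fit(sparkDF).transform(sparkDF)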

# Chi-square feature selector for picking the most relevant features: selectorType='numTopFeatures'
# keeps the N highest-scoring features, while selectorType='fpr' keeps features whose chi-square test
# p-value is below the threshold (0.05 here).
chiSqSelector = ChiSqSelector(featuresCol="features", selectorType="fpr", fpr=0.05,
    outputCol="selectedFeatures", labelCol="is_agree")
