1. Imports, Spark Client Configuration, and Data Loading
# -*- coding: utf-8 -*-
# 1.1 With the two lines below, the script can be executed directly via `python aa.py`
import findspark
findspark.init()
import datetime
import os
import sys
import logging
import numpy as np
import pandas as pd
from pyhive import hive
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.functions import current_date, datediff, udf
from pyspark.sql.types import StructField, StringType, FloatType, StructType, IntegerType, LongType
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.ml.feature import VectorAssembler, StringIndexer, QuantileDiscretizer, RFormula
from pyspark.ml.feature import MaxAbsScaler, StandardScaler, ChiSqSelector, OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline, PipelineModel
if __name__ == "__main__":
    # 1.2 Configure the Spark client
    spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .master("local[*]") \
        .appName("test_lr") \
        .config('spark.driver.maxResultSize', '10g') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '3g') \
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    # 1.3 Load the data (tab-delimited CSV with a header row; schema inferred)
    df = spark.read.options(inferSchema=True, header=True, delimiter='\t').csv('file:///data/kouhj/test_pysprk/data.csv')
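    # Optional sanity check (a sketch): confirm the inferred schema and eyeball a few rows
    # before transforming anything.
    df.printSchema()
    df.show(5, False)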
2. Type Conversion and One-Hot Encoding
# 2.1 Convert the string column to a numeric index
stringIndexer = StringIndexer(inputCol="gender", outputCol="gender_string")
model = stringIndexer.fit(df)
df = model.transform(df)
# 2.2 One-hot encode the indexed column (fit the encoder, then transform with the fitted model)
ohe = OneHotEncoder(inputCol="gender_string", outputCol="gender_Vector")
ohe.setDropLast(False)
# ohe1 = OneHotEncoder().setInputCol("gender_string").setOutputCol("gender_Vector").setDropLast(False)
ohe_model = ohe.fit(df)
df = ohe_model.transform(df)
df.show(10)
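# A toy illustration (separate from the pipeline above) of what steps 2.1-2.2 produce;
# StringIndexer assigns indices by descending frequency, and with dropLast=False the
# one-hot vector keeps one slot per category:
demo = spark.createDataFrame([("m",), ("f",), ("f",)], ["gender"])
demo = StringIndexer(inputCol="gender", outputCol="gender_string").fit(demo).transform(demo)
demo = OneHotEncoder(inputCol="gender_string", outputCol="gender_Vector", dropLast=False) \
    .fit(demo).transform(demo)
demo.show()  # "f" -> index 0.0 -> (2,[0],[1.0]); "m" -> index 1.0 -> (2,[1],[1.0])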
3. Train/Test Split
# 3. Split the dataset (fixed seed for reproducibility)
trainingData, testData = df.randomSplit([0.8, 0.2], seed=1234)
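# Optional sanity check (sketch): split sizes and label balance
# (assumes the label column `today_active`, as used in the steps below).
print(trainingData.count(), testData.count())
trainingData.groupBy("today_active").count().show()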
4. Building the Model Pipeline
# 4. Assemble features, scale them, and fit LR as one pipeline
featuresArray = ['is_new_user', 'gender_Vector']  # use the one-hot vector from step 2; VectorAssembler cannot take the raw string column
assembler = VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
scaler = StandardScaler(inputCol="features", outputCol="features_scaled", withMean=True, withStd=True)
# Logistic regression; this class supports both multinomial (softmax) and binomial logistic regression
lr = LogisticRegression(maxIter=100, elasticNetParam=0.80, regParam=0.001, labelCol="today_active", featuresCol="features_scaled",
predictionCol="prediction")
# lr = LogisticRegression().setLabelCol("today_active").setFeaturesCol("features")
lrPipeline = Pipeline().setStages([assembler, scaler, lr])
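# The family is chosen automatically (family="auto" resolves to binomial for a 0/1 label);
# it can be pinned explicitly, and explainParams() lists every tunable knob, which is
# handy before the grid search in step 11:
# lr.setFamily("binomial")
# print(lr.explainParams())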
5. Model Training and Prediction
# 5. Fit the pipeline and score the training set
lrpipeline_model = lrPipeline.fit(trainingData)
trainingPredictions = lrpipeline_model.transform(trainingData)
# trainingPredictions.select("uid", "features", "features_scaled", "is_agree", "prediction", "probability").show(10, False)
6. Inspecting Model Parameters
# 6. Inspect the fitted model's parameters
print('model', lrpipeline_model.stages[-1])
# LR coefficients
coefficients = DenseVector(lrpipeline_model.stages[-1].coefficients).values
# LR intercept
intercept = lrpipeline_model.stages[-1].intercept
print('scaler', lrpipeline_model.stages[-2])
# StandardScaler per-feature mean
mean = DenseVector(lrpipeline_model.stages[-2].mean).values
# StandardScaler per-feature standard deviation
scale = DenseVector(lrpipeline_model.stages[-2].std).values
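# Optional sketch: map each coefficient slot back to a feature name using the
# ML attribute metadata that VectorAssembler attaches to the "features" column.
attrs = trainingPredictions.schema["features"].metadata["ml_attr"]["attrs"]
idx_to_name = {a["idx"]: a["name"] for group in attrs.values() for a in group}
for i, w in enumerate(coefficients):
    print(idx_to_name.get(i, i), w)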
7. Simulating Scores with the Raw Formula
# 7. Reproduce the pipeline score by hand: sigmoid(w . (x - mean) / std + b)
threshold = 0.8
# guard against near-zero standard deviations before dividing
scale = [0.000001 if x <= 0.0000001 else x for x in scale]
training_df_pd = assembler.transform(trainingData.limit(10)).toPandas()
for i in range(10):
    print(training_df_pd['features'][i])
    training_df_pd_des = DenseVector(training_df_pd['features'][i])
    print(training_df_pd_des)
    y_pre = 1.0 / (1.0 + np.exp(-(np.dot(((training_df_pd_des - mean) / scale), coefficients) + intercept)))
    if y_pre >= threshold:
        print("1")
    else:
        print("0")
    print(round(y_pre, 16))
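# Optional cross-check (sketch): the hand-computed scores should match the positive-class
# probabilities the fitted pipeline reports on the same rows.
check = lrpipeline_model.transform(trainingData.limit(10)).select("probability").toPandas()
for i in range(len(check)):
    print(float(check['probability'][i][1]))  # P(today_active = 1)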
8. Model Evaluation
# 8. Evaluate the model
# UDF extracting P(label=1) from the probability vector; it must return FloatType, not StringType
secondelement = udf(lambda v: float(v[1]), FloatType())
trainingPredictions.select(['today_active', 'prediction', secondelement('probability').alias('prob')]).show(10, False)
evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC")\
.setRawPredictionCol("rawPrediction").setLabelCol("today_active")
# CV = CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setNumFolds(3)
AUC = evaluator.evaluate(trainingPredictions)
print("The Area Under ROC of LogisticRegression:", AUC)
evaluatorX = MulticlassClassificationEvaluator().setMetricName("accuracy").setLabelCol("today_active")
# metricName="weightedPrecision"
# metricName="weightedRecall"
ACC = evaluatorX.evaluate(trainingPredictions)
print("The Accuracy of LogisticRegression:", ACC)
# Exporting to PMML requires the third-party pyspark2pmml package:
# pmmlBuilder = PMMLBuilder(sc, trainingData, lrpipeline_model)
# pmmlBuilder.buildFile("Logistic.pmml")
9. Saving and Loading the Model
# 9.1 Save the fitted pipeline (`path` is an output-directory prefix assumed to be defined elsewhere)
lrpipeline_model.write().overwrite().save(path+"model/XXModel")
# 9.2 Load it back
lrpipeline_model = PipelineModel.load(path+"model/XXModel")
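# Optional sanity check (sketch): the reloaded model should score the held-out split
# exactly as the in-memory one did.
testPredictions = lrpipeline_model.transform(testData)
testPredictions.select("prediction", "probability").show(5, False)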
10. Setting Up Log Handlers
# 10.1 Get a logger instance; with an empty name this returns the root logger
logger = logging.getLogger("LRModel")
logger.setLevel(logging.INFO)  # lowest level that will be emitted; the default is WARN
# 10.2 File handler (make sure the log directory exists first)
os.makedirs("log", exist_ok=True)
file_handler = logging.FileHandler("log/LRModel.log")
# 10.3 Output format for the logger
formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
file_handler.setFormatter(formatter)
# 10.4 Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.formatter = formatter  # the formatter attribute can also be assigned directly
# 10.5 Attach the handlers; custom handlers can route output anywhere else as well
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# 10.6 Write a log record
logger.info("AUC (area under the ROC curve of the LogisticRegression): {}".format(AUC))
# 10.7 Detach the file handler
logger.removeHandler(file_handler)
11. Grid Search
# 11. Grid search over LR hyperparameters with 3-fold cross-validation
ParamGrid = ParamGridBuilder().addGrid(lr.maxIter, [12, 18]).addGrid(lr.regParam, [0.001, 0.005]).build()
CV = CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setEstimatorParamMaps(ParamGrid).setNumFolds(3)
model = CV.fit(trainingData)
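# Optional: inspect the mean CV metric (AUC here) for each parameter combination (sketch).
for params, metric in zip(ParamGrid, model.avgMetrics):
    print({p.name: v for p, v in params.items()}, metric)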
# Parameters of the best model; the LR stage is the last stage of the pipeline
bestModel = model.bestModel
lrModel = bestModel.stages[-1]
print("best regParam : ", lrModel.explainParam(lrModel.regParam))
print("best maxIter : ", lrModel.explainParam(lrModel.maxIter))
# stop the SparkSession once all work is done
spark.stop()
Appendix: Miscellaneous Snippets
# 1. Load data from Hive (`hive_context` and `sql_str` are assumed to be defined elsewhere)
sparkDF = hive_context.sql(sql_str)
# days elapsed since registration
sparkDF = sparkDF.withColumn('new_reg_dt', datediff(current_date(), sparkDF.reg_dt))
# 2. Cast every column except the partition column `pt` to float
lis = sparkDF.columns
lis.remove('pt')
for col in lis:
    sparkDF = sparkDF.withColumn(col, sparkDF[col].cast(FloatType()))
sparkDF = sparkDF.fillna(0)
# 3. Split by partition date (`pre_date` and `train_date_list` are assumed to be defined elsewhere)
predict_data = sparkDF.filter(sparkDF['pt'] == pre_date)
train_test_data = sparkDF.filter(sparkDF['pt'].isin(train_date_list))
# 4. Score new data with a fitted model
predictions = model.transform(predict_data)
pre = predictions.select("uid", "today_active", "prediction", "probability", "pt").collect()
# 5. Plain logistic regression without a pipeline (`training_df` assumed to be defined elsewhere)
log_reg = LogisticRegression(labelCol='today_active').fit(training_df)
train_results = log_reg.evaluate(training_df).predictions
print('{}{}'.format('training accuracy: ', log_reg.evaluate(training_df).accuracy))
# 6. Discretize a continuous column into quantile buckets
#    (outputCol must differ from inputCol, so write to a new column)
bucketizer1 = QuantileDiscretizer(numBuckets=5,
                                  inputCol='age', outputCol='age_bucket',
                                  relativeError=0.01, handleInvalid='error')
# 7. ChiSqSelector: chi-squared test for selecting important features. With
#    selectorType="fpr" it keeps features whose test p-value is below fpr=0.05;
#    the default selectorType "numTopFeatures" would instead keep the top-N
#    features (e.g. numTopFeatures=10 keeps the 10 most important) and ignore fpr.
chiSqSelector = ChiSqSelector(featuresCol="features", selectorType="fpr", fpr=0.05,
                              outputCol="selectedFeatures", labelCol="is_agree")