首先是启动spark
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType,StringType,IntegerType
from pyspark.sql.functions import col
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
# Create (or reuse) a SparkSession with Hive support so spark.sql() below can query Hive tables.
spark = SparkSession.builder.appName("pyspark_example").enableHiveSupport().getOrCreate()
然后使用spark读取sql或者csv,查看一下特征的名称
# Load the raw data from Hive — assumes a table named t_user exists; TODO confirm.
data_raw = spark.sql("""select * from t_user""")
# Inspect the column names (notebook-style bare expression; has no effect when run as a script).
data_raw.columns
然后开始对特征进行处理,第一步就是去除我们不会使用到的特征,也可以直接在sql中进行处理
# Columns that will not be used for training; remove them up front.
drop_feature = ['height', 'weight']
data_raw = data_raw.drop('height', 'weight')
然后更改数据类型,这里把连续变量的值都更改为double类型,并且把空值填充为0,如果先填充为0,再更改数据类型会有问题,有些值不会被替换成0,可能是因为没有被识别出来。但是如果先更改数据类型,就会被识别为double,fillna的时候就可以被替换了。
# Continuous features: cast to double FIRST, then fill nulls with 0.0.
# (Casting before fillna ensures the null values are recognized and replaced.)
feature_list = ['age', 'money', 'lat', 'lng', 'score', 'days']
for numeric_col in feature_list:
    data_raw = data_raw.withColumn(numeric_col, col(numeric_col).cast(DoubleType()))
data_raw = data_raw.fillna(0.0, subset=feature_list)
连续数据处理完之后,接着处理类别变量,也就是离散的值,例如,性别,是否安装app等等。我们要做两步处理,第一步是处理空值,第二步是对离散值处理
- 空值填充
# Categorical features: cast to string, then mark missing values with the sentinel "999".
onehot_list = ['app', 'gender']
for cat_col in onehot_list:
    data_raw = data_raw.withColumn(cat_col, col(cat_col).cast(StringType()))
data_raw = data_raw.fillna("999", subset=onehot_list)
- 离散处理,onehot编码
# For each categorical column: index the string labels (handleInvalid="keep" reserves
# an extra bucket for labels unseen at fit time), then one-hot encode the index
# (dropLast=False keeps all categories instead of dropping the reference level).
indexers = []
encoders = []
for cat_col in onehot_list:
    indexer = StringIndexer(inputCol=cat_col, outputCol='{}_indexed'.format(cat_col)).setHandleInvalid("keep")
    indexers.append(indexer)
    encoders.append(OneHotEncoder(dropLast=False, inputCol=indexer.getOutputCol(),
                                  outputCol="{0}_encoded".format(indexer.getOutputCol())))
# Merge the continuous columns and the one-hot vectors into a single "features" vector.
assembler1 = VectorAssembler(inputCols=feature_list + [enc.getOutputCol() for enc in encoders], outputCol="features")
好好解释一下,StringIndexer是把离散的类别变量编码成数值,默认按照出现频率从高到低编码:出现次数最多的类别编码为0,依次类推。例如有一个年纪类别,包含老年、中年、少年三种类型,老年出现10次,中年出现100次,少年出现80次,那么编码就是{中年:0, 少年:1, 老年:2}
然后我们使用onehot进行编码
中年 | 1 | 0 | 0 |
少年 | 0 | 1 | 0 |
老年 | 0 | 0 | 1 |
这里输出直接是一个向量。最后再把所有训练包含的特征通过VectorAssembler合并成一个向量。
建立一个机器学习的管道
# Chain indexing -> one-hot encoding -> vector assembly into one feature pipeline.
stages = indexers + encoders + [assembler1]
pipeline = Pipeline(stages=stages)
feature_model = pipeline.fit(data_raw)
train_data = feature_model.transform(data_raw)
# Keep only the row id, the assembled feature vector, and the target label.
train = train_data.select('passenger_id', 'features', 'label')
最后就是训练模型了,这里使用gridsearch进行搜索
# Base estimator; numTrees/maxDepth set here are overridden by the grid below.
RFclassifier = RandomForestClassifier(numTrees=100, featuresCol='features', labelCol="label", seed=7)
# Hyper-parameter search space: 6 x 6 = 36 candidate configurations.
grid = (ParamGridBuilder()
        .addGrid(RFclassifier.numTrees, [100, 120, 140, 160, 180, 200])
        .addGrid(RFclassifier.maxDepth, [10, 12, 14, 16, 18, 20])
        .build())
# Default metric is areaUnderROC; ranking by 'probability' gives the same ROC ordering.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='label')
# 3-fold cross-validation, evaluating up to 6 models in parallel.
cv = CrossValidator(estimator=RFclassifier,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3,
                    parallelism=6)
# Run the cross-validated grid search over the training set.
cvModel = cv.fit(train)
# NOTE(review): scoring on the training data itself — use a held-out set to measure generalization.
prediction = cvModel.transform(train)
# Pair each parameter combination with its average cross-validation metric for inspection.
metric_name = cvModel.getEvaluator().getMetricName()
res = []
for param_map, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
    row = {metric_name: metric}
    for param, value in param_map.items():
        row[param.name] = value
    res.append(row)
print(res)
得到最优参数之后,我再次手动训练了模型,然后进行预测。其实 cvModel.bestModel 就是交叉验证选出的最优模型,可以直接用它 transform 进行预测,无需重新训练。
# Retrain one model with the best hyper-parameters found by the grid search.
# Bug fix: the original referenced undefined names `train_df`/`test_df`
# (NameError); the only DataFrame built above is `train`. Substitute a real
# held-out DataFrame for scoring once one is available.
# NOTE: cvModel.bestModel already holds the best fitted model — refitting is optional.
rf = RandomForestClassifier(numTrees=100, maxDepth=16, featuresCol='features', labelCol="label", seed=7).fit(train)
rf_pred = rf.transform(train)
Spark入门:标签和索引的转化:StringIndexer- IndexToString-VectorIndexer