在使用 Spark ML 的算法 API 进行机器学习建模时,常常会遇到一个问题:特征经向量化(VectorAssembler)等操作后,无法再还原为原来单列的数据形式。为此开发了一个自定义的 Python 方法,它可以与 Spark 原生 ML API 一起组装成 pipeline 使用,并以模型形式保存和加载。具体实现过程见以下示例代码:
from pyspark import keyword_only
from pyspark.ml import Model, Pipeline, PipelineModel
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.param import TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasOutputCols, Param, Params
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.ml.util import Identifiable, MLReadable, MLWritable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import DoubleType

from sparktorch import PysparkPipelineWrapper
from sparktorch.pipeline_util import PysparkReaderWriter
# Obtain (or create) a Hive-enabled Spark session, then load the demo table.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

df = spark.read.table('hive_table_name')
class SplitCol(Model, HasInputCol, HasOutputCols, PysparkReaderWriter, MLReadable, MLWritable, Identifiable):
    """Pipeline-compatible model that splits a vector column back into
    individual double-typed columns, undoing a VectorAssembler + scaler step.

    Params (set via constructor / ``setParams``):
        inputCol: name of the vector column to unpack (e.g. the scaler output).
        outputCols: names of the original feature columns to restore.
        kepCol: when False (default) the original feature columns are dropped
            and the unpacked values take their names; when True the originals
            are kept and the unpacked values are added as ``scaled_<col>``.
    """

    # NOTE(review): "kepCol" looks like a typo for "keepCol", but renaming it
    # would break existing callers and previously saved models, so it is kept.
    kepCol = Param(Params._dummy(), "kepCol", "", typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(
            self,
            inputCol=None,
            outputCols=None,
            kepCol=False
    ):
        super().__init__()
        self._setDefault(
            inputCol=None,
            outputCols=None,
            kepCol=False
        )
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(
            self,
            inputCol=None,
            outputCols=None,
            kepCol=False
    ):
        """Set all params from keyword arguments; returns self."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, df):
        out_cols = self.getOutputCols()
        in_col = self.getInputCol()
        kep_col = self.getOrDefault(self.kepCol)

        if not kep_col:
            # Overwrite mode: drop the original feature columns so the
            # unpacked vector values can be re-added under the same names.
            for col in out_cols:
                df = df.drop(col)
            new_features = out_cols
        else:
            # Keep mode: originals stay; unpacked values get a prefix.
            new_features = ['scaled_' + col for col in out_cols]

        # Extend the current schema with one DoubleType column per feature.
        schema = df.schema
        for col in new_features:
            schema = schema.add(col, DoubleType(), True)

        existing = df.columns
        # Unpack the vector column into scalar values. Use the configured
        # inputCol (bug fix: previously the column name 'scaled_feature' was
        # hard-coded here, silently breaking any other inputCol). DenseVector
        # delegates .tolist() to its underlying numpy array.
        rdd = df.rdd.map(lambda row: [row[c] for c in existing] + row[in_col].tolist())

        # Resolve the active session instead of relying on a module-level
        # `spark` global, so a saved model still works after being reloaded
        # in a fresh session.
        session = SparkSession.builder.getOrCreate()
        df = session.createDataFrame(rdd, schema)

        df = df.drop(in_col)
        # Drop the intermediate assembler output so a downstream stage can
        # re-create a column of the same name. NOTE(review): this name is
        # tied to the example pipeline's VectorAssembler outputCol.
        df = df.drop('assemble_feature')
        return df
# Columns that get assembled, scaled, split back out, and reassembled.
feature_list = ['col1', 'col2', 'col3']

# Stage 1: pack the raw feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_list, outputCol='assemble_feature')
# Stage 2: scale the assembled vector into the [0, 1] range.
scaler = MinMaxScaler(min=0, max=1, inputCol='assemble_feature',
                      outputCol='scaled_feature')
# Stage 3: split the scaled vector back into the original single columns.
splitter = SplitCol(
    inputCol='scaled_feature',
    outputCols=feature_list,
    kepCol=False)
# Stage 4: re-assemble the (now scaled) single columns for the regressor.
reassembler = VectorAssembler(inputCols=feature_list, outputCol='assemble_feature')
# Stage 5: linear regression on the scaled, reassembled features.
regressor = LinearRegression(featuresCol='assemble_feature', labelCol='medv', predictionCol="prediction",
                             maxIter=2, regParam=0)

# Fit the full pipeline, persist it, then reload and score with it to show
# that the custom SplitCol stage round-trips through save/load.
fitted = Pipeline(stages=[assembler, scaler, splitter, reassembler, regressor]).fit(df)
fitted.write().overwrite().save('hdfs_path')
reloaded = PysparkPipelineWrapper.unwrap(PipelineModel.load('hdfs_path'))
scored = reloaded.transform(df)
scored.show()
print('***************************************')