在实际算法训练过程中,PySpark ML 中的标准算法包有时并不能解决我们遇到的问题,需要自定义算法并将其封装为可复用的组件。具体封装案例如下:
from argparse import ArgumentParser, RawDescriptionHelpFormatter, Namespace
from dataclasses import dataclass
import xmltodict
import textwrap
from pyspark.ml.feature import MinMaxScaler, MinMaxScalerModel, VectorAssembler
from pyspark.sql import SparkSession, DataFrame
from pysparklib.pmml import PMMLUtil
from pyspark.sql.types import DoubleType
from pyspark import keyword_only
from pyspark.ml.param.shared import HasOutputCol, HasOutputCols, Param, Params, HasInputCol, HasInputCols, HasPredictionCol, HasLabelCol
from pyspark.ml import Pipeline, PipelineModel
from sparktorch.pipeline_util import PysparkReaderWriter
from sparktorch import PysparkPipelineWrapper
from pyspark.ml import Model
from pyspark.ml.base import Estimator
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.ml.util import Identifiable, MLReadable, MLWritable
from pyspark.ml.param import TypeConverters
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import functions as F
import os
from datetime import datetime
import pandas as pd
import numpy as np
import dill
import codecs
import toad
class CombinerModel(Model, HasInputCol, HasOutputCol, HasLabelCol, Pysp