# The definition of random forest is not repeated here; interested readers can refer to the linked article.
import findspark

# findspark.init() must run BEFORE any pyspark import so that Spark's
# libraries are placed on sys.path; the original imported pyspark first,
# which only works if SPARK_HOME happens to be configured already.
findspark.init()

from numpy import frompyfunc  # noqa: F401 -- unused here; kept in case later code relies on it
from pyspark.ml import classification  # noqa: F401 -- unused here; kept for the same reason
from pyspark.sql.functions import spark_partition_id  # noqa: F401 -- unused here
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Build (or reuse) a local Spark session for this script.
spark = SparkSession.builder.appName('test1').getOrCreate()

# NOTE(review): machine-specific absolute path — consider making this a
# command-line argument if the script is shared.
df = spark.read.csv(
    'D:/PythonCode/Python_dataming/pythons/.vscode/机器学习/affairs.csv',
    inferSchema=True,
    header=True,
)

print(df.count())
# printSchema() prints directly and returns None; wrapping it in print()
# (as the original did) emits a spurious "None" line.
df.printSchema()

# Combine the predictor columns into a single 'features' vector column.
df_ass = VectorAssembler(
    inputCols=['rate_marriage', 'age', 'yrs_married', 'children', 'religious'],
    outputCol='features',
)
# The original constructed the assembler but never applied it, and the bare
# trailing `df` expression was a no-op — actually run the transform.
df = df_ass.transform(df)
df.show(5)