本文记录了用pyspark构建一个简单的模型的过程。
1. 读取数据集
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("Spark_mllearn_example").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.master("local").appName("Spark_mllearn_example").config("", "").getOrCreate()
dpath = '/Users/huoshirui/Desktop/Spark/'
df = spark.read.csv(dpath + 'spark_mllearn_test.csv', header=True)
数据集如下图:
2. 将数据集转换成可以用于模型使用的features/label的形式
df = df.withColumn('c2', df['c2'].cast('double'))\
.withColumn('c3', df['c3'