上一次发的那个回归是线性回归,这次给大家发一下逻辑回归
原数据
from pyspark.mllib.linalg import Vectors,Vector
from pyspark import SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.python.pyspark.shell import spark
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
sc=SparkContext.getOrCreate()
train_data=sc.textFile("hdfs://master:9000/RunorWalk.csv")
def GetParts(line):
parts = line.split(',')
return parts[0],parts[1],parts[2],parts[3],parts[4],parts[5],parts[6]
header = train_data.first()
train_data = train_data.filter(lambda row:row != header)
train = train_data.map(lambda line: GetParts(line))
df = spark.createDataFrame(train,["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z","activity"])
df.show()
df = df.withColumn("acceleration_x", df["acceleration_x"].cast(FloatType()))
df = df.withColumn("acceleration_y", df["acceleration_y"].cast(FloatType()))
df = df.withColumn("acceleration_z", df["acceleration_z"].cast(FloatType()))
df = df.withColumn("gyro_x", df["gyro_x"].cast(FloatType()))
df = df.withColumn("gyro_y", df["gyro_y"].cast(FloatType()))
df = df.withColumn("gyro_z", df["gyro_z"].cast(FloatType()))
df = df.withColumn("activity", df["activity"].cast(FloatType()))
assembler = VectorAssembler(inputCols=["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z"],outputCol="features")
output = assembler.transform(df)
label_features = output.select("features", "activity").toDF('features','label')
label_features.show(truncate=False)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(label_features)
df1 = spark.createDataFrame([(-1.0602,-0.282,-0.0618,0.8069,-0.9107,1.6153,1)],["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z","activity"])
df1.show()
test_assembler = VectorAssembler(inputCols=["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z"],outputCol="features")
test_output = test_assembler.transform(df1)
test_label_features = test_output.select("features", "activity").toDF('features','label')
test_label_features.show(truncate=False)
# df1 = label_features.head(5)
# df1 = spark.createDataFrame(df1)
# df1.show()
prediction = lrModel.transform(test_label_features)
result = prediction.select("features", "label", "probability","prediction").collect()
print(result)
#划分特征量与标签
#需要预测的语句