贝叶斯分类:在做算法时数据不能为负我就将原来数据中的负号去掉导致结果预测失败
优点:在数据较少的情况下仍然有效,可以处理多类别问题。
缺点:对于输入数据的准备方式较为敏感。 适用数据类型:标称型数据。
#1.导包
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.linalg import Vectors,Vector
from pyspark import SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.python.pyspark.shell import spark
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
#2.读取hdfs中的文件
sc=SparkContext.getOrCreate()
train_data=sc.textFile("hdfs://master:9000/qw.csv")
def GetParts(line):
parts = line.split(',')
return parts[0],parts[1],parts[2],parts[3],parts[4],parts[5],parts[6]
header = train_data.first() #获取第一行内容
train_data = train_data.filter(lambda row:row != header) #删除第一行数据
train = train_data.map(lambda line: GetParts(line))
df = spark.createDataFrame(train,["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z","activity"])#将数据转化为DataFrame格式
df.show()
#将string类型转化为浮点型
df = df.withColumn("acceleration_x", df["acceleration_x"].cast(FloatType()))
df = df.withColumn("acceleration_y", df["acceleration_y"].cast(FloatType()))
df = df.withColumn("acceleration_z", df["acceleration_z"].cast(FloatType()))
df = df.withColumn("gyro_x", df["gyro_x"].cast(FloatType()))
df = df.withColumn("gyro_y", df["gyro_y"].cast(FloatType()))
df = df.withColumn("gyro_z", df["gyro_z"].cast(FloatType()))
df = df.withColumn("activity", df["activity"].cast(FloatType()))
#将数据划分为特征和标签
assembler = VectorAssembler(inputCols=["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z"],outputCol="features")
output = assembler.transform(df)
label_features = output.select("features", "activity").toDF('features','label')
label_features.show(truncate=False)
#贝叶斯
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# 训练模型
model = nb.fit(label_features)
df1 = spark.createDataFrame([(-1.0602,-0.282,-0.0618,0.8069,-0.9107,1.6153,1)],["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z","activity"])
df1.show()
test_assembler = VectorAssembler(inputCols=["acceleration_x","acceleration_y","acceleration_z","gyro_x","gyro_y","gyro_z"],outputCol="features")
test_output = test_assembler.transform(df1)
test_label_features = test_output.select("features", "activity").toDF('features','label')
test_label_features.show(truncate=False)
# df1 = label_features.head(5)
# df1 = spark.createDataFrame(df1)
# df1.show()
# compute accuracy on the test set
result = model.transform(test_label_features)
print(result.collect())
predictionAndLabels = result.select("prediction", "label").collect()
print(predictionAndLabels)