object Main {
  /**
   * Sentiment analysis demo: read negative/positive corpora, extract
   * TF-IDF features, train a Naive Bayes binary classifier and report
   * its accuracy on a held-out split.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("SA")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    spark.sparkContext.setLogLevel("WARN") // reduce log noise
    import spark.implicits._
    val rand = new Random()

    // Read one corpus file and attach the class label plus a random key
    // that is only used below to shuffle the merged dataset.
    // (Extracted helper — the original duplicated this code for neg/pos.)
    def readLabeled(path: String, label: Int) =
      spark.read.textFile(path).map { line =>
        // split(" ") yields empty strings on consecutive spaces; drop them.
        // (The original `!_.equals(" ")` could never match and removed nothing.)
        (line.split(" ").filter(_.nonEmpty), label, rand.nextDouble())
      }.toDF("words", "value", "random")

    val neg = readLabeled("neg.txt", 0)
    val pos = readLabeled("pos.txt", 1)
    // union, not join: we stack rows from the two corpora, then shuffle
    // them by sorting on the random column.
    val data = neg.union(pos).sort("random")
    //data.show(false)
    //println(neg.count(),data.count())

    // Text feature extraction: term frequencies hashed into a fixed-size
    // vector, then reweighted by inverse document frequency.
    val hashingTf = new HashingTF()
      .setInputCol("words")
      .setOutputCol("hashing")
      .transform(data)
    val idfModel = new IDF()
      .setInputCol("hashing")
      .setOutputCol("tfidf")
      .fit(hashingTf)
    val transformedData = idfModel
      .transform(hashingTf)
    val Array(training, test) = transformedData
      .randomSplit(Array(0.7, 0.3))

    // Binary classification on the extracted features.
    // The classifier is swappable — any ml.classification estimator works here.
    val bayes = new NaiveBayes()
      .setFeaturesCol("tfidf") // X
      .setLabelCol("value")    // y
      .fit(training)
    val result = bayes.transform(test) // evaluate on the held-out split
    result.show(false)

    // Evaluate model accuracy on the test predictions.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("value")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(result)
    println(s"""accuracy is $accuracy""")
    // TODO: restructure the HashingTF -> IDF -> NaiveBayes stages as a Pipeline.

    spark.stop() // release the local Spark context
  }
}
// ===== 推荐系统 (Recommendation system) =====
object Main {
  /**
   * Recommender-system demo: train an ALS collaborative-filtering model
   * on MovieLens ratings, evaluate RMSE on a held-out split, and produce
   * top-10 recommendations for a sample user.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RS")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Parse one tab-separated MovieLens line: "user \t item \t rating \t ...".
    val parseRating = (line: String) => {
      val fields = line.split("\t")
      Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
    }
    import spark.implicits._
    val data = spark.read.textFile("u.data")
      .map(parseRating)
      .toDF("userID", "itemID", "rating")
    //data.show(false)

    // Fixed the original local-variable typo: `traing` -> `training`.
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2))
    val als = new ALS()
      .setMaxIter(20)
      .setUserCol("userID")
      .setItemCol("itemID")
      .setRatingCol("rating")
      .setRegParam(0.01) // regularization strength
    val model = als.fit(training)
    // Cold-start strategy: drop test rows whose user/item never appeared in
    // training, so their NaN predictions don't poison the RMSE metric.
    model.setColdStartStrategy("drop")
    val predictions = model.transform(test)
    //predictions.show(false) // predicts rating for each (userID, itemID)

    // Top-10 recommendations for a sample MovieLens user (id 196).
    val users = spark.createDataset(Array(196)).toDF("userID")
    //users.show(false)
    model.recommendForUserSubset(users, 10).show(false)

    // Model evaluation: root-mean-square error of predicted vs. actual rating.
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error is $rmse \n")

    // Model persistence:
    //   save: model.save("./xxx")
    //   load: val model = ALSModel.load("xxxx")

    spark.stop() // release the local Spark context
  }
}