Machine Learning
1. Supervised learning
There is a training dataset of curated, well-formed, labeled data. Training produces an inference function, which is then applied to new data.
Example: movie features such as director, actor, editor -> label
2. Unsupervised learning
No labeled training data.
Groups (clusters) the data instead.
3. Recommendation
Collaborative filtering.
Guesses what you might like.
Widely used in e-commerce.
Spark machine learning library
[Estimator]
Runs on a DataFrame containing features and a label (the outcome); trains on the data to create a model.
The model is then used for subsequent predictions.
[Transformer]
Transforms a DataFrame containing features into a DataFrame that also contains predictions.
A model created by an Estimator is itself a Transformer.
[Parameter]
Configuration values used by Estimators and Transformers, usually specific to the underlying ML algorithm.
Spark exposes a uniform API for setting parameters across algorithms.
[Pipeline]
Chains Estimators and Transformers together into a machine-learning workflow; a minimal sketch of the contract follows.
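A minimal sketch of the Estimator/Transformer contract (trainDF and testDF are assumed placeholders for DataFrames with "features" and "label" columns, like the ones built in the demos below):
import org.apache.spark.ml.regression.LinearRegression
//an Estimator: fit() trains on a DataFrame and returns a model
val est = new LinearRegression()
val model = est.fit(trainDF)              //the fitted model is a Transformer
//a Transformer: transform() appends a "prediction" column
val predictions = model.transform(testDF)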
Steps of a machine-learning application (the LinearRegression demo below walks through them):
1. Read the data file into a training DataFrame.
2. Create a LinearRegression and set its parameters.
3. Fit the model to the training data, completing the estimation pipeline.
4. Create a DataFrame of test data, typically containing both features and labels; comparing the predicted labels against the test labels verifies that the model is OK.
5. Use the model to transform the test data (apply the model) and extract features, label, and prediction.
Using the machine learning library from Scala:
1. Add the Maven dependency to pom.xml:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.1.0</version>
</dependency>
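If the project is built with sbt instead of Maven, the equivalent dependency (same artifact and version) would be:
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.1.0"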
2. The Scala class:
package com.mao.scala.scala
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.sql.SparkSession
object SparkMLDemo1 {
def main(args: Array[String]): Unit = {
val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
val sc = sess.sparkContext;
//data file directory
val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"
//define a case class for wine records
case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double, PH:
Double, Sulphates: Double, Alcohol: Double, Quality: Double)
//parse each line into a Wine instance
val wineDataRDD = sc.textFile(dataDir).map(_.split(";")).map(w => Wine(w(0).toDouble, w(1).toDouble,
w(2).toDouble, w(3).toDouble, w(4).toDouble, w(5).toDouble, w(6).toDouble, w(7).toDouble, w(8).toDouble
, w(9).toDouble, w(10).toDouble, w(11).toDouble))
import sess.implicits._
//convert the RDD into a DataFrame of (label, features)
val trainingDF = wineDataRDD.map(w => (w.Quality,
Vectors.dense(w.FixedAcidity, w.VolatileAcidity, w.CitricAcid,
w.ResidualSugar, w.Chlorides, w.FreeSulfurDioxide, w.TotalSulfurDioxide,
w.Density, w.PH, w.Sulphates, w.Alcohol))).toDF("label", "features")
//show the training data
trainingDF.show()
println("======================")
//create a linear regression estimator
val lr = new LinearRegression()
//set the maximum number of iterations
lr.setMaxIter(50)
//fit the training data with linear regression, producing a model
val model = lr.fit(trainingDF)
//save the model for later reuse
model.save("file:///d:/scala/model");
/*
* to load the saved model later:
val model = LinearRegressionModel.load("file:///d:/scala/model");
*/
//create an in-memory DataFrame of test data
val testDF = sess.createDataFrame(Seq((6.0, Vectors.dense(7, 0.27, 0.36, 20.7, 0.045, 45, 170, 1.001, 3, 0.45, 8.8)),
(6.0,Vectors.dense(6.3, 0.3, 0.34, 1.6, 0.049, 14, 132, 0.994, 3.3, 0.49, 9.5)),
(6.0, Vectors.dense(8.1, 0.28, 0.4, 6.9, 0.05, 30, 97, 0.9951, 3.26, 0.44, 10.1)))).toDF("label", "features")
testDF.show()
//register a temporary view over the test data
testDF.createOrReplaceTempView("test")
println("======================")
//transform the test data with the model, then select the "features", "label", and "prediction" columns
val tested = model.transform(testDF).select("features", "label", "prediction");
tested.show();
//
println("======================")
val featureDF = sess.sql("SELECT features FROM test");
//transform the feature-only data with the model to get predictions
val predictedDF = model.transform(featureDF).select("features", "prediction")
predictedDF.show()
}
}
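Beyond eyeballing tested.show(), the predictions can be scored numerically; a minimal sketch using RegressionEvaluator, assuming the tested DataFrame produced in the demo above:
import org.apache.spark.ml.evaluation.RegressionEvaluator
//compare the "label" and "prediction" columns and report root-mean-square error
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("rmse")
val rmse = evaluator.evaluate(tested)
println(s"RMSE on test data = $rmse")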
Model persistence
1. Save
model.save("file:///d:/scala/model");
2. Load
val model = LinearRegressionModel.load("file:///d:/scala/model");
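Note that save() fails if the target path already exists; a minimal sketch of overwriting via the MLWriter API instead:
//overwrite any existing model at the path
model.write.overwrite().save("file:///d:/scala/model")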
Example: wine classification with logistic regression
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
* wine classification: predict whether a wine is good (quality >= 7) with logistic regression
*/
object LogicRegressWineClassifyDemo {
def main(args: Array[String]): Unit = {
val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
val sc = sess.sparkContext;
//data file directory
val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"
//define a case class for wine records
case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double, PH:
Double, Sulphates: Double, Alcohol: Double, Quality: Double)
//parse each line into a Wine instance
val wineDataRDD = sc.textFile(dataDir).map(_.split(";")).map(w => Wine(w(0).toDouble, w(1).toDouble,
w(2).toDouble, w(3).toDouble, w(4).toDouble, w(5).toDouble, w(6).toDouble, w(7).toDouble, w(8).toDouble
, w(9).toDouble, w(10).toDouble, w(11).toDouble))
import sess.implicits._
//convert the RDD into a DataFrame, binarizing the label: quality < 7 -> 0.0, otherwise 1.0
val trainingDF = wineDataRDD.map(w => (if (w.Quality < 7) 0D else 1D,
Vectors.dense(w.FixedAcidity, w.VolatileAcidity, w.CitricAcid,
w.ResidualSugar, w.Chlorides, w.FreeSulfurDioxide, w.TotalSulfurDioxide,
w.Density, w.PH, w.Sulphates, w.Alcohol))).toDF("label", "features")
//create a logistic regression estimator
val lr = new LogisticRegression()
//set the maximum number of iterations and the regularization parameter
lr.setMaxIter(10).setRegParam(0.01)
//fit the training data, producing a model
val model = lr.fit(trainingDF)
//create a test DataFrame
val testDF = sess.createDataFrame(Seq((1.0,Vectors.dense(6.1, 0.32, 0.24, 1.5, 0.036, 43, 140, 0.9894, 3.36, 0.64, 10.7)),
(0.0, Vectors.dense(5.2, 0.44, 0.04, 1.4, 0.036, 38, 124, 0.9898, 3.29, 0.42, 12.4)),
(0.0,Vectors.dense(7.2, 0.32, 0.47, 5.1, 0.044, 19, 65, 0.9951, 3.38, 0.36, 9)),
(0.0, Vectors.dense(6.4, 0.595, 0.14, 5.2, 0.058, 15, 97, 0.991, 3.03, 0.41, 12.6)))
).toDF("label", "features")
//show the test data
testDF.show();
println("========================")
//predict on the labeled test data to assess model quality
testDF.createOrReplaceTempView("test")
val tested = model.transform(testDF).select("features", "label", "prediction")
tested.show();
println("========================")
//predict on unlabeled test data
val predictDF = sess.sql("SELECT features FROM test")
//prediction results
val predicted = model.transform(predictDF).select("features", "prediction")
predicted.show();
}
}
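A minimal sketch of scoring this classifier with BinaryClassificationEvaluator, assuming the model and testDF from the demo above (the evaluator reads the rawPrediction column that LogisticRegressionModel appends by default):
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
//area under the ROC curve: 1.0 is perfect, 0.5 is random guessing
val evaluator = new BinaryClassificationEvaluator()
.setLabelCol("label")
.setMetricName("areaUnderROC")
val auc = evaluator.evaluate(model.transform(testDF))
println(s"AUC = $auc")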
Spam filtering
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
/**
* spam filter: classify messages as spam (1.0) or ham (0.0) with a pipeline
*/
object SpamFilterDemo1 {
def main(args: Array[String]): Unit = {
val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate();
val sc = sess.sparkContext;
//spam training data: label 1.0 = spam, 0.0 = ham
val training = sess.createDataFrame(Seq(
("you@example.com", "hope you are well", 0.0),
("raj@example.com", "nice to hear from you", 0.0),
("thomas@example.com", "happy holidays", 0.0),
("mark@example.com", "see you tomorrow", 0.0),
("dog@example.com", "save loan money", 1.0),
("xyz@example.com", "save money", 1.0),
("top10@example.com", "low interest rate", 1.0),
("marketing@example.com", "cheap loan", 1.0)))
.toDF("email", "message", "label")
//tokenizer: splits the input column into an output column of words
val tokenizer = new Tokenizer().setInputCol("message").setOutputCol("words")
//hashed term frequencies (the hashing trick), producing the feature vectors
val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features")
//create a logistic regression estimator
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
//assemble the pipeline: tokenizer -> hashingTF -> lr
val pipeline = new Pipeline().setStages(Array(tokenizer,hashingTF, lr))
//fit the pipeline to the training data, producing a PipelineModel
val model = pipeline.fit(training)
//test data for judging the model's quality
val test = sess.createDataFrame(Seq(
("you@example.com", "ab how are you"),
("jain@example.com", "ab hope doing well"),
("caren@example.com", "ab want some money"),
("zhou@example.com", "ab secure loan"),
("ted@example.com", "ab need loan"))).toDF("email", "message")
//transform the test data with the model to get its predictions
val prediction = model.transform(test).select("email", "message", "prediction")
prediction.show()
//the tokenizer alone just splits each message into words
val wordsDF = tokenizer.transform(training)
//wordsDF.show()
val featurizedDF = hashingTF.transform(wordsDF)
featurizedDF.show()
}
}
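The fitted PipelineModel can be persisted as a unit, tokenizer and all; a minimal sketch (the path is a hypothetical example):
import org.apache.spark.ml.PipelineModel
//save the whole fitted pipeline (tokenizer + hashingTF + logistic regression)
model.write.overwrite().save("file:///d:/scala/spamModel")
//reload it later and use it directly for predictions
val sameModel = PipelineModel.load("file:///d:/scala/spamModel")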
ALS: alternating least squares
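ALS factorizes the sparse user-item rating matrix R into two low-rank factor matrices, R ≈ U·Vᵀ, giving every user u a factor vector u_u and every item i a factor vector v_i of length rank. It alternates between fixing one factor matrix and solving a regularized least-squares problem for the other, with the objective
\min_{U,V} \sum_{(u,i)\,\text{observed}} (r_{ui} - u_u^{\top} v_i)^2 + \lambda (\lVert u_u \rVert^2 + \lVert v_i \rVert^2)
A predicted rating is then the dot product u_u^⊤ v_i; in the demo below, rank = 10 and λ = 0.01.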
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
object RecommDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Recommend").setMaster("local[4]")
val sc = new SparkContext(conf)
// Load and parse the data
val data = sc.textFile("file:///d:/scala/ml/recomm/data2.txt")
//parse each line "user,item,rating" into a Rating
val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
Rating(user.toInt, item.toInt, rate.toDouble)
})
// Build the recommendation model using ALS
val rank = 10
val numIterations = 10
//build the recommendation model with alternating least squares (lambda = 0.01)
val model = ALS.train(ratings, rank, numIterations, 0.01)
// extract the (user, product) pairs from the rating data
val usersProducts = ratings.map { case Rating(user, product, rate) =>
(user, product)
}
//predict ratings for (user, product) pairs with the model, keyed as ((user, product), rate)
val ug2 = sc.makeRDD(Array((2,3),(2,4)))
val predictions =
model.predict(ug2).map { case Rating(user, product, rate) =>
((user, product), rate)
}
predictions.collect().foreach(println)
//map the training ratings to ((user, product), rate), join with the predictions, and compute the MSE:
// val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
// ((user, product), rate)
// }.join(predictions)
//
// ratesAndPreds.collect().foreach(println)
//
// val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
// val err = (r1 - r2)
// err * err
// }.mean()
// println("Mean Squared Error = " + MSE)
// // Save and load model
// model.save(sc, "target/tmp/myCollaborativeFilter")
// val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
}
}
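The input file is expected to hold one rating per line in the form user,item,rating, matching the split(',') above; hypothetical example lines:
1,10,5.0
1,20,1.0
2,10,4.5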
Product recommendation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.SparkSession
/**
* product (movie) recommendation with the DataFrame-based ALS API
*/
object MovieRecommDemo {
//case class for a rating record
case class Rating0(userId: Int, movieId: Int, rating: Float, timestamp: Long)
def main(args: Array[String]): Unit = {
val conf = new SparkConf();
conf.setAppName("movieRecomm");
conf.setMaster("local[4]")
val spark = SparkSession.builder().config(conf).getOrCreate() ;
import spark.implicits._
//parse a "userId::movieId::rating::timestamp" line into a Rating0
def parseRating(str: String): Rating0 = {
val fields = str.split("::")
assert(fields.size == 4)
Rating0(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
}
//convert to a DataFrame of ratings
val ratings = spark.sparkContext.textFile("file:///D:\\scala\\ml\\recomm\\sample_movielens_ratings.txt");
val ratings0 = ratings.map(parseRating)
val df = ratings0.toDF()
//randomly split the data into an array: the first element is the training set, the second the test set
val Array(training, test) = df.randomSplit(Array(0.99, 0.01))
//create the ALS estimator and set its parameters
val als = new ALS().setMaxIter(5)
.setRegParam(0.01)
.setUserCol("userId")
.setItemCol("movieId")
.setRatingCol("rating")
//fit the training data with ALS, producing a recommendation model
val model = als.fit(training)
/*******recommend the top 3 items to every user********/
//note: recommendProducts / recommendUsers / recommendProductsForUsers belong to the
//RDD-based mllib MatrixFactorizationModel; the DataFrame-based ALSModel used here
//provides recommendForAllUsers and recommendForAllItems instead (available since Spark 2.2)
val res = model.recommendForAllUsers(3)
/*******recommend the top 3 users for every item********/
//val res = model.recommendForAllItems(3)
//transform the test data with the model to produce predictions
val predictions = model.transform(test);
predictions.collect().foreach(println)
}
}
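The predictions can be scored with the RegressionEvaluator already imported above; a minimal sketch (na.drop() discards NaN predictions for users or items absent from the training split; Spark 2.2+ can use als.setColdStartStrategy("drop") instead):
//root-mean-square error of predicted vs. actual ratings
val evaluator = new RegressionEvaluator()
.setMetricName("rmse")
.setLabelCol("rating")
.setPredictionCol("prediction")
val rmse = evaluator.evaluate(predictions.na.drop())
println(s"RMSE = $rmse")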