1 Machine Learning Workflow
Raw data —> feature engineering (producing training and test data) —> build a model —> evaluate the model (on the test data) —> decide whether the model is acceptable (if not, keep training so the algorithm can learn) —> apply the model
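To make the workflow concrete, here is a minimal sketch of the same stages using spark.ml's Pipeline API; the input path, the 0.9 acceptance threshold, and the save location are illustrative placeholders, and an existing SparkSession named spark (as built in the demos below) is assumed:
// Minimal workflow sketch (illustrative paths and threshold, assumes a SparkSession `spark`)
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
val raw = spark.read.format("libsvm").load("data/train.libsvm") // raw, already feature-engineered data
val Array(train, test) = raw.randomSplit(Array(0.7, 0.3)) // training data / test data
val lr = new LogisticRegression().setMaxIter(100) // choose an algorithm
val model = new Pipeline().setStages(Array(lr)).fit(train) // build the model
val auc = new BinaryClassificationEvaluator().evaluate(model.transform(test)) // evaluate on test data
if (auc > 0.9) model.write.overwrite().save("data/model") // apply/save only if the model is acceptable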
2 Logistic Regression Demo
1 Model training + use + saving
//1. Build the Spark session
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("person")
.master("local")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Read the preprocessed body-metrics data in libsvm format
val data: DataFrame = spark
.read
.format("libsvm")
.load("data/人体指标.txt")
//data.show(false) // inspect the data; false disables truncation so full values are shown
//3. Split the data into a training set (0.7) and a test set (0.3)
val splitDS: Array[Dataset[Row]] = data.randomSplit(Array(0.7, 0.3))
val tran: Dataset[Row] = splitDS(0) // training set
val test: Dataset[Row] = splitDS(1) // test set
//4. Choose an algorithm: this is a binary classification problem, so logistic regression is a good fit
val regression: LogisticRegression = new LogisticRegression()
.setFitIntercept(true) // fit an intercept term
.setMaxIter(100) // maximum number of iterations
//5. Fit the model on the training set
val model: LogisticRegressionModel = regression.fit(tran)
//6. Model evaluation: transform appends a prediction column to the test set
val testDF: DataFrame = model.transform(test)
//testDF.show(10000,false) // inspect; 10000 is the max number of rows to display
//7. Compare the prediction with the original label: 1.0 where they match, 0.0 otherwise; accuracy = matches / total
val p: DataFrame = testDF.select(sum(when($"label" === $"prediction", 1.0).otherwise(0.0)) / count($"label") as "p")
//p.show() // inspect the accuracy
//8. If the accuracy is acceptable, save the model
model.write.overwrite().save("data/personModel")
//9. Using a saved model: load it from its path and use it directly
//val model: LogisticRegressionModel = LogisticRegressionModel.load("data/personModel")
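As an alternative to the hand-rolled accuracy column in step 7, spark.ml's evaluator API computes standard metrics directly; a minimal sketch, reusing the testDF from step 6:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
val accuracy: Double = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
.evaluate(testDF)
println(s"accuracy = $accuracy") // same ratio as the manual sum/count above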
2 Using the saved model
//1. Build the Spark session
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("person")
.master("local")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Load the model
val model: LogisticRegressionModel = LogisticRegressionModel.load("data/personModel")
/**
* A single record:
* 0 1:5.3 2:3.5 3:2.5 4:106.4 5:67.5 6:69.1 7:83
*/
val vector: linalg.Vector = Vectors.dense(Array(5.3,3.5,2.5,106.4,67.5,69.1,83))
//3. Predict
val result: Double = model.predict(vector)
println(result) // print the predicted class
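predict returns only the class. To also inspect the class probabilities, one option is to wrap the same vector in a one-row DataFrame and call transform; a sketch reusing spark, model, and vector from above:
val oneRow: DataFrame = Seq(Tuple1(vector)).toDF("features") // default features column name
model.transform(oneRow)
.select("prediction", "probability")
.show(false)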
3 K-means demo
//1. Build the Spark session
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("Kmeans")
.master("local")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
//2. Read the data
val data: DataFrame = spark
.read
.format("csv")
.schema("x DOUBLE,y DOUBLE")
.load("data/kmeans.txt")
//3. Assemble each row into a feature vector
val ds: Dataset[(Double, Double)] = data.as[(Double, Double)] // each row as a tuple
val vectorDF: DataFrame = ds
.map(kv => Tuple1(Vectors.dense(kv._1, kv._2)))
.toDF("features") // KMeans expects a Vector column named "features"; a plain array column or another name fails
//4. Configure the KMeans algorithm
val kmeans: KMeans = new KMeans()
.setK(3) // k: the number of clusters (cluster centers)
//5. Fit the model (iterative training)
val model: KMeansModel = kmeans.fit(vectorDF)
//6. Compute the cluster assignments
val result: DataFrame = model.transform(vectorDF)
result.show(100000) // inspect the assignments
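Beyond eyeballing the assignments, you can print the learned centers and score the clustering with a silhouette metric; a sketch reusing model and result from above (ClusteringEvaluator requires Spark 2.3+):
import org.apache.spark.ml.evaluation.ClusteringEvaluator
model.clusterCenters.foreach(println) // the 3 learned centroids
val silhouette: Double = new ClusteringEvaluator().evaluate(result) // uses the default features/prediction columns
println(s"silhouette = $silhouette") // values near 1.0 indicate well-separated clusters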
4 Image Recognition Demo
1 Reading the images
//1. Build the Spark session
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("image")
.master("local[*]")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Read the image files
val imageData: DataFrame = spark
.read
.format("image")
.load("data/train")
.repartition(64) // repartition for better parallelism
/**
* root
* |-- image: struct (nullable = true)
* | |-- origin: string (nullable = true) file path
* | |-- height: integer (nullable = true) image height
* | |-- width: integer (nullable = true) image width
* | |-- nChannels: integer (nullable = true)
* | |-- mode: integer (nullable = true)
* | |-- data: binary (nullable = true) pixel data
*/
//imageData.printSchema() // inspect the schema
//3. Extract the file path and the raw image bytes
val data: DataFrame = imageData.select($"image.origin" as "name", $"image.data" as "data")
//4. Preprocess the data
val nameandfeatures: DataFrame = data
.as[(String, Array[Byte])] // a DataFrame can't be mapped with a typed function; convert to a Dataset first
.map(kv => {
val name: String = kv._1.split("/").last // file name
val value: Array[Byte] = kv._2 // raw bytes
val newdata: Array[Double] = value
.map(_.toDouble) // convert each byte to Double
.map(p => { // binarize: bytes that read as negative become 1.0, everything else 0.0
if (p < 0) {
1.0
} else {
0.0
}
})
val vec: linalg.Vector = Vectors.dense(newdata) // a sparse vector would save space, but the libsvm output would then infer inconsistent vector lengths
(name, vec)
})
.toDF("name", "features") // back to a DataFrame
//5. Read the image labels
val labelData: DataFrame = spark
.read
.format("csv")
.option("sep", " ")
.schema("name String,label Double") // the label column must be named "label"
.load("data/train.txt")
//6. Join the feature data with the label data
val resultData: DataFrame = nameandfeatures
.join(labelData.hint("broadcast"), List("name"), "inner") // $"name" === $"name" is ambiguous (which side's name?), so join on List("name")
.select("label", "features")
//7. Save in libsvm format
resultData
.write
.mode(SaveMode.Overwrite)
.format("libsvm")
.save("data/images")
2 Model training
//1. Build the Spark session
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("image")
.master("local[*]")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Read the data
val data: DataFrame = spark
.read
.format("libsvm")
.load("data/images")
//3. Split into training and test sets
val array: Array[Dataset[Row]] = data.randomSplit(Array(0.7, 0.3))
val train: Dataset[Row] = array(0) // training set
val test: Dataset[Row] = array(1) // test set
//4. Configure the algorithm
val regression: LogisticRegression =
new LogisticRegression() // logistic regression
.setFitIntercept(true) // fit an intercept term
.setMaxIter(100) // maximum number of iterations
//5. Fit the model on the training data
val model: LogisticRegressionModel = regression.fit(train)
val result: DataFrame = model.transform(test)
//6. Compute the accuracy
result
.select(sum(when($"label" === $"prediction", 1).otherwise(0)) / count($"label") as "rate")
.show()
//7. Save the model
model
.write
.overwrite()
.save("data/imageModel")
3 Testing
//1. Build the Spark session
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("image")
.master("local[*]")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Load the model
val model: LogisticRegressionModel = LogisticRegressionModel.load("data/imageModel")
//3. Read a single image
val imageData: DataFrame = spark
.read
.format("image")
.load("data/27550.jpg")
val data: DataFrame = imageData.select($"image.origin" as "name", $"image.data" as "data")
//4. Extract features (same preprocessing as training)
val nameandfeatures: DataFrame = data
.as[(String, Array[Byte])] // a DataFrame can't be mapped with a typed function; convert to a Dataset first
.map(kv => {
val name: String = kv._1 // file name
val value: Array[Byte] = kv._2 // raw bytes
val newdata: Array[Double] = value
.map(_.toDouble) // convert each byte to Double
.map(p => { // binarize: bytes that read as negative become 1.0, everything else 0.0
if (p < 0) {
1.0
} else {
0.0
}
})
val vec: linalg.Vector = Vectors.dense(newdata)
(name, vec)
})
.toDF("name", "features")
//5. Produce the result
model
.transform(nameandfeatures)
.show()
5 IK demo
import java.io.StringReader
import org.wltea.analyzer.core.{IKSegmenter, Lexeme}
import scala.collection.mutable.ListBuffer

object Demo08IK {
def main(args: Array[String]): Unit = {
val str = "别人笑我太疯癫,我笑他人看不穿;不见五陵豪杰墓,无花无酒锄作田。"
val words: List[String] = fit(str)
println(words)
}
// Tokenize a piece of Chinese text with the IK analyzer and collect the terms
def fit(text: String): List[String] = {
val listBuffer: ListBuffer[String] = new ListBuffer[String]
val sr: StringReader = new StringReader(text)
val ik: IKSegmenter = new IKSegmenter(sr, true) // true enables smart (coarse-grained) segmentation
var word: Lexeme = ik.next()
while (word != null) {
listBuffer += word.getLexemeText
word = ik.next()
}
listBuffer.toList
}
}
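Because fit is a plain String => List[String] function, it plugs straight into Spark, either through a typed map (as the Naive Bayes demo below does) or as a UDF; a usage sketch, where df stands for any DataFrame with a text column:
import org.apache.spark.sql.functions.udf
val ikTokenize = udf((text: String) => Demo08IK.fit(text))
//df.withColumn("words", ikTokenize($"text")) // appends an array-of-strings column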
6 Naive Bayes Text Classification Demo
//1. Build the Spark session
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel, Tokenizer}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
val spark: SparkSession = SparkSession
.builder()
.appName("person")
.master("local[*]")
.config("spark.sql.shuffle.partitions", "2")
.getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._
//2. Read the text data
val data: DataFrame = spark
.read
.format("csv")
.option("sep", "\t")
.schema("label DOUBLE,text STRING")
.load("data/train1.txt")
//3. Tokenize
val wordsDS: Dataset[(Double, List[String])] = data
.as[(Double, String)]
.map(kv => {
val label: Double = kv._1
val text: String = kv._2
val words: List[String] = Demo08IK.fit(text) // tokenize with the IK analyzer
(label, words)
})
//wordsDS.show(1000,false)
//4. Drop dirty data (keep rows with more than two tokens)
val filterDS: Dataset[(Double, List[String])] = wordsDS.filter(_._2.length > 2)
//5. Join each row's tokens with spaces
val linesDF: DataFrame = filterDS
.map(kv => {
(kv._1, kv._2.mkString(" "))
}).toDF("label", "text")
//6. Use the built-in space-based Tokenizer; joining with spaces above was done precisely so it can be applied
val tokenizer: Tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData: DataFrame = tokenizer.transform(linesDF)
//7. Compute TF (term frequency) and IDF (inverse document frequency)
val hashingTF: HashingTF = new HashingTF()
.setInputCol("words").setOutputCol("rawFeatures")
val featurizedData: DataFrame = hashingTF.transform(wordsData)
val idf: IDF = new IDF().setInputCol("rawFeatures").setOutputCol("features") // IDF
val idfModel: IDFModel = idf.fit(featurizedData) // fit the IDF model
val rescaledData: DataFrame = idfModel.transform(featurizedData) // TF-IDF features
//8. Split the result into training and test sets
val array: Array[Dataset[Row]] = rescaledData.randomSplit(Array(0.7, 0.3))
val train: Dataset[Row] = array(0)
val test: Dataset[Row] = array(1)
//9. Use Naive Bayes for the text classification
val model: NaiveBayesModel = new NaiveBayes().fit(train)
val dataFrame: DataFrame = model.transform(test)
//10. Compute the accuracy
dataFrame
.select(sum(when($"label" === $"prediction", 1).otherwise(0)) / count($"label") as "rate")
.show()
//11. Save the IDF model and the Naive Bayes model
idfModel.write.overwrite().save("data/idfModel")
model.write.overwrite().save("data/naiveBayes")
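To classify new text with the saved models, the same feature steps must be replayed at inference time: IK tokenization, space-joining, Tokenizer, HashingTF with the same (default) numFeatures, then the saved IDF and Naive Bayes models. A sketch, reusing the session and imports above; the input sentence is illustrative:
val idfModel2: IDFModel = IDFModel.load("data/idfModel")
val nb: NaiveBayesModel = NaiveBayesModel.load("data/naiveBayes")
val input: DataFrame = Seq(Tuple1(Demo08IK.fit("这是一条待分类的测试文本").mkString(" "))).toDF("text")
val tokenized: DataFrame = new Tokenizer().setInputCol("text").setOutputCol("words").transform(input)
val tf: DataFrame = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").transform(tokenized) // same default numFeatures as training
nb.transform(idfModel2.transform(tf)).select("prediction").show()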
The required data files are available here:
https://github.com/xiaoyoupei/spark/tree/master