因为需要实时数据流,先创建一个自动发送数据的工具类
1.创建StreamingData1
package com.mllib
import java.io.PrintWriter
import java.net.ServerSocket
import breeze.linalg.DenseVector
import scala.util.Random
/**
 * Socket-based producer of synthetic linear-regression events.
 *
 * A fixed random weight vector and intercept are drawn once at start-up. Every
 * connected client then receives, every 5 seconds, a batch of lines of the form
 * `label<TAB>f1,f2,...`, where `label = w.dot(x) + intercept` and `x` is a random
 * Gaussian feature vector.
 */
object StreamingData1 {
  def main(args: Array[String]): Unit = {
    // Exclusive upper bound on the number of events generated per batch
    val MaxEvents = 100
    // Number of features in every generated vector
    val NumFeatures = 2
    // Single source of truth for the port, so the log message cannot drift from the socket
    val Port = 9998

    val random = new Random()

    /** Returns an array of `n` samples drawn with `Random.nextGaussian()`. */
    def generateRandomArray(n: Int): Array[Double] =
      Array.tabulate(n)(_ => random.nextGaussian())

    /** Fixed random model: weight vector and intercept, drawn once. */
    val w = new DenseVector(generateRandomArray(NumFeatures))
    val intercept = random.nextGaussian() * 10

    /** Generates `n` (label, features) pairs from the fixed model. */
    def generateNoisyData(n: Int) = {
      (1 to n).map { _ =>
        val x = new DenseVector(generateRandomArray(NumFeatures))
        val y: Double = w.dot(x)
        // The additive noise term is currently disabled, so the label is exact
        val noisy = y + intercept //+ 0.1 * random.nextGaussian()
        (noisy, x)
      }
    }

    // Create a network producer
    val listener = new ServerSocket(Port)
    println(s"Listening on port: $Port")
    while (true) {
      val socket = listener.accept()
      // One worker thread per client so that accept() is never blocked by a slow consumer
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          try {
            val out = new PrintWriter(socket.getOutputStream(), true)
            // PrintWriter never throws on a failed write; checkError() is the only
            // way to notice that the client has gone away and stop this thread.
            while (!out.checkError()) {
              Thread.sleep(5000)
              val num = random.nextInt(MaxEvents)
              val data = generateNoisyData(num)
              data.foreach { case (y, x) =>
                val xStr = x.data.mkString(",")
                val eventStr = s"$y\t$xStr"
                println("eventStr: \t" + eventStr)
                out.write(eventStr)
                out.write("\n")
              }
              println("data: \n" + data)
              out.flush()
              println(s"Created $num events...")
            }
          } finally {
            // Always release the client socket, whether the loop ended or an exception escaped
            socket.close()
          }
        }
      }.start()
    }
  }
}
2.创建StreamingMLDemo2
package com.mllib
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Minimal Spark Streaming consumer: connects to the socket producer on
 * 127.0.0.1:9998 and prints the received text stream using a 5-second batch interval.
 */
object StreamingMLDemo2 {
  def main(args: Array[String]): Unit = {
    // Spark configuration: application name "streaming", master URL "local[2]"
    val conf = new SparkConf()
    conf.setAppName("streaming")
    conf.setMaster("local[2]")

    // Streaming context with a 5-second batch interval
    val batchInterval = Seconds(5)
    val ssc = new StreamingContext(conf, batchInterval)
    // Checkpoint data is written to the current working directory
    ssc.checkpoint(".")

    // Read lines of text from the socket producer
    val data = ssc.socketTextStream("127.0.0.1", 9998)
    data.print()

    // Start the computation and block until it is terminated
    ssc.start()
    ssc.awaitTermination()
  }
}
先启动StreamingData1,再启动StreamingMLDemo2。StreamingMLDemo2的控制台会每5秒打印一批接收到的数据。
3.创建模型、训练模型、做出预测
// Create the model.
// When a streaming model is first started it has not seen any data yet, so it must
// be given an explicit initial (all-zero) weight vector.
val zeroVec = DenseVector.zeros[Double](2) // zero vector with 2 entries
val model = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.dense(zeroVec.data)) // initial weights
  .setNumIterations(1) // number of iterations
  .setStepSize(0.01) // step size

// Convert each input line into the shape the model expects.
// A line is split on a tab into the label and a comma-separated list of features.
val labeledPoint = data.map { line =>
  val arr = line.split("\t")
  val y = arr(0).toDouble
  val feature = arr(1).split(",").map(_.toDouble)
  LabeledPoint(y, Vectors.dense(feature))
}

// Train the model on the stream
model.trainOn(labeledPoint)

// Make predictions: each element is (actual label, predicted value).
// The already-parsed stream is reused so the line format is decoded in one place only,
// and predictOnValues applies the model to the feature vector of each pair.
val result = model.predictOnValues(labeledPoint.map(lp => (lp.label, lp.features)))