炼数成金 课程
1、监控本地文件夹下的文件信息
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
/**
 * Streaming word count over text files that appear in a monitored directory.
 *
 * Every 20-second batch, the lines of the newly detected files are split on
 * single spaces and the per-word counts for that batch are printed.
 */
object HdfsWordCount {

  /** Directory that is monitored when no command-line argument is given. */
  private val DefaultDirectory = "/home/mmicky/temp/"

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the directory to monitor. When absent,
   *             [[DefaultDirectory]] is used.
   */
  def main(args: Array[String]): Unit = {
    // "local[2]" runs Spark in-process with two worker threads.
    val sparkConf = new SparkConf().setAppName("HdfsWordCount").setMaster("local[2]")

    // Create the context; the stream is cut into 20-second batches.
    val ssc = new StreamingContext(sparkConf, Seconds(20))

    val directory = args.headOption.getOrElse(DefaultDirectory)
    val lines = ssc.textFileStream(directory)

    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
2、 网络socket监控
1)构建socket模拟周期发送数据
import java.io.{PrintWriter}
import java.net.ServerSocket
import scala.io.Source
/**
 * Sales simulator: serves random lines of a text file over a TCP socket.
 *
 * Command-line arguments:
 *   1. the file to read lines from,
 *   2. the port to listen on,
 *   3. the interval between two sent lines, in milliseconds.
 *
 * Each accepted client is served by its own thread, which sends one randomly
 * chosen line per interval until the connection fails.
 */
object SaleSimulation {

  // One generator shared by all client threads; java.util.Random is thread-safe.
  private val random = new java.util.Random

  /**
   * Picks a random index into a collection of the given size.
   *
   * @param length number of elements; must be positive
   * @return a uniformly distributed value in `[0, length)`
   * @throws IllegalArgumentException if `length` is not positive
   */
  def index(length: Int): Int = random.nextInt(length)

  /**
   * Entry point.
   *
   * @param args `<filename> <port> <millisecond>`
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println("Usage: <filename> <port> <millisecond>")
      System.exit(1)
    }

    val filename = args(0)
    // Parse the numeric arguments once, so a malformed value fails at startup
    // rather than inside every client thread.
    val port = args(1).toInt
    val intervalMs = args(2).toLong

    // Load the file eagerly and release the file handle right away. A Vector
    // gives effectively constant-time random access, unlike a List.
    val source = Source.fromFile(filename)
    val lines = try source.getLines().toVector finally source.close()

    if (lines.isEmpty) {
      // index(0) would throw in every client thread; fail early instead.
      System.err.println("File " + filename + " contains no lines to send")
      System.exit(1)
    }

    val listener = new ServerSocket(port)
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          try {
            val out = new PrintWriter(socket.getOutputStream(), true)
            // PrintWriter never throws on a failed write; checkError() flushes
            // and reports the failure, which ends the loop once the client
            // has gone away.
            while (!out.checkError()) {
              Thread.sleep(intervalMs)
              val content = lines(index(lines.length))
              println(content)
              out.write(content + '\n')
              out.flush()
            }
          } finally {
            // Always release the connection, including when the thread is
            // interrupted during sleep.
            socket.close()
          }
        }
      }.start()
    }
  }
}
运行:java -cp week5.jar week5.SaleSimulation /home/mmicky/data/spark/people.txt 9999 1000 //从people文件随机读取,发送端口9999,间隔1秒
2)Spark Streaming 监控端
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
/**
 * Streaming word count over text lines received from a TCP socket.
 *
 * Every 5-second batch, the received lines are split on commas and the
 * per-token counts for that batch are printed.
 */
object NetworkWordCount {

  /**
   * Entry point.
   *
   * @param args `<hostname> <port>` of the server to read lines from
   */
  def main(args: Array[String]): Unit = {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }
    val hostname = args(0)
    val port = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    // Arguments: server address, port, storage level for the received data.
    val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER)

    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
3、监控有状态(stateful)
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
/**
 * Stateful streaming word count: keeps a running total per token across
 * batches instead of counting each batch in isolation.
 *
 * Lines are read from a TCP socket in 5-second batches and split on commas.
 */
object StatefulWordCount {

  /**
   * Entry point.
   *
   * @param args `<hostname> <port>` of the server to read lines from
   */
  def main(args: Array[String]): Unit = {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: StatefulWordCount <hostname> <port>")
      System.exit(1)
    }
    val hostname = args(0)
    val port = args(1).toInt

    // State update function required by updateStateByKey:
    //   values - the counts that arrived for a key in the current batch
    //   state  - the total stored for that key so far, if any
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      // No stored total yet means the key starts from 0.
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val conf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create the StreamingContext with a 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    // Stateful processing must persist earlier results, so a checkpoint
    // directory is set (here: the current working directory).
    // NOTE(review): the original author observed that, with no checkpoint
    // interval configured, a checkpoint file is produced for every incoming
    // batch — confirm against the Spark version in use.
    ssc.checkpoint(".")

    // Read the input.
    val lines = ssc.socketTextStream(hostname, port)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map(word => (word, 1))

    // Fold each batch into the running totals via updateFunc.
    val stateDstream = wordCounts.updateStateByKey[Int](updateFunc)
    stateDstream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
4、window(窗口)操作
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
/**
 * Windowed streaming word count.
 *
 * Lines are read from a TCP socket in 5-second batches and split on commas;
 * counts are computed over a sliding window rather than over a single batch.
 */
object WindowWordCount {

  /**
   * Entry point.
   *
   * @param args `<hostname> <port> <windowSeconds> <slideSeconds>`
   */
  def main(args: Array[String]): Unit = {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 4) {
      System.err.println(
        "Usage: WindowWordCount <hostname> <port> <windowSeconds> <slideSeconds>")
      System.exit(1)
    }
    val hostname = args(0)
    val port = args(1).toInt
    // Window length, e.g. 30. Per the original notes it must be a multiple of
    // the 5-second batch interval.
    val windowLength = Seconds(args(2).toInt)
    // Slide interval, e.g. 10. It must also be a multiple of the batch interval.
    val slideInterval = Seconds(args(3).toInt)

    val conf = new SparkConf().setAppName("WindowWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create the StreamingContext with a 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(".")

    // Read the input.
    val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_ONLY_SER)
    val words = lines.flatMap(_.split(","))

    // Window operation: with window = 30 and slide = 10, every 10 seconds the
    // data of the preceding 30 seconds is word-counted.
    val wordCounts = words
      .map(word => (word, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, windowLength, slideInterval)

    // Optimized alternative from the original notes (left disabled): pass an
    // inverse function as well, so each window is derived from the previous
    // result by adding the new batches and subtracting the ones that dropped
    // out of the window:
    //   words.map(x => (x, 1))
    //     .reduceByKeyAndWindow(_ + _, _ - _, windowLength, slideInterval)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}