Level 1: QueueStream
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
object QueueStream {
def main(args: Array[String]) {
val rddQueue = new mutable.SynchronizedQueue[RDD[String]]()
val conf = new SparkConf().setMaster("local[2]").setAppName("queueStream")
/********** Begin **********/
//1. Initialize the StreamingContext with a 1-second batch interval
val ssc = new StreamingContext(conf, Seconds(1))
//2. Create a DStream from the RDD queue
val inputStream = ssc.queueStream(rddQueue)
/**
*
* Sample data:
* 100.143.124.29,1509116285000,'GET www/1 HTTP/1.0',https://www.baidu.com/s?wd=反叛的鲁鲁修,404
* From left to right the fields are: client IP, visit timestamp (epoch milliseconds),
* request line (request method, start URL, HTTP version), target URL, and status code.
*
* The raw data is delimited by commas (ASCII commas).
*
* Requirements:
* 1. Convert the timestamp to a formatted time string (format: yyyy-MM-dd HH:mm:ss)
* 2. Extract the start URL from the request line (delimited by spaces)
* 3. Concatenate the result in the following format:
* Ip:124.132.29.10,visitTime:2019-04-22 11:08:33,startUrl:www/2,targetUrl:https://search.yahoo.com/search?p=反叛的鲁鲁修,statusCode:200
* 4. Write the final result to MySQL by calling DBUtils.add(line), where line: String
*/
//3. Clean and transform the data from the queue stream (per the requirements above)
val data = inputStream.map(line => {
val fields = line.split(',')
val ip = fields(0)
// Convert the epoch-millisecond timestamp to "yyyy-MM-dd HH:mm:ss"
val simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val visitTime = simpleDateFormat.format(new Date(fields(1).toLong))
// The start URL is the second space-separated token of the request line, e.g. 'GET www/1 HTTP/1.0'
val startUrl = fields(2).split(' ')(1)
val targetUrl = fields(3)
val statusCode = fields(4)
"Ip:" + ip + ",visitTime:" + visitTime + ",startUrl:" + startUrl + ",targetUrl:" + targetUrl + ",statusCode:" + statusCode
})
//4. Write the final result to MySQL by calling DBUtils.add(line), where line: String
data.foreachRDD(rdd => {
rdd.foreachPartition(it => {
it.foreach(line => {
DBUtils.add(line)
})
})
})
//5. Start the StreamingContext
ssc.start()
/********** End **********/
DBUtils.addQueue(ssc, rddQueue)
}
}
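DBUtils here is a helper supplied by the exercise platform: DBUtils.add(line) persists one result line to MySQL, and DBUtils.addQueue(ssc, rddQueue) feeds the queue that queueStream consumes. Its real implementation is not shown; the following is only a rough, hypothetical stand-in (the object name DBUtilsSketch, the table log_result, and the connection settings are all assumptions, not the platform's code), reusing the imports from the snippet above, to illustrate roughly what those two calls might do.

object DBUtilsSketch {
  // Hypothetical: push a few raw log lines into the queue, one batch per second,
  // so that ssc.queueStream(rddQueue) has data to process, then stop the context.
  def addQueue(ssc: StreamingContext, rddQueue: mutable.SynchronizedQueue[RDD[String]]): Unit = {
    val sample = "100.143.124.29,1509116285000,'GET www/1 HTTP/1.0',https://www.baidu.com/s?wd=反叛的鲁鲁修,404"
    for (_ <- 1 to 5) {
      rddQueue += ssc.sparkContext.makeRDD(Seq(sample))
      Thread.sleep(1000)
    }
    ssc.stop()
  }

  // Hypothetical: insert one formatted result line into an assumed MySQL table.
  def add(line: String): Unit = {
    val conn = java.sql.DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/educoder", "root", "123123") // assumed credentials
    try {
      val ps = conn.prepareStatement("INSERT INTO log_result(line) VALUES (?)") // assumed table
      ps.setString(1, line)
      ps.executeUpdate()
    } finally {
      conn.close()
    }
  }
}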
Level 2: File Streams
package com.sanyiqi
import java.sql.{Connection, DriverManager, ResultSet}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object SparkStreaming {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("edu").setMaster("local")
/********** Begin **********/
//1. Initialize the StreamingContext with a 1-second batch interval
val ssc = new StreamingContext(conf, Seconds(1))
//2. Create a file stream that monitors the directory /root/step11_fils
val lines = ssc.textFileStream("/root/step11_fils")
/*
* Sample data: hadoop hadoop spark spark
* The delimiter is a space.
* Requirements:
* Accumulate the number of occurrences of each word across batches.
* Write the results to MySQL.
* Check whether the word to be inserted already exists in the MySQL table: if not, insert it
* directly; if it does, add the previously stored count to this batch's count and write it back.
* Use database educoder, table step, word column word, and count column count.
*/
//3. Clean and transform the data
val wordcount = lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
//4. Write the results to MySQL
wordcount.foreachRDD(rdd => {
rdd.foreachPartition(eachPartition => {
val connection: Connection = createConnection()
eachPartition.foreach(record => {
// Check whether this word is already stored in the table
val querySql = "SELECT t.count FROM step t WHERE t.word = '" + record._1 + "'"
val queryResultSet: ResultSet = connection.createStatement().executeQuery(querySql)
val exists = queryResultSet.next()
println("word '" + record._1 + "' already in MySQL: " + exists)
if (!exists) {
// Not present yet: insert the word with this batch's count
val insertSql = "insert into step(word,count) values('" + record._1 + "'," + record._2 + ")"
connection.createStatement().execute(insertSql)
} else {
// Already present: add the stored count to this batch's count and update the row
val newWordCount = queryResultSet.getInt("count") + record._2
val updateSql = "UPDATE step SET count = " + newWordCount + " where word = '" + record._1 + "'"
connection.createStatement().execute(updateSql)
}
})
connection.close()
})
})
//5. Start the StreamingContext
ssc.start()
/********** End **********/
Thread.sleep(15000)
ssc.awaitTermination()
ssc.stop()
}
/**
* Get a MySQL connection
* @return
*/
def createConnection(): Connection ={
Class.forName("com.mysql.jdbc.Driver")
DriverManager.getConnection("jdbc:mysql://localhost:3306/educoder","root","123123")
}
}
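The solution above interpolates record._1 directly into the SQL strings, which is acceptable for this graded exercise but breaks as soon as a word contains a quote character. Below is a minimal sketch of a safer variant of the same select-then-insert-or-update logic against the same educoder.step table, using bound parameters instead of string concatenation; it reuses the java.sql imports above, and the helper name upsertWordCount is ours, not part of the exercise.

// Sketch: parameterized version of the per-record insert-or-update used above.
def upsertWordCount(connection: Connection, word: String, count: Int): Unit = {
  val query = connection.prepareStatement("SELECT count FROM step WHERE word = ?")
  query.setString(1, word)
  val rs = query.executeQuery()
  if (rs.next()) {
    // Word already present: add this batch's count to the stored count
    val update = connection.prepareStatement("UPDATE step SET count = ? WHERE word = ?")
    update.setInt(1, rs.getInt("count") + count)
    update.setString(2, word)
    update.executeUpdate()
  } else {
    // Word not present yet: insert it with this batch's count
    val insert = connection.prepareStatement("INSERT INTO step(word, count) VALUES(?, ?)")
    insert.setString(1, word)
    insert.setInt(2, count)
    insert.executeUpdate()
  }
}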
Level 3: KafkaStream
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object KafkaStream {
def main(args: Array[String]): Unit = {
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "127.0.0.1:9092",
"group.id" -> "g1",
"enable.auto.commit" -> "false",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer]
)
val partition = new TopicPartition("test", 0)
val list = List(partition)
val offsets = Map(partition -> 0L)
val conf = new SparkConf().setMaster("local[*]").setAppName("kafkaStream")
/********** Begin **********/
//1. Initialize the StreamingContext with a 1-second batch interval
val ssc = new StreamingContext(conf, Seconds(1))
//2. Create the stream with KafkaUtils, subscribing via Assign; the topic-partition list (list), Kafka parameters (kafkaParams) and offsets (offsets) are defined above
val inputStream = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Assign[String, String](list, kafkaParams, offsets))
/**
*
* Sample data:
* 100.143.124.29,1509116285000,'GET www/1 HTTP/1.0',https://www.baidu.com/s?wd=反叛的鲁鲁修,404
* From left to right the fields are: client IP, visit timestamp (epoch milliseconds),
* request line (request method, start URL, HTTP version), target URL, and status code.
*
* The raw data is delimited by commas (ASCII commas).
*
* Requirements:
* 1. Convert the timestamp to a formatted time string (format: yyyy-MM-dd HH:mm:ss)
* 2. Extract the start URL from the request line (delimited by spaces)
* 3. Concatenate the result in the following format:
* Ip:124.132.29.10,visitTime:2019-04-22 11:08:33,startUrl:www/2,targetUrl:https://search.yahoo.com/search?p=反叛的鲁鲁修,statusCode:200
* 4. If the RDD is empty, call ssc.stop(false, false) followed by sys.exit(0); otherwise write the result to MySQL by calling DBUtils.add(line), where line: String
*/
//3. Clean and transform the data from the Kafka stream (per the requirements above)
val simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val value = inputStream.map(x => {
val arrs = x.value().split(",")
val ip = arrs(0)
val time = simpleDateFormat.format(new Date(arrs(1).toLong))
val startUrl = arrs(2).split(" ")(1)
val targetUrl = arrs(3)
val statusCode = arrs(4)
val result = "Ip:" + ip + ",visitTime:" + time + ",startUrl:" + startUrl + ",targetUrl:" + targetUrl + ",statusCode:" + statusCode
result
})
//4. If the RDD is empty, call ssc.stop(false, false) and sys.exit(0); otherwise write the result to MySQL by calling DBUtils.add(line), where line: String
value.foreachRDD(rdd => {
if (rdd.isEmpty()) {
ssc.stop(false, false)
sys.exit(0)
} else {
rdd.foreachPartition(it => {
it.foreach(line => {
DBUtils.add(line)
})
})
}
})
//5. Start the StreamingContext
ssc.start()
//6. Wait for the computation to terminate
ssc.awaitTermination()
/********** End **********/
}
}
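Note that enable.auto.commit is set to false in kafkaParams above and the solution never commits offsets, which is fine here because the job stops itself once the topic is drained. In a long-running job you would normally commit the processed ranges back to Kafka yourself; the following is a minimal sketch of that pattern using the spark-streaming-kafka-0-10 HasOffsetRanges and CanCommitOffsets interfaces, applied to the inputStream defined above (the per-batch processing is elided).

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

// Sketch: commit offsets back to Kafka once each batch has been processed.
inputStream.foreachRDD { rdd =>
  // The offset ranges covered by this batch
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process rdd here (e.g. the map + DBUtils.add logic above) ...
  // Asynchronously commit the ranges after processing succeeds
  inputStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}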
Level 4: socketTextStream
import java.sql.{Connection, DriverManager, ResultSet}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Use Spark Streaming to do stateful word counting and write the results to MySQL.
*/
object SocketSparkStreaming {
// State-update function: adds each key's counts from the current batch to its running total
/********** Begin **********/
val upfunc = (iter:Iterator[(String,Seq[Int],Option[Int])])=>{
iter.map{case (x,y,z) => (x,y.sum+z.getOrElse(0))}
}
/********** End **********/
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("socketSparkStreaming").setMaster("local[2]")
conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val ssc = new StreamingContext(conf, Seconds(2))
/********** Begin **********/
//1. Connect to the socket stream on host localhost, port 5566
val lines: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 5566)
//2. Split each line on spaces and flatten
val words = lines.flatMap(_.split(" "))
//3. Map each word to a (word, 1) pair
val wordAndOne = words.map(x => (x, 1))
//4. Set the checkpoint directory (required by updateStateByKey)
ssc.checkpoint("/root/check")
//5. Accumulate the per-word counts across all batches
val reduced = wordAndOne.updateStateByKey(upfunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
//6. Write the results to MySQL:
//   if the word already exists, update its count;
//   otherwise insert it (handled by INSERT ... ON DUPLICATE KEY UPDATE)
reduced.foreachRDD(rdd => {
rdd.foreachPartition(partitionOfRecords => {
val connection = createConnection()
partitionOfRecords.foreach(record => {
val upsertSql = "insert into wordcount (word,wordcount) values('" + record._1 + "'," + record._2 + ") on DUPLICATE key update wordcount=" + record._2
connection.createStatement().execute(upsertSql)
})
connection.close()
})
})
/********** End **********/
ssc.start()
ssc.awaitTermination()
}
/**
* Get a MySQL connection
* @return
*/
def createConnection(): Connection = {
Class.forName("com.mysql.jdbc.Driver")
DriverManager.getConnection("jdbc:mysql://localhost:3306/edu","root","123123")
}
}
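upfunc above uses the iterator-based overload of updateStateByKey: per partition it receives (key, values seen in this batch, previous state) triples and adds the batch sum to the running total. The simpler per-key overload expresses the same accumulation more directly; below is a minimal equivalent sketch against the wordAndOne stream defined above, shown for comparison only (not what the grader expects).

// Sketch: per-key state update; newValues are this batch's 1s for a word,
// runningCount is the total carried over from earlier batches.
val reducedAlt = wordAndOne.updateStateByKey[Int](
  (newValues: Seq[Int], runningCount: Option[Int]) => Some(newValues.sum + runningCount.getOrElse(0))
)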