Spark Practice Notes
I. Sort by height, average age, all surnames, three latest birthdays per month, students sharing a birthday
Given the student data below, complete the following requirements with Spark Core.
Class StudentID Gender Name Birthday BloodType HomeAddress Height PhoneNumber
RB171 RB17101 男 张** 1997-02-10 AB 河南省郑州市1号 172 11122223333
RB171 RB17102 女 冯** 1996-10-01 A 河南省洛阳市2号 175 18837110115
RB171 RB17103 男 卢** 1998-08-02 B 河南省开封市3号 165 19999228822
RB171 RB17104 男 杨** 1996-08-09 A 河南省安阳市4号 168 13322554455
RB172 RB17201 女 姜** 1997-01-03 A 河南省鹤壁市1号 170 13688552244
RB172 RB17282 男 高* 1996-08-27 B 河南省新乡市2号 171 13522114455
RB173 RB17203 女 何* 1997-12-20 B 河南省焦作市3号 168 13566998855
1) Sort by height
package com.zxy.spark.Streaming.day004
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object demo4 {
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext(new SparkConf().setAppName("demo4").setMaster("local[*]"))
val linesRDD: RDD[String] = sparkContext.textFile("date/student.txt")
val StudentRDD: RDD[(Int, (String, String, String, String, String, String, String, String))] = linesRDD.map(line => {
val fields: Array[String] = line.split("\\s+")
val ClassName: String = fields(0)
val StuID: String = fields(1)
val Sex: String = fields(2)
val Name: String = fields(3)
val Brithday: String = fields(4)
val TypeOfRed: String = fields(5)
val Adress: String = fields(6)
val Height: Int = fields(7).toInt
val PhoneNumber: String = fields(8)
(Height, (ClassName, StuID, Sex, Name, Brithday, TypeOfRed, Adress, PhoneNumber))
})
val resRDD: RDD[(Int, (String, String, String, String, String, String, String, String))] = StudentRDD.sortByKey(false,1)
resRDD.foreach(println)
sparkContext.stop()
}
}
2) Compute the average age
package com.zxy.spark.Streaming.day004
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object demo4 {
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext(new SparkConf().setAppName("demo4").setMaster("local[*]"))
val linesRDD: RDD[String] = sparkContext.textFile("date/student.txt")
val ageRDD: RDD[Int] = linesRDD.map(line => {
val fields: Array[String] = line.split("\\s+")
val Brithday: String = fields(4)
// split the birthday into year-month-day; use a new name to avoid redefining fields
val ymd: Array[String] = Brithday.split("-")
val year = ymd(0).toInt
val age = 2021 - year
age
})
val num: Long = ageRDD.count()
val sum: Double = ageRDD.sum()
val avgage = (sum / num).toInt
println(s"$sum -> $num")
println(s"$avgage")
sparkContext.stop()
}
}
3) Find all surnames among the students
package com.zxy.spark.Streaming.day004
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object demo4 {
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext(new SparkConf().setAppName("demo4").setMaster("local[*]"))
val linesRDD: RDD[String] = sparkContext.textFile("date/student.txt")
val firstName: RDD[String] = linesRDD.map(line => {
val fields: Array[String] = line.split("\\s+")
val Name: String = fields(3)
val firstName: String = Name.substring(0, 1)
(firstName)
})
val thefirstName: RDD[String] = firstName.distinct()
thefirstName.foreach(println)
sparkContext.stop()
}
}
4) For each month, return the 3 students born on the latest days of that month
package com.zxy.spark.Streaming.day004
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object demo4 {
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext(new SparkConf().setAppName("demo4").setMaster("local[*]"))
val linesRDD: RDD[String] = sparkContext.textFile("date/student.txt")
val studentRDD: RDD[(String, String)] = linesRDD.map(line => {
val fields: Array[String] = line.split("\\s+")
val Name: String = fields(3)
val Brithday: String = fields(4)
(Name, Brithday)
})
val monthRDD: RDD[(String, (String, Int))] = studentRDD.map(info => {
val brithday: String = info._2
val fields: Array[String] = brithday.split("-")
val month: String = fields(1)
val day: Int = fields(2).toInt
val days = 30 - day
(month, (info._1, days))
})
val gbkRDD: RDD[(String, Iterable[(String, Int)])] = monthRDD.groupByKey()
gbkRDD.foreach(println)
val resRDD: RDD[(String, List[(String, Int)])] = gbkRDD.map(line => {
val arrs = line._2
// sort by the (30 - day) offset ascending, so the students born latest in the month come first
val tuples: List[(String, Int)] = arrs.toList.sortBy(_._2).take(3)
(line._1, tuples)
})
resRDD.foreach(println)
sparkContext.stop()
}
}
5) Index the lists of names of students who share the same birthday
package com.zxy.spark.Streaming.day004
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object demo4 {
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext(new SparkConf().setAppName("demo4").setMaster("local[*]"))
val linesRDD: RDD[String] = sparkContext.textFile("date/student.txt")
val studentRDD: RDD[(String, String)] = linesRDD.map(line => {
val fields: Array[String] = line.split("\\s+")
val Name: String = fields(3)
val Brithday: String = fields(4)
(Brithday,Name)
})
val dateRDD: RDD[((String, String), String)] = studentRDD.map(line => {
val brithday: String = line._1
val field: Array[String] = brithday.split("-")
val month: String = field(1)
val day: String = field(2)
((month, day), line._2)
})
val resRDD: RDD[((String, String), Iterable[String])] = dateRDD.groupByKey()
resRDD.foreach(println)
sparkContext.stop()
}
}
II. Top 3 scores in each class
1) Requirement:
A dataset has four fields: id, name, score, and stuclass; return the top 3 scores for each class.
- Data
ID name score stuclass
1 z1 90 1
2 z2 100 1
3 z3 70 2
4 z4 30 1
5 z5 200 2
6 z6 120 1
7 z7 90 1
8 z8 100 2
9 z9 80 1
10 z10 90 2
11 z11 10 1
- Expected result
(2,List((200,5,z5), (100,8,z8), (90,10,z10)))
(1,List((120,6,z6), (100,2,z2), (90,1,z1)))
2) Code, version 1
package com.zxy.spark.core.day06
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo5 {
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setAppName("demo5").setMaster("local[*]"))
val stuRDD: RDD[String] = sc.textFile("date/student.txt")
val mapRDD: RDD[(String,( String, String, String))] = stuRDD.map(line => {
val info: Array[String] = line.split("\\s+")
val id = info(0)
val name = info(1)
val score = info(2)
val stuclass = info(3)
(stuclass, (score, id, name))
})
val gbkRDD: RDD[(String, Iterable[(String, String, String)])] = mapRDD.groupByKey()
val value: RDD[(String, List[(String, String, String)])] = gbkRDD.mapValues(values => {
values.toList.sortWith(_._1 > _._1).take(3)
})
value.foreach(println)
sc.stop()
}
}
- Actual result
(2,List((90,10,z10), (70,3,z3), (200,5,z5)))
(1,List((90,1,z1), (90,7,z7), (80,9,z9)))
- Analysis
The result clearly does not match the expected output: the score field being compared is a String that was never converted to Int, so the scores are compared lexicographically instead of numerically.
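A quick sketch of why lexicographic String comparison misorders the scores:
// String comparison goes character by character
println("90" > "200") // true, because '9' > '2'
println("90" > "100") // true, for the same reason
println(90 > 200)     // false, numeric comparison behaves as expected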
3) Optimized code, version 2
package com.zxy.spark.core.day06
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo5 {
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setAppName("demo5").setMaster("local[*]"))
val stuRDD: RDD[String] = sc.textFile("date/student.txt")
val mapRDD: RDD[(String,( Int, String, String))] = stuRDD.map(line => {
val info: Array[String] = line.split("\\s+")
val id = info(0)
val name = info(1)
val score = info(2).toInt
val stuclass = info(3)
(stuclass, (score, id, name))
})
val gbkRDD: RDD[(String, Iterable[(Int, String, String)])] = mapRDD.groupByKey()
val value: RDD[(String, List[(Int, String, String)])] = gbkRDD.mapValues(values => {
values.toList.sortWith(_._1 > _._1).take(3)
})
value.foreach(println)
sc.stop()
}
}
- Result
(2,List((200,5,z5), (100,8,z8), (90,10,z10)))
(1,List((120,6,z6), (100,2,z2), (90,1,z1)))
III. Filter a specific record from a log (filter, startsWith)
1) Spark Core code
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object num4{
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("num4"))
val RDD: RDD[String] = sc.textFile("1.txt")
RDD.filter(line => {
val dates: Array[String] = line.split(" ")
val time: String = dates(3)
time.startsWith("15/03/2017")
}).foreach(println)
sc.stop()
}
}
2) Sample data
100.79.121.48 HIT 33 15/02/2017:00:00:46 +0800
111.19.97.15 HIT 18 15/02/2017:00:00:39 +0800
218.108.100.234 HIT 1 15/03/2017:00:00:57 +0800
3) Output
218.108.100.234 HIT 1 15/03/2017:00:00:57 +0800
IV. Extract a date component from log entries (dd/MM/yyyy:HH:mm:ss)
1) Spark Core code
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object num3 {
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("num3"))
val RDD: RDD[String] = sc.textFile("1.txt")
RDD.map(line => {
val dates: Array[String] = line.split(" ")
val time: String = dates(3)
val hour: String = getHoursV2(time)
(time,hour)
}).foreach(println)
sc.stop()
}
/*
* Variant 1: use the (deprecated) java.util.Date getters.
* Note: despite the name, both variants return the month ("MM") of the parsed date.
*/
def getHoursV1(date:String):String = {
val fm: SimpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
val dates: Date = fm.parse(date)
// getMonth is 0-based (January = 0) and deprecated
dates.getMonth.toString
}
/*
* Variant 2: re-format the parsed Date with a second SimpleDateFormat
*/
def getHoursV2(date:String):String = {
val fm: SimpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
val dates: Date = fm.parse(date)
val fms = new SimpleDateFormat("MM")
val time: String = fms.format(dates)
time
}
}
2) Data file
100.79.121.48 HIT 33 15/02/2017:00:00:46 +0800
111.19.97.15 HIT 18 15/02/2017:00:00:39 +0800
218.108.100.234 HIT 1 15/03/2017:00:00:57 +0800
3) Given a timestamp, get the hour
//Timestamp
//1516609143867
//Takes a timestamp string and returns the hour
def getHour(time_long : String): String = {
val fm = new SimpleDateFormat("HH")
val tim: String = fm.format(new Date(time_long.toLong))
tim
}
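A usage sketch; the exact hour printed depends on the JVM's default time zone:
val hour = getHour("1516609143867")
println(hour) // "16" in UTC+8, "08" in UTC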
V. Spark Core: ad ID statistics, TopN, and traffic counts
1. Ad ID statistics
- 1) Data format:
timestamp province city userid adid
(time, province, city, user, ad)
User ID range: 0-99
Province and city IDs: 0-9
adid: 0-19
e.g.
1516609143867 6 7 64 16
- 2) Requirements:
1. For each province, count the top 3 ad IDs by clicks
2. For each province and each hour, count the top 3 ad IDs (see the sketch after the code below)
- 3) Code
package com.qf.bigdata.spark.core.day5
import com.qf.bigdata.spark.core.day2.SparkUtils
import org.apache.spark.rdd.RDD
object Demo1 {
def main(args: Array[String]): Unit = {
val sc = SparkUtils.getDefaultSparkContext()
val logsRDD: RDD[String] = sc.textFile("file:///c://ftp/Advert.txt")
val arrsRDD: RDD[Array[String]] = logsRDD.map(_.split("\\s+"))
val proAdnAdvCntRDD: RDD[(String, Int)] = arrsRDD.map(arr => (arr(1) + "_" + arr(4),1))
val sumRDD: RDD[(String, Int)] = proAdnAdvCntRDD.reduceByKey(_ + _)
val sum2RDD: RDD[(String, (String, Int))] = sumRDD.map(t => {
val param = t._1.split("_")
(param(0), (param(1), t._2))
})
val gbkRDD: RDD[(String, Iterable[(String, Int)])] = sum2RDD.groupByKey()
val resRDD: RDD[(String, List[(String, Int)])] = gbkRDD.mapValues(values => {
values.toList.sortWith((x, y) => x._2 > y._2).take(3)
})
val map: collection.Map[String, List[(String, Int)]] = resRDD.collectAsMap()
println(map)
sc.stop()
}
}
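The code above covers requirement 1 only. Below is a minimal sketch for requirement 2 (top 3 ad IDs per province per hour), assuming the same Advert.txt layout with a millisecond timestamp in column 0; the variable names are illustrative, and java.text.SimpleDateFormat / java.util.Date need to be imported:
// continuing inside the same main(), reusing arrsRDD from above
val hourProAdvCntRDD: RDD[(String, Int)] = arrsRDD.map(arr => {
  val hour = new SimpleDateFormat("HH").format(new Date(arr(0).toLong)) // hour of day
  (arr(1) + "_" + hour + "_" + arr(4), 1)                               // province_hour_adid
})
val hourTop3RDD = hourProAdvCntRDD
  .reduceByKey(_ + _)
  .map(t => {
    val Array(province, hour, adid) = t._1.split("_")
    ((province, hour), (adid, t._2))
  })
  .groupByKey()
  .mapValues(_.toList.sortWith(_._2 > _._2).take(3))
println(hourTop3RDD.collect().toList)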
2. Top-N base station dwell time
- 1) Data format
19735E1C66.log stores the user log records
phone number, timestamp, base station ID, connection status (1 = connect, 0 = disconnect)
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
lac_info.txt stores the base station information
fields: base station ID, longitude, latitude
- 2) Requirements
Based on the log records a user generates, find the base station where the user stayed longest
Within a given area, find the top 2 base stations by dwell time among all the stations each user passed through
- 3) Approach
1. Load the user log records and split them
2. Compute the user's total dwell time at each base station (see the sign-trick sketch after this list)
3. Load the base station metadata
4. Join the longitude/latitude onto the user data
5. Take each user's top 2 base stations by dwell time
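A minimal sketch of the sign trick used in step 2: each connect event contributes its timestamp negatively, each disconnect event positively, so summing per (phone, station) yields the total dwell time. The timestamps below are made-up epoch milliseconds:
// connect at t1, disconnect at t2  =>  (-t1) + t2 = t2 - t1 = dwell time
val connectAt    = -1459038240000L  // eventType "1" (connect): timestamp negated
val disconnectAt =  1459038540000L  // eventType "0" (disconnect): timestamp kept as-is
println(connectAt + disconnectAt)   // 300000 ms = 5 minutes of dwell time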
- 4) Code
package com.zxy.bigdata.spark.core.day5
import com.zxy.bigdata.spark.core.day2.SparkUtils
import org.apache.spark.rdd.RDD
import java.text.SimpleDateFormat
object Demo2 {
def main(args: Array[String]): Unit = {
val sc = SparkUtils.getDefaultSparkContext()
//1. Load the user data
val filesRDD: RDD[String] = sc.textFile("file:///c://ftp/19735E1C66.log")
val userInfoRDD: RDD[((String, String), Long)] = filesRDD.map(line => {
val fields: Array[String] = line.split(",")
val phone = fields(0) // phone number
val time = str2Long(fields(1)) // timestamp in milliseconds
val lac = fields(2) // base station ID
val eventType = fields(3) // connection status
// connect events ("1") contribute a negative timestamp, disconnect events a positive one,
// so summing per (phone, station) yields the dwell time
val time_long = if (eventType.equals("1")) -time else time
((phone, lac), time_long) // ((phone, station ID), duration)
})
val sumRDD: RDD[((String, String), Long)] = userInfoRDD.reduceByKey(_ + _)
val lacAndPTRDD: RDD[(String, (String, Long))] = sumRDD.map(t => {
val phone = t._1._1
val lac = t._1._2
val time = t._2
(lac, (phone, time))
})
// Load the base station metadata
val lacInfoRDD: RDD[String] = sc.textFile("file:///c://ftp/lac_info.txt")
val lacAndXYRDD: RDD[(String, (String, String))] = lacInfoRDD.map(line => {
val fields: Array[String] = line.split(",")
val lac = fields(0)
val x = fields(1)
val y = fields(2)
(lac, (x, y))
})
val joinRDD: RDD[(String, ((String, Long), (String, String)))] = lacAndPTRDD join lacAndXYRDD
val phoneAndTXYRDD: RDD[(String, Long, (String, String))] = joinRDD.map(t => {
val phone = t._2._1._1
val time = t._2._1._2
val xy = t._2._2
(phone, time, xy)
})
val groupRDD: RDD[(String, Iterable[(String, Long, (String, String))])] = phoneAndTXYRDD.groupBy(_._1)
val sortRDD: RDD[(String, List[(String, Long, (String, String))])] = groupRDD.mapValues(_.toList.sortBy(_._2).reverse)
val resRDD: RDD[(String, List[(Long, (String, String))])] = sortRDD.map(t => {
val phone = t._1
val list = t._2
val filterlist: List[(Long, (String, String))] = list.map(tup => {
val time = tup._2
val xy = tup._3
(time, xy)
})
(phone, filterlist)
})
val ressRDD: RDD[(String, List[(Long, (String, String))])] = resRDD.mapValues(_.take(2))
println(ressRDD.collect().toList)
sc.stop()
}
def str2Long(date:String):Long = {
val format = new SimpleDateFormat("yyyyMMddHHmmss")
val time: Long = format.parse(date).getTime
time
}
}
3. Traffic counts by IP region
1) Note
This case ships with a SQL file; run it directly in your database client to create a table. That table is used to store the computed results and can later be read back with a JdbcRDD (a read-back sketch follows).
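For later reference, a minimal JdbcRDD read-back sketch, assuming the location_info table created by that SQL file and the same connection settings used in res2mysql below; the query bounds and partition count are illustrative:
import java.sql.DriverManager
import org.apache.spark.SparkContext
import org.apache.spark.rdd.JdbcRDD

def readLocationInfo(sc: SparkContext): Unit = {
  val jdbcUrl = "jdbc:mysql://146.56.208.76:3307/sqoop?useUnicode=true&characterEncoding=utf8"
  val rdd = new JdbcRDD(
    sc,
    () => DriverManager.getConnection(jdbcUrl, "root", "123456"),
    // JdbcRDD requires exactly two '?' placeholders for the partition bounds
    "select location, counts from location_info where counts >= ? and counts <= ?",
    0, Int.MaxValue, 2,
    rs => (rs.getString(1), rs.getString(2))
  )
  rdd.collect().foreach(println)
}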
2) Code
package com.zxy.bigdata.spark.core.day5
import com.zxy.bigdata.spark.core.day2.SparkUtils
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import java.sql.{Connection, Date, DriverManager, PreparedStatement}
object Demo3 {
def main(args: Array[String]): Unit = {
//1. Load the data and broadcast the IP range table
val sc = SparkUtils.getDefaultSparkContext()
val httpLogRDD: RDD[String] = sc.textFile("file:///c://ftp/http.log")
val ipRDD: RDD[String] = sc.textFile("file:///c://ftp/ip.txt")
val province2IpRDD: RDD[(String, Long, Long)] = ipRDD.map(line => {
val fields: Array[String] = line.split("\\|")
val startIp = ip2long(fields(0))
val endIp = ip2long(fields(1))
val province = fields(6)
(province, startIp, endIp)
})
val ipArrs: Array[(String, Long, Long)] = province2IpRDD.collect().sortWith {
case ((pro1, sIp1, eIp1), (pro2, sIp2, eIp2)) => {
sIp1 < sIp2
}
}
val provinceIpBC: Broadcast[Array[(String, Long, Long)]] = sc.broadcast(ipArrs)
//2. Process the user data
val province2CountRDD: RDD[(String, Int)] = httpLogRDD.map(line => {
val fields = line.split("\\|")
val ip: Long = ip2long(fields(1))
val ipArr: Array[(String, Long, Long)] = provinceIpBC.value
val index = binarySearch(ip, ipArr)
if (index < 0) (null, -1)
else (ipArr(index)._1, 1)
}).filter(_._1 != null).reduceByKey(_ + _)
//3. Save the results to the database
res2mysql(province2CountRDD)
sc.stop()
}
/**
* Convert an IP address string to its Long representation
*
*/
def ip2long(ip:String): Long = {
val fields: Array[String] = ip.split("\\.")
var ipNum = 0L
fields.foreach(field => ipNum = field.toLong | ipNum << 8)
ipNum
}
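// e.g. ip2long("1.0.0.1") == 16777217, i.e. 1*256^3 + 0*256^2 + 0*256 + 1
// (each octet is shifted in 8 bits at a time)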
/**
* Binary-search ipArr for the range containing ip; return -1 if it is not found
*/
def binarySearch(ip:Long, ipArr:Array[(String, Long, Long)]) : Int = {
var start = 0
var end = ipArr.length - 1
while(start <= end) {
val mid = (start + end) / 2
val startIp = ipArr(mid)._2
val endIp = ipArr(mid)._3
if (ip >= startIp && ip <= endIp) {
return mid
} else if (ip < startIp) {
end = mid - 1
} else {
start = mid + 1
}
}
return -1
}
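// note: binarySearch assumes ipArr is sorted by start IP, which is why the driver
// sorts the collected ranges with sortWith before broadcasting them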
/**
* Save the RDD into the MySQL database
*/
def res2mysql(rdd:RDD[(String, Int)]) : Unit = {
rdd.foreachPartition(itertor => {
var con:Connection = null
var ps:PreparedStatement = null
val jdbcUrl = "jdbc:mysql://146.56.208.76:3307/sqoop?useUnicode=true&characterEncoding=utf8"
val user = "root"
val pass = "123456"
con = DriverManager.getConnection(jdbcUrl, user, pass)
val sql = s"insert into `location_info` (location, counts, access_date) values (?, ?, ?)"
ps = con.prepareStatement(sql)
itertor.foreach {
case (province, count) => {
ps.setString(1, province)
ps.setString(2, count+"")
ps.setDate(3, new Date(System.currentTimeMillis()))
ps.addBatch()
}
}
ps.executeBatch()
ps.close()
con.close()
})
}
}
VI. Spark -> WordCount
1. The classic project: WordCount
WordCount, approach 1
Written in the style of plain Scala collections
package com.zxy.SparkCore
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount{
def main(args: Array[String]): Unit = {
//Connect to the Spark framework
val wordCount: SparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
val context: SparkContext = new SparkContext(wordCount)
//Read the data from the specified directory
val lines: RDD[String] = context.textFile("spark-core\\dates")
//Split the data
val words: RDD[String] = lines.flatMap(_.split("\\s+"))
//Group the data
val map: RDD[(String, Iterable[String])] = words.groupBy(word => word)
//Reshape the data
val WordToCount: RDD[(String, Int)] = map.map {
case (word, list) => (word, list.size)
}
//Collect the data
val array: Array[(String, Int)] = WordToCount.collect()
//Print the data
array.foreach(println)
//Close the connection
context.stop()
}
}
WordCount, approach 1 (condensed)
package com.zxy.SparkCore
import org.apache.spark.{SparkConf, SparkContext}
object WordCount{
def main(args: Array[String]): Unit = {
//Connect to the Spark framework
val wordCount: SparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
val context: SparkContext = new SparkContext(wordCount)
//Functional style: chain all the steps in one line
context.textFile("spark-core\\dates").flatMap(_.split("\\s+")).groupBy(word => word).map(kv => (kv._1,kv._2.size)).collect().foreach(println)
//Close the connection
context.stop()
}
}
WordCount, approach 2
Uses Spark's own reduceByKey method
package com.zxy.SparkCore
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount2{
def main(args: Array[String]): Unit = {
//Connect to the Spark framework
val wordCount: SparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
val context: SparkContext = new SparkContext(wordCount)
//Read the data from the specified directory
val lines: RDD[String] = context.textFile("spark-core\\dates")
//Split the data
val words: RDD[String] = lines.flatMap(_.split("\\s+"))
//Map each word to a (word, 1) pair
val WordToOne: RDD[(String, Int)] = words.map(
word => (word, 1)
)
//Spark method that combines grouping and aggregation in one step
//reduceByKey: for identical keys, reduce the values together
val WordToCount: RDD[(String, Int)] = WordToOne.reduceByKey(_ + _)
//Collect the data
val array: Array[(String, Int)] = WordToCount.collect()
//Print the data
array.foreach(println)
//Close the connection
context.stop()
}
}
WordCount, approach 2 (condensed)
package com.zxy.SparkCore
import org.apache.spark.{SparkConf, SparkContext}
object WordCount4{
def main(args: Array[String]): Unit = {
//Connect to the Spark framework
val wordCount: SparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
val context: SparkContext = new SparkContext(wordCount)
context.textFile("spark-core\\dates").flatMap(_.split("\\s+")).map(word => (word,1)).reduceByKey(_ + _).collect().foreach(println)
//Close the connection
context.stop()
}
}
Console output (screenshot omitted)
2. Maven POM file
I am using Scala 2.11.8 and Spark 2.4.7 here.
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.7</version>
</dependency>
</dependencies>
VII. Spark Streaming -> WordCount
1. Consumer
package com.zxy.spark.Streaming
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
object wordCount {
def main(args: Array[String]): Unit = {
// Create the configuration object; local[*] means use as many cores as are available
val conf: SparkConf = new SparkConf().setAppName("WordCount").setMaster("local[*]")
// Create the Spark Streaming context
val scc = new StreamingContext(conf, Seconds(3))
// Get the data source from a socket; port 9999 is the port the producer on the Linux side sends to
val socketDS: ReceiverInputDStream[String] = scc.socketTextStream("192.168.130.110", 9999)
val reduceDS: DStream[(String, Int)] = socketDS.flatMap(_.split(" ")).map((_,1)).reduceByKey(_ + _)
/**
* The line above, step by step:
*
* // Flatten the input, splitting each line on spaces
* val flatMapDS: DStream[String] = socketDS.flatMap(_.split(" "))
*
* // Reshape each word into a (word, 1) pair
* val mapDS: DStream[(String, Int)] = flatMapDS.map((_, 1))
*
* // Aggregate the pairs by key
* val reduceDS: DStream[(String, Int)] = mapDS.reduceByKey(_ + _)
*/
// Print the result; note: this calls the DStream print function
reduceDS.print()
// Start the receiver
scc.start()
// By default the context must not be stopped here
// scc.stop()
// Wait for the receiver to terminate, then shut down the context
scc.awaitTermination()
}
}
2. POM dependency
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>1.1.1</version>
</dependency>
tips:
If you add <scope>provided</scope> to this dependency, running it locally fails with
NoClassDefFoundError: org/apache/spark/streaming/StreamingContext
Removing that line fixes it.
3. Producer
There are three common tools for sending data over a port:
nc, nmap, telnet
[root@hadoop ~]# yum install -y nc
[root@hadoop ~]# nc -lk 9999
Now you can send data from this terminal; the consumer receives it and runs the word count. The consumer emits a batch every 3 seconds whether or not anything was sent.
Time: 1624336719000 ms
(hive,1)
(word,1)
(hello,4)
(java,1)
(spark,1)
VIII. Read the data below from a given Kafka topic, clean it by dropping the useless fields and keeping the useful information, then write the cleaned result back to another Kafka topic
1) Data format
<<<!>>>3111<<<!>>>
<<<!>>>238<<<!>>>
<<<!>>>20181111132902<<<!>>>
<<<!>>>58.223.1.112<<<!>>>
<<<!>>>202.102.92.18<<<!>>>
<<<!>>>59947<<<!>>>
<<<!>>>80<<<!>>>
<<<!>>>www.sumecjob.com<<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>
<<<!>>>2556928066<<<!>>>
2) Requirements
Read the data above from the given Kafka topic and clean it: drop the useless fields and keep the useful ones, namely userid, the user's operation time (timestamp), user ip:port, server ip:port, and the url. Finally, write the cleaned result back to another Kafka topic, completing the online ETL.
1. Replace every <<<!>>> delimiter with the empty string
2. Convert every date to yyyy-MM-dd
3. Store the Kafka offsets manually in MySQL (see the table sketch after this list)
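A minimal sketch of the mytopic offsets table the code below expects. The column names come from the SQL in getfromoffset and the update statement further down; the column types and the seed row are assumptions:
import java.sql.DriverManager
// one-off setup against the same zxy database used by getfromoffset below
Class.forName("com.mysql.jdbc.Driver")
val con = DriverManager.getConnection("jdbc:mysql://localhost:3306/zxy?useSSL=false", "root", "root")
con.createStatement().execute(
  """create table if not exists mytopic (
    |  topic varchar(64),
    |  partitionId int,
    |  `offset` bigint,
    |  primary key (topic, partitionId)
    |)""".stripMargin)
// seed one row per partition of OldTopic; here: partition 0, starting at offset 0
con.createStatement().execute("insert into mytopic values ('OldTopic', 0, 0)")
con.close()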
3) Code
package com.zxy.spark.Streaming.day004
import java.util.Properties
import com.zxy.spark.core.Day02.LoggerTrait
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
import scalikejdbc.{ConnectionPool, DB, _}
object demo5 extends LoggerTrait{
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("demo5")
.setMaster("local[*]")
val duration = Seconds(2)
val ssc = new StreamingContext(sparkConf, duration)
val kafkaparams = Map[String,String](
"bootstrap.servers" -> "192.168.130.110:9092",
"group.id" -> "zxy",
"auto.offset.reset" -> "largest"
)
val topics: Set[String] = "OldTopic".split(",").toSet
val messageHandler = (msgHandler:MessageAndMetadata[String,String]) => (msgHandler.topic,msgHandler.message())
println(s"getfromoffset->${getfromoffset()}")
val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc,
kafkaparams,
getfromoffset(),
messageHandler
)
messages.foreachRDD(rdd => {
if(!rdd.isEmpty()){
rdd.foreachPartition(partitionIterator => {
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val offsetRange: OffsetRange = offsetRanges(TaskContext.getPartitionId())
DB.localTx {
implicit session => {
partitionIterator.foreach(msg => {
//Clean the record
println(s"message->${msg._2}")
val resDate: (String, String, String, String) = dateETL(msg._2)
println(s"resDate -> $resDate")
//Send the cleaned record to the new topic
saveDate2Topics(resDate)
})
val offsetSql = sql"update mytopic set offset = ${offsetRange.untilOffset} where topic = ${offsetRange.topic} and partitionId = ${offsetRange.partition}".update().apply()
}
}
})
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Fetch the stored offsets from the database
* @return
*/
def getfromoffset(): Map[TopicAndPartition, Long]={
val url = "jdbc:mysql://localhost:3306/zxy?useSSL=false"
val user = "root"
val pass = "root"
val driver = "com.mysql.jdbc.Driver"
Class.forName(driver)
ConnectionPool.singleton(url,user,pass)
val fromOffsets: Map[TopicAndPartition, Long] = DB.readOnly {
implicit session =>
sql"select topic,partitionId,offset from mytopic"
.map {
result => TopicAndPartition(result.string(1), result.int(2)) -> result.long(3)
}.list().apply().toMap
}
fromOffsets
}
/**
* Clean one record
*/
def dateETL(date: String):(String,String,String,String) ={
val lines: Array[String] = date.split("<<<!>>>")
val timestamp = lines(5)
val time: String = getFormatTime(timestamp)
(lines(2),time,lines(7)+":"+lines(11),lines(27))
}
/**
* Format the timestamp as yyyy-MM-dd
*/
def getFormatTime(timestamp : String): String = {
val year = timestamp.substring(0, 4)
val month = timestamp.substring(4, 6)
val day = timestamp.substring(6, 8)
val ymd = s"${year}-${month}-${day}"
ymd
}
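// e.g. getFormatTime("20181111132902") returns "2018-11-11" for the sample record above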
/**
* Produce the cleaned record to the new topic (NewTopic)
* Note: a new KafkaProducer is created for every record, which is fine for a demo but not for production
*/
def saveDate2Topics(newDate: (String, String, String, String)): Unit ={
val properties = new Properties()
properties.load(demo5.getClass.getClassLoader.getResourceAsStream("producer.properties"))
val producer = new KafkaProducer[String,String](properties)
val userID: String = newDate._1
val value = s"${newDate._2},${newDate._3},${newDate._4}"
val data = new ProducerRecord[String,String]("NewTopic",userID,value)
producer.send(data)
producer.close()
}
}