1.准备数据
spark,java,#,!
spark,java
spark,python,%
hello,java
java,!,%
2.需求:统计每个单词出现的次数,以及特殊字符出现的总数
3.代码展示和注释详解
下面的代码用累加器统计特殊字符的总数,并用广播变量把特殊字符规则分发给各个 Task:
// Word count that skips "special character" tokens and tallies how many of
// them were skipped. The rule list is shipped to tasks as a broadcast variable
// and the tally is kept in a LongAccumulator.

// Declare a named long accumulator for the number of special characters seen.
val mycounter: LongAccumulator = sc.longAccumulator("mycounter")
// Rules: the tokens treated as special characters instead of words.
// Note: lines are split on "," below, so a "," token is never produced and
// that entry can never match.
val ruleList: List[String] = List(",",".","!","#","$","%")
// Broadcast the rule list; tasks read it through broadcast.value rather than
// capturing ruleList in the closure.
val broadcast: Broadcast[List[String]] = sc.broadcast(ruleList)
val lines: RDD[String] = sc.textFile("in/wc")
// (token, occurrences) for every comma-separated token that is not a rule entry.
val wordresult: RDD[(String, Int)] = lines
  .filter(StringUtils.isNotEmpty(_))
  .flatMap(_.trim.split(","))
  .filter { ch =>
    val isSpecial = broadcast.value.contains(ch)
    // The accumulator is updated inside a transformation. Spark only guarantees
    // exactly-once accumulator updates inside actions, so re-executed tasks or
    // re-evaluating this lineage may inflate the count.
    if (isSpecial) mycounter.add(1)
    !isSpecial
  }
  .map((_, 1))
  .reduceByKey(_ + _)
// Collect to the driver before printing: RDD.foreach(println) runs on the
// executors, so on a cluster its output would not appear on the driver console.
// Assumes the word-count result fits in driver memory — confirm for large inputs.
wordresult.collect().foreach(println)
// Read the accumulator only after the action above has run; before that the
// lazy pipeline has not executed and the value would still be 0.
val chREsult: lang.Long = mycounter.value
println(s"特殊字符数量$chREsult")