1. Use Spark to deduplicate words
Data in text02.txt:
java php hello word
phpp hi exe java
python hello kitty
php happy abc java
import org.apache.spark.{SparkConf, SparkContext}

object WordDistinct {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordDistinct").setMaster("local")
    val sc = new SparkContext(conf)

    // Read the input file; Spark file URIs use forward slashes, even on Windows
    val lines = sc.textFile("file:///C:/Users/Administrator/Desktop/text02.txt")
    // Split each line on spaces to get individual words
    val words = lines.flatMap(line => line.split(" "))
    // distinct() drops duplicate words across the whole dataset
    val uniqueWords = words.distinct()
    uniqueWords.collect().foreach(println)

    sc.stop()
  }
}
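For the data above this prints the eleven distinct words (java, php, hello, word, phpp, hi, exe, python, kitty, happy, abc) in no guaranteed order. For reference, distinct() is roughly a map/reduceByKey round trip under the hood; a minimal hand-rolled sketch of the same deduplication, assuming the words RDD from the program above:

// Hand-rolled deduplication: pair each word with a dummy value,
// collapse duplicate keys, then drop the dummy value again
val uniqueByHand = words.map(w => (w, null))
  .reduceByKey((x, _) => x)
  .map(_._1)
uniqueByHand.collect().foreach(println)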
2. Use Spark to total the traffic of numbers starting with 133, 136, and 139
13326293050 81
13626594101 50
13326435696 30
13926265119 40
13326564191 2106
13626544191 1432
13919199499 300
import org.apache.spark.{SparkConf, SparkContext}

object TrafficCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TrafficCount").setMaster("local")
    val sc = new SparkContext(conf)

    // (phone number, traffic) records from the table above
    val data = sc.parallelize(List(
      ("13326293050", 81),
      ("13626594101", 50),
      ("13326435696", 30),
      ("13926265119", 40),
      ("13326564191", 2106),
      ("13626544191", 1432),
      ("13919199499", 300)
    ))

    // Keep only the 133 / 136 / 139 prefixes, then sum their traffic
    val traffic = data
      .filter { case (number, _) =>
        number.startsWith("133") || number.startsWith("136") || number.startsWith("139")
      }
      .map { case (_, value) => value }
      .sum() // sum() returns a Double

    println(traffic)

    sc.stop()
  }
}
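All seven records above match the filter, so this prints 4039.0. If the exercise is read as wanting a separate total per prefix rather than one combined figure, a small variant keyed on the first three digits does it; a sketch, assuming the data RDD from the program above:

// Per-prefix totals: key each record by its first three digits, then add up values
val perPrefix = data
  .filter { case (number, _) => Set("133", "136", "139").contains(number.take(3)) }
  .map { case (number, value) => (number.take(3), value) }
  .reduceByKey(_ + _)
perPrefix.collect().foreach(println) // (133,2217), (136,1482), (139,340), in no guaranteed order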
3. Count words composed of the same letters
Data in text01.txt:
abc acb java
avaj bac
cba abc
jvaa php hpp
pph python thonpy
import org.apache.spark.{SparkConf, SparkContext}

object AnagramCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AnagramCount").setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("file:///C:/Users/Administrator/Desktop/text01.txt")
    val words = lines.flatMap(line => line.split(" "))
    // Words made of the same letters share a key once their characters
    // are lower-cased and sorted, e.g. "abc", "acb", "bac" all map to "abc"
    val anagrams = words.map(word => (word.toLowerCase.sorted, 1))
      .reduceByKey(_ + _)
    anagrams.collect().foreach(println)

    sc.stop()
  }
}
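Counting yields one number per anagram group, e.g. (abc,5) and (aajv,3). To see which words actually share letters, a groupByKey variant keeps the words themselves; a sketch, assuming the words RDD from the program above:

// Keep the original words under each sorted-letter key instead of just counting them
val groups = words.map(word => (word.toLowerCase.sorted, word))
  .groupByKey()
  .mapValues(_.toList)
groups.collect().foreach(println) // e.g. (abc,List(abc, acb, bac, cba, abc))

groupByKey shuffles every word across the cluster, which is harmless at this size but worth replacing with aggregateByKey on large inputs.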