import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Establish the connection to the Spark framework
    val sparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparkConf)
    // Business logic
    // 1. Read the file and get the data one line at a time
    val lines = sc.textFile("datas")
    // 2. Split each line into individual words
    val words = lines.flatMap(_.split(" "))
    // 3. Group the data by word
    val wordLists = words.groupBy(word => word)
    // 4. Transform the grouped data into (word, count) pairs
    val wordToCount = wordLists.map {
      case (word, list) => (word, list.size)
    }
    // 5. Collect the transformed result and print it
    val result = wordToCount.collect()
    result.foreach(println)
    // Close the connection
    sc.stop()
  }
}
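The grouping logic here does not depend on Spark: groupBy followed by taking each group's size works the same way on a plain Scala collection. Below is a minimal local sketch of that step, with made-up sample words, runnable without a cluster.

// Minimal local sketch (no Spark): the same groupBy-then-size counting on a Scala List.
// The sample words are made up for illustration.
object LocalGroupByCount {
  def main(args: Array[String]): Unit = {
    val words = List("hello", "spark", "hello", "scala")
    val wordLists: Map[String, List[String]] = words.groupBy(word => word)
    val wordToCount: Map[String, Int] = wordLists.map { case (word, list) => (word, list.size) }
    wordToCount.foreach(println) // prints (hello,2), (spark,1), (scala,1) in some order
  }
}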
Second approach: map each word to (word, 1), then group and reduce manually
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Establish the connection to the Spark framework
    val sparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparkConf)
    // 1. Read the file and get the data one line at a time
    val lines = sc.textFile("datas")
    // 2. Split each line into individual words
    val words = lines.flatMap(_.split(" "))
    // 3. Map each word to a (word, 1) pair, then group the pairs by word
    val wordToOne = words.map(word => (word, 1))
    val wordLists = wordToOne.groupBy(t => t._1)
    // 4. Transform the grouped data: reduce each group by summing the counts
    val wordToCount = wordLists.map {
      case (word, list) =>
        list.reduce((t1, t2) => (t1._1, t1._2 + t2._2))
    }
    // 5. Collect the transformed result and print it
    val result = wordToCount.collect()
    result.foreach(println)
    // Close the connection
    sc.stop()
  }
}
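The reduce in step 4 keeps the word from the first tuple and adds up the counts. A minimal local sketch of that reduce on a plain Scala list (sample data made up):

// Minimal local sketch of the step-4 reduce over (word, 1) pairs; sample data is made up.
val pairs = List(("hello", 1), ("hello", 1), ("hello", 1))
val summed = pairs.reduce((t1, t2) => (t1._1, t1._2 + t2._2))
println(summed) // prints (hello,3)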
Third approach: use Spark's built-in method (reduceByKey)
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Establish the connection to the Spark framework
    val sparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparkConf)
    // 1. Read the file and get the data one line at a time
    val lines = sc.textFile("datas")
    // 2. Split each line into individual words
    val words = lines.flatMap(_.split(" "))
    // 3. Map each word to a (word, 1) pair
    val wordToOne = words.map(word => (word, 1))
    // 4. Aggregate with Spark's own method.
    //    reduceByKey: values that share the same key are combined with the given reduce function.
    val wordToCount = wordToOne.reduceByKey(_ + _)
    // 5. Collect the transformed result and print it
    val result = wordToCount.collect()
    result.foreach(println)
    // Close the connection
    sc.stop()
  }
}
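To see reduceByKey's behavior in isolation, here is a small sketch on an in-memory RDD, assuming an active SparkContext named sc; the sample pairs are made up for illustration.

// Small sketch of reduceByKey semantics (assumes an active SparkContext sc; sample data made up).
val samplePairs = sc.parallelize(Seq(("hello", 1), ("spark", 1), ("hello", 1)))
val sampleCounts = samplePairs.reduceByKey(_ + _)
sampleCounts.collect().foreach(println) // prints (hello,2) and (spark,1); order may vary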
Teaching version:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Create the Spark configuration object
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
// Create the Spark context (the connection object)
val sc: SparkContext = new SparkContext(sparkConf)
// Read the file data
val fileRDD: RDD[String] = sc.textFile("input/word.txt")
// Split the file contents into words
val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
// Change the data structure: word => (word, 1)
val word2OneRDD: RDD[(String, Int)] = wordRDD.map((_, 1))
// Group and aggregate the pairs that share the same word
val word2CountRDD: RDD[(String, Int)] = word2OneRDD.reduceByKey(_ + _)
// Collect the aggregated result into driver memory
val word2Count: Array[(String, Int)] = word2CountRDD.collect()
// Print the result
word2Count.foreach(println)
// Close the Spark connection
sc.stop()
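To compile and run these examples, the project only needs the spark-core dependency. Below is a minimal build.sbt sketch; the Scala and Spark version numbers are assumptions and should be matched to your environment.

// Minimal build.sbt sketch; the version numbers are assumptions, adjust to your setup.
ThisBuild / scalaVersion := "2.12.15"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.3"

The log4j.properties below is typically placed under src/main/resources so it ends up on the classpath; it raises the log level to ERROR and quiets Spark's console output.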
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# Set the default spark-shell log level to ERROR. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=ERROR
# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=ERROR
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent
# UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR