1、读取HDFS文件
scala> val lines=sc.textFile("hdfs://master-ubuntu:9000/wordcount/wordcount.txt")
2、压扁文件内容,以空格分割
scala> val rdd1=lines.flatMap(line=>line.split(" "))
3、映射为(word,1)格式
scala> val rdd2=rdd1.map(word=>(word,1))
4、对相同key的value累加合并
scala> val rdd3=rdd2.reduceByKey(_+_)
IDEA开发代码如下
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Word count over a text file stored in HDFS.
 *
 * Reads `hdfs://master-ubuntu:9000/wordcount/wordcount.txt`, splits each line
 * on single spaces, counts occurrences of each word with `reduceByKey`, and
 * writes the `(word, count)` pairs to
 * `hdfs://master-ubuntu:9000/wordcount/output`.
 *
 * NOTE(review): the input/output paths and `local[*]` master are hard-coded
 * for this tutorial; a production job would take them from `args`.
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    // Hadoop reads this property when resolving the client user identity,
    // so it must be set BEFORE the SparkContext (and any HDFS access) exists.
    System.setProperty("HADOOP_USER_NAME", "root")

    val conf = new SparkConf().setAppName("wordcount").setMaster("local[*]")
    val sc = new SparkContext(conf)

    try {
      val lines = sc.textFile("hdfs://master-ubuntu:9000/wordcount/wordcount.txt")
      // Flatten each line into its space-separated tokens.
      val words = lines.flatMap(_.split(" "))
      // Pair every word with an initial count of 1.
      val pairs = words.map(word => (word, 1))
      // Sum the counts for identical words.
      val counts = pairs.reduceByKey(_ + _)
      counts.saveAsTextFile("hdfs://master-ubuntu:9000/wordcount/output")
    } finally {
      // Always release the SparkContext, even if the job throws —
      // the original version leaked it on any failure.
      sc.stop()
    }
  }
}