Spark 代码要在集群上运行，需要先在 IDEA 中打成 jar 包。
1. 打包前需要把一些仅用于本地调试的代码删除或修改掉（见下方代码中的注释）。
package org.example
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  /**
   * Word-count driver intended to be packaged as a jar and run on a cluster
   * via spark-submit.
   *
   * Usage: WordCount [inputPath] [outputPath]
   *
   * Both paths are optional; when omitted they fall back to the original
   * hard-coded HDFS addresses, so existing submit scripts keep working.
   */
  def main(args: Array[String]): Unit = {
    // No .setMaster here on purpose: the master is supplied by spark-submit
    // on the cluster (local[*] is only for running inside the IDE).
    val conf = new SparkConf().setAppName("SparkRDDWordCount")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")

    // Cluster addresses can now be overridden from the command line;
    // defaults preserve the original behavior.
    val inputPath  = args.lift(0).getOrElse("hdfs://node01:8020/input")
    val outputPath = args.lift(1).getOrElse("hdfs://node01:8020/output/wordcountoutput")

    val linesRDD = sc.textFile(inputPath)
    // Split each line on single spaces into individual words.
    val wordsRDD = linesRDD.flatMap(_.split(" "))
    // (word, 1) pairs, summed per key, then sorted by count descending.
    val pairsRDD = wordsRDD.map((_, 1))
    val wordCountsRDD = pairsRDD.reduceByKey(_ + _)
    val wordCountsSortRDD = wordCountsRDD.sortBy(_._2, ascending = false)

    wordCountsSortRDD.saveAsTextFile(outputPath)
    sc.stop()
  }
}
3.