WordCount in Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ScalaWordCount {
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      println("Usage: cn.edu360.spark31.day01.ScalaWordCount <input> <output>")
      sys.exit(1)
    }
    val Array(input, output) = args
    // master and app name are supplied by spark-submit, so the conf is left empty here
    val conf: SparkConf = new SparkConf()
    val sc: SparkContext = new SparkContext(conf)
    // read the input, split lines into words, and count each word
    val files: RDD[String] = sc.textFile(input)
    val splitedData: RDD[String] = files.flatMap(_.split(" "))
    val wordsAndOne: RDD[(String, Int)] = splitedData.map((_, 1))
    val result: RDD[(String, Int)] = wordsAndOne.reduceByKey(_ + _)
    // sort by count in descending order and write out the result
    val sorted = result.sortBy(-_._2)
    sorted.saveAsTextFile(output)
    sc.stop()
  }
}
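A note on the empty SparkConf: the master URL and app name are expected to come from spark-submit. For a quick test run from an IDE they can be set on the conf directly, as the later examples in these notes do (a minimal sketch):

val conf: SparkConf = new SparkConf()
  .setAppName("ScalaWordCount")
  .setMaster("local[*]") // use all local cores instead of a cluster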
WordCount in Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

public class JavaWordCount {
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("Usage: cn.edu360.JavaWordCount <input> <output>");
            System.exit(1);
        }
        SparkConf conf = new SparkConf();
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> lines = jsc.textFile(args[0]);
        // split each line into words
        JavaRDD<String> flatData = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });
        // pair every word with the count 1
        JavaPairRDD<String, Integer> wordAndOne = flatData.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        // sum the counts per word
        JavaPairRDD<String, Integer> result = wordAndOne.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // the Java pair RDD API sorts by key, so swap to (count, word), sort descending, then swap back
        JavaPairRDD<Integer, String> beforeSort = result.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> tp) throws Exception {
                return tp.swap();
            }
        });
        JavaPairRDD<Integer, String> sortedData = beforeSort.sortByKey(false);
        JavaPairRDD<String, Integer> finalRes = sortedData.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> tp) throws Exception {
                return tp.swap();
            }
        });
        finalRes.saveAsTextFile(args[1]);
        jsc.stop();
    }
}
WordCount in Java (lambda syntax)
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class JavaLambdaWordCount {
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("Usage: cn.edu360.spark31.day02.JavaLambdaWordCount <input> <output>");
            System.exit(1);
        }
        SparkConf conf = new SparkConf();
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> lines = jsc.textFile(args[0]);
        // the same pipeline as JavaWordCount, with the anonymous classes replaced by lambdas
        JavaRDD<String> flatData = lines.flatMap(t -> Arrays.asList(t.split(" ")).iterator());
        JavaPairRDD<String, Integer> wordAndOne = flatData.mapToPair(word -> new Tuple2<>(word, 1));
        JavaPairRDD<String, Integer> result = wordAndOne.reduceByKey((a, b) -> a + b);
        // swap to (count, word), sort descending by key, then swap back
        JavaPairRDD<Integer, String> beforeSort = result.mapToPair(tp -> tp.swap());
        JavaPairRDD<Integer, String> sortedData = beforeSort.sortByKey(false);
        JavaPairRDD<String, Integer> finalRes = sortedData.mapToPair(tp -> tp.swap());
        finalRes.saveAsTextFile(args[1]);
        jsc.stop();
    }
}
WordCount with Spark SQL DataFrames
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameWC {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext: SQLContext = new SQLContext(sc)
    // implicit conversions that enable .toDF on RDDs
    import sqlContext.implicits._
    val words: RDD[String] = sc.textFile("wc.txt")
    val wordsRdd: RDD[String] = words.flatMap(_.split(" "))
    // wrap each word in a case class so toDF can infer the schema
    val worddf: DataFrame = wordsRdd.map(Word(_)).toDF()
    // registerTempTable is the SQLContext-era API; newer code uses createOrReplaceTempView
    worddf.registerTempTable("t_word")
    val result: DataFrame = sqlContext.sql("select name, count(*) as cnts from t_word group by name order by cnts desc")
    result.show()
    sc.stop()
  }
}

case class Word(name: String)
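For comparison, the same aggregation can also be written with the DataFrame DSL instead of SQL. A minimal sketch, assuming the worddf DataFrame built above and the sqlContext.implicits._ import already in scope:

import org.apache.spark.sql.functions._

// group by word, count occurrences, and sort descending -- equivalent to the SQL above
worddf.groupBy($"name")
  .agg(count("*") as "cnts")
  .orderBy($"cnts".desc)
  .show()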
WordCount with Dataset SQL syntax
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DataSetWC {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    import session.implicits._
    // read.textFile returns a Dataset[String] with a single column named "value"
    val file: Dataset[String] = session.read.textFile("wc.txt")
    val wDs: Dataset[String] = file.flatMap(_.split(" "))
    wDs.printSchema()
    wDs.schema // the same schema is also available programmatically as a StructType
    wDs.createTempView("v_word")
    // the default column name "value" is what the SQL below groups on
    val result: DataFrame = session.sql("select value, count(*) cnts from v_word group by value order by cnts desc")
    result.show()
    session.stop()
  }
}
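The same count can also be expressed with the typed Dataset API instead of a temp view and SQL. A minimal sketch, assuming the wDs Dataset and the session.implicits._ import from the code above:

// group by the word itself, count per key, then rename the columns and sort
val counted: Dataset[(String, Long)] = wDs.groupByKey(word => word).count()
counted.toDF("word", "cnts")
  .orderBy($"cnts".desc)
  .show()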
Dataset DSL syntax demo
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DatasetDSLDemo {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    // import the implicit conversions defined on this SparkSession instance
    import session.implicits._
    // the SparkSession read API returns a Dataset
    val file: Dataset[String] = session.read.textFile("person.txt")
    val pdf: DataFrame = file.map(_.split(" ")).map(t => (t(0), t(1).toInt, t(2).toInt)).toDF("name", "age", "fv")

    // DSL operations: select, where, group by, order by, count, sum, max, min
    // projection
    pdf.select("name", "age")
    // .show()
    // pdf.select(pdf.col("name"), pdf.col("age")).show()
    // pdf.select(pdf("name"), pdf("age")).show()

    // where / filter: where simply delegates to the filter API
    // pdf.where("fv > 93")
    pdf.filter("fv > 93") // .show()

    // sorting: orderBy delegates to sort
    pdf.orderBy("age").show()
    // sorting on multiple columns
    pdf.sort($"age", $"fv".desc) // .show()

    // grouping followed by an aggregation: sum, max, min, avg, count
    pdf.groupBy($"age").sum("fv") // .show()
    val result1: DataFrame = pdf.groupBy($"age").count()
    // result1.show()

    // agg(...) needs the aggregate functions to be imported
    import org.apache.spark.sql.functions._
    // aliasing an aggregate column
    pdf.groupBy($"age").agg(count("*") as "cnts") // .show()
    pdf.groupBy($"age").agg(max("fv")) // .show()

    // renaming a column
    pdf.withColumnRenamed("age", "myage").show()

    session.stop()
  }
}
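The demo above exercises the DSL on person.txt rather than counting words. For completeness, a WordCount written purely in the DSL might look like the following sketch, assuming the same SparkSession, the implicits and functions imports, and a wc.txt input as in the earlier examples:

// split lines into words, one word per row in a single-column DataFrame
val words: DataFrame = session.read.textFile("wc.txt")
  .flatMap(_.split(" "))
  .toDF("word")

// group, count, and sort descending -- the DSL equivalent of the SQL-based WordCount
words.groupBy($"word")
  .agg(count("*") as "cnts")
  .orderBy($"cnts".desc)
  .show()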
WordCount with Spark Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingWordCount {
  // quiet Spark's own logging so the batch output stays readable
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      // each socket receiver occupies a core, so more local threads than receivers are needed
      .setMaster("local[5]")
      .setAppName(this.getClass.getSimpleName)
    // a new micro-batch every 2 seconds
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val textStream: ReceiverInputDStream[String] = ssc.socketTextStream("hdp-03", 9999)
    val textStream2: ReceiverInputDStream[String] = ssc.socketTextStream("hdp-03", 9998)
    textStream2.print()
    // word count within each batch
    val result: DStream[(String, Int)] = textStream.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    result.print()
    // write every batch out as stream-<time>.log directories
    result.saveAsTextFiles("stream", "log")
    ssc.start()
    ssc.awaitTermination()
  }
}
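reduceByKey above only counts words inside each 2-second batch. If a running total across batches is wanted, one common option is updateStateByKey, which requires a checkpoint directory. A minimal sketch, assuming a hypothetical local checkpoint path:

ssc.checkpoint("./streaming-checkpoint") // hypothetical local path, required for stateful operations

val totals: DStream[(String, Int)] = textStream
  .flatMap(_.split(" "))
  .map((_, 1))
  // add each batch's counts to the running state per word
  .updateStateByKey[Int]((newValues: Seq[Int], state: Option[Int]) => Some(newValues.sum + state.getOrElse(0)))

totals.print()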
WordCount with Spark Streaming and SQL
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSQL {
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName(this.getClass.getSimpleName)
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val textStream: ReceiverInputDStream[String] = ssc.socketTextStream("hdp-03", 9999)
    textStream.foreachRDD(rdd => {
      // getOrCreate returns the already-running SparkSession after the first batch
      val session = SparkSession.builder()
        .config(conf)
        .getOrCreate()
      import session.implicits._
      // split on one or more spaces, then turn the words into a single-column DataFrame
      val wordRDD: RDD[String] = rdd.flatMap(_.split(" +"))
      val wdf: DataFrame = wordRDD.toDF("word")
      wdf.createOrReplaceTempView("v_word")
      val dfRes = session.sql("select word, count(1) cnts from v_word group by word order by cnts desc")
      dfRes.show()
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
WordCount with Kafka (direct stream)
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKafkaWC {
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(conf, Seconds(2))
    // consumer configuration passed straight through to the underlying Kafka consumer
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hdp-02:9092,hdp-03:9092,hdp-04:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "hello_topic_group",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("helloTopic6", "topicB")
    // direct stream: the executors read from Kafka themselves, no receiver involved
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD(rdd => {
      // the record value is the message payload; count words per batch
      val result = rdd.map(_.value()).map((_, 1)).reduceByKey(_ + _)
      result.foreach(println)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
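Because enable.auto.commit is false, nothing in this example ever stores the consumed offsets, so a restarted job falls back to auto.offset.reset. With the 0-10 direct stream, one option is to commit each batch's offsets back to Kafka after processing. A minimal sketch of that variant of the foreachRDD body, assuming the stream defined above:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

stream.foreachRDD(rdd => {
  // capture the offset ranges before transforming the RDD
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  val result = rdd.map(_.value()).map((_, 1)).reduceByKey(_ + _)
  result.foreach(println)
  // asynchronously commit this batch's offsets back to Kafka
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
})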