A simple WordCount in Scala and Java, on Spark and Flink
WordCount overview
WordCount is the foundational project of big data, the equivalent of "hello world" in other languages and a must-learn exercise. It counts how many times each word occurs in a text.
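For example, given the input "hello world hello", WordCount outputs (hello, 2) and (world, 1).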
Example 1
A simple WordCount implementation on Spark, written in Java.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class WordCount {
public static void main(String[] args) throws Exception {
SparkConf conf = new SparkConf().setMaster("local").setAppName("wordCount");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> rdd = jsc.textFile("D:\\Java_code\\BigDataLogTest\\src\\main\\resources\\nginx.log");
JavaRDD<String> rdd1 = rdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
// split the line into words; Spark 2.x+ expects an Iterator here
String[] s1 = s.split(" ");
return Arrays.asList(s1).iterator();
}
});
JavaPairRDD<String, Integer> map = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<String,Integer>(s,1);
}
});
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = map.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
stringIntegerJavaPairRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
System.out.println(stringIntegerTuple2);
}
});
jsc.stop();
}
}
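For comparison, the same job is much shorter with Java 8 lambdas on Spark 2.x or later. This is a minimal sketch; the class name WordCountLambda and the relative input path data/nginx.log are placeholders:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("wordCountLambda");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> counts = jsc
                .textFile("data/nginx.log")                                 // placeholder input path
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator()) // line -> words
                .mapToPair(word -> new Tuple2<>(word, 1))                   // word -> (word, 1)
                .reduceByKey(Integer::sum);                                 // sum the counts per word
        counts.foreach(t -> System.out.println(t));                         // print each (word, count)
        jsc.stop();
    }
}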
Example 2
A simple WordCount implementation on Spark, written in Scala.
// split + flatMap version
// split + map + reduceByKey version
// split + map + reduceByKey shorthand version
package com.xxx.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount extends App {
private val conf = new SparkConf()
conf.setMaster("local").setAppName("wc")
private val sparkContext = new SparkContext(conf)
private val file: String = WordCount.getClass.getClassLoader.getResource("data/wc.txt").getFile
// RDD is Spark's core abstraction; all the data operations below go through it
private val line: RDD[String] = sparkContext.textFile(file)
// Transform the data
// Split each line on spaces
private val word: RDD[String] = line.flatMap(x => x.split(" "))
// Pair each word with a count: string -> (string, 1), e.g. "aaa" -> ("aaa", 1)
private val wordcount: RDD[(String, Int)] = word.map(x => (x, 1))
// Sum the counts by key; in ("aaa", 1) the key is "aaa"
private val result: RDD[(String, Int)] = wordcount.reduceByKey((x, y) => {
x + y
})
// Enumerate each key and print the result
result.foreach(println)
// The whole pipeline above (split + pair + reduce + print) in shorthand
line.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).foreach(println)
// Traversal: print every split word
word.foreach(println)
sparkContext.stop()
}
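A caveat on foreach(println): the closure runs on the executors, so the counts only show up in the console when running with a local master; on a cluster they would land in the executor logs. For small results you would typically bring them back to the driver with collect() or write them out with saveAsTextFile instead.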
Example 3
// Flink streaming version
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
public class StreamingJob {
public static void main(String[] args) throws Exception {
// set up the streaming execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<String> source = env.readTextFile("data/nginx.log");
source
.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1)
.print();
// lambda style: pipeline broken into intermediate operators
lambdaFunc(source);
// common: plain anonymous-function style
commonFunc(source);
// rich: rich-function style, with access to lifecycle hooks and runtime context
richFunc(source);
// process: process-function style; processElement receives a runtime Context
processFunc(source);
// execute program
env.execute("Flink Streaming Java API Skeleton");
}
private static void processFunc(DataStreamSource<String> source) {
source
.process(new ProcessFunction<String, Tuple2<String,Integer>>() {
@Override
public void processElement(String value, ProcessFunction<String, Tuple2<String, Integer>>.Context ctx, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] s = value.split(" ");
for (String s1 : s) {
out.collect(Tuple2.of(s1,1));
}
}
})
.keyBy(0)
.sum(1)
.print();
}
private static void richFunc(DataStreamSource<String> source) {
source
.flatMap(new RichFlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1) // sum the count field (index 1), not the word
.print();
}
private static void commonFunc(DataStreamSource<String> source) {
source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1)
.print();
}
private static void lambdaFunc(DataStreamSource<String> source) {
SingleOutputStreamOperator<String> stringSingleOutputStreamOperator = source.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String s, Collector<String> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(s2);
}
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> map = stringSingleOutputStreamOperator.map(new MapFunction<String, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(String s) throws Exception {
return Tuple2.of(s, 1);
}
});
KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = map.keyBy(0);
SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
DataStreamSink<Tuple2<String, Integer>> print = sum.print();
}
}
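Despite its name, lambdaFunc above still uses anonymous inner classes; it just stores each operator in an intermediate variable. A true lambda version is sketched below (the class name StreamingLambdaJob and the input path are assumptions); note that because Java erases the generic types of lambdas, Flink requires an explicit returns(...) type hint:
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamingLambdaJob {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.readTextFile("data/nginx.log") // assumed input path
                .flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(Tuple2.of(word, 1)); // word -> (word, 1)
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.INT)) // type hint: the lambda's generics are erased
                .keyBy(value -> value.f0) // key by the word
                .sum(1) // sum the count field
                .print();
        env.execute("Flink streaming lambda WordCount");
    }
}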
Example 4
// Flink batch version
package org.example;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchJob {
public static void main(String[] args) throws Exception {
// set up the batch execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSource<String> stringDataSource = env.readTextFile("data/nginx.log");
stringDataSource
.flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2,1));
}
}
})
.groupBy(0)
.sum(1)
.print();
stringDataSource
.flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2,1));
}
}
})
.groupBy(0)
.reduce(new ReduceFunction<Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> reduce(Tuple2<String, Integer> stringIntegerTuple2, Tuple2<String, Integer> t1) throws Exception {
return Tuple2.of(t1.f0,t1.f1+stringIntegerTuple2.f1);
}
})
.print();
// env.execute("Flink Batch Java API Skeleton");
}
}
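One detail worth noting: in the DataSet API, print() itself triggers job execution and pulls the results back to the client, which is why the trailing env.execute(...) stays commented out; calling it after print() would fail because no new sinks have been defined since the last execution.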