Apache Flink是新一代通用大数据处理引擎之一，旨在统一不同类型的数据负载。Flink要解决的问题与Spark相同：两个系统都旨在构建单一平台，让您可以在其中运行批处理、流处理、交互式查询、图计算、机器学习（ML）等任务。因此，Flink与Spark在设计理念上没有太大差别，但二者在实现细节方面确实存在很大差异。
这里介绍使用Flink实现单词统计（WordCount）的Scala版本和Java版本代码。
scala版本实现
import org.apache.flink.streaming.api.windowing.time.Time
/**
 * Streaming word count (Scala version).
 *
 * Reads newline-delimited text from a socket, splits each line on single
 * spaces, and prints per-word counts computed over a sliding time window
 * (window size 5 seconds, slide 1 second).
 */
object WordCountStreamScala {
  def main(args: Array[String]): Unit = {
    // Brings the Scala DataStream API (and its implicit type information) into scope.
    import org.apache.flink.streaming.api.scala._

    // Streaming execution environment for this job.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val windowedCounts = env
      // Text source: one record per '\n'-terminated line read from the socket.
      .socketTextStream("192.168.2.10", 8888, '\n')
      // Flatten every line into its individual words.
      .flatMap(text => text.split(" "))
      // Pair each word with an initial count of 1.
      .map(word => (word, 1))
      // Group by the word (first tuple field).
      .keyBy("_1")
      // Sliding window: 5 seconds long, evaluated every second.
      .timeWindow(Time.seconds(5), Time.seconds(1))
      // Add up the counts (second tuple field) per word and window.
      .sum("_2")

    // Print through a single sink task.
    windowedCounts.print().setParallelism(1)

    // Nothing runs until the job is submitted here.
    env.execute("Socket Window WordCount")
  }
}
JAVA版本实现
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.datastream.AllWindowedStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
* 单词统计java
*/
public class WordCountStreamJava {
public static void main(String[] args) {
//创建流执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//socket文本流
DataStream<String> ds1=env.socketTextStream("192.168.2.10",8888,"\n");
//对行进行压扁
DataStream<String> ds2=ds1.flatMap(new FlatMapFunction<String, String>() {
public void flatMap(String value, Collector<String> out) throws Exception {
for(String word:value.split(" ")){
out.collect(word);
}
}
});
//标一成对
DataStream<WordWithCount> ds3=ds2.map(new MapFunction<String, WordWithCount>() {
public WordWithCount map(String word) throws Exception {
return new WordWithCount(word,1);
}
});
//按照word分组
DataStream<WordWithCount> ds4=ds3.keyBy("word");
//窗口化操作
AllWindowedStream<WordWithCount,TimeWindow> ds5=ds4.timeWindowAll(Time.seconds(5),Time.seconds(1));
SingleOutputStreamOperator<WordWithCount> ds6=ds5.reduce(new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount v1, WordWithCount v2) throws Exception {
return new WordWithCount(v1.word,v1.count+v2.count);
}
});
ds6.print().setParallelism(1);
}
//定义javabean
public static class WordWithCount{
public String word;
public long count;
public WordWithCount(){
}
public WordWithCount(String word,long count){
this.word=word;
this.count=count;
}
public String toString(){
return word+":"+count;
}
}
}
pom.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Flink word-count examples. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this project. -->
<groupId>big</groupId>
<artifactId>myflink</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- All Flink artifacts below use the same version (1.7.2) and the _2.11 artifact suffix. -->
<dependencies>
<!-- Flink client library. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<!-- Scala DataStream API (the org.apache.flink.streaming.api.scala package imported by the Scala example). -->
<!-- NOTE(review): the Java example imports org.apache.flink.streaming.api.datastream; presumably that is resolved transitively through this artifact. Confirm. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<!-- Table API. NOTE(review): nothing in the word-count examples shown with this POM imports it; confirm whether it is needed before removing. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table_2.11</artifactId>
<version>1.7.2</version>
</dependency>
</dependencies>
</project>
运行代码之前，先在Linux服务器（192.168.2.10）上启动nc监听8888端口；程序启动时会作为客户端连接该端口，如果nc尚未启动，连接会失败。
nc命令:
hadoop> nc -lk 8888
88 88 88
88
88
在nc中输入文本后，就可以在运行程序的控制台中看到单词统计结果。