单词批处理:
scala版本:
package liangde.tech

// Batch (DataSet) jobs need the batch Scala implicits (createTypeInformation)
// from org.apache.flink.api.scala — NOT the streaming package, which the
// original imported. The wildcard also brings in ExecutionEnvironment.
import org.apache.flink.api.scala._

/**
 * Batch word count (Scala version).
 *
 * Reads text files under `inputpath`, splits each line into lower-cased
 * words, counts occurrences per word, and writes the result as CSV to
 * `outpath` (overwriting any previous output).
 */
object BatchWordCountScala {
  def main(args: Array[String]): Unit = {
    // Batch execution environment (DataSet API).
    val env = ExecutionEnvironment.getExecutionEnvironment
    val inputpath = "C:\\Users\\liangde.li\\IdeaProjects\\FlinkExample\\src\\test\\file"
    val outpath = "C:\\Users\\liangde.li\\IdeaProjects\\FlinkExample\\src\\test"
    val text = env.readTextFile(inputpath)
    val count = text.flatMap(_.toLowerCase.split("\\W+")) // split on non-word characters
      .filter(_.nonEmpty)                                 // drop empty tokens from consecutive separators
      .map(w => (w, 1))
      .groupBy(0)                                         // group by the word (tuple field 0)
      .sum(1)                                             // sum the counts (tuple field 1)
    // Parallelism 1 so a single output file is produced.
    count.writeAsCsv(outpath, "\n", " ", org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE).setParallelism(1)
    env.execute("batch word count!")
  }
}
Java 版本:
package liangde.tech;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
*
*/
public class BatchWordCountJava {
public static void main(String[] args) throws Exception {
//获取运行环境
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
String inputpath = "C:\\Users\\liangde.li\\IdeaProjects\\FlinkExample\\src\\test\\file";
String outpath = "C:\\Users\\liangde.li\\IdeaProjects\\FlinkExample\\src\\test";
//读取本地文件
DataSource<String> text = env.readTextFile(inputpath);
DataSet<Tuple2<String,Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
counts.writeAsCsv(outpath,"\n"," ",org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE).setParallelism(1);
env.execute("bath word count");
}
public static class Tokenizer implements FlatMapFunction<String, Tuple2<String,Integer>>{
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] tokens = value.toLowerCase().split("\\W+");
for(String token:tokens){
if(token.length()>0){
out.collect(new Tuple2<String, Integer>(token,1));
}
}
}
}
}
注意:批处理创建运行环境使用的是 ExecutionEnvironment,算子的返回类型是 DataSet。
流处理:
使用套接字socket来实现
scala 版本:
package liangde.tech

import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

/**
 * Sliding-window word count (Scala version).
 *
 * Every 1 second, prints to the console the word counts computed over the
 * most recent 2 seconds of data received from a socket.
 */
object SocketWindowWordCount {

  /** One word together with its count inside the current window. */
  case class WordWithCount(word: String, count: Long)

  def main(args: Array[String]): Unit = {
    // Read the socket port from --port; fall back to 9000 when it is missing
    // or unparsable. (The original placed the literal 9000 *after* the case
    // block inside the catch, which parses but obscures the intent.)
    val port: Int = try {
      ParameterTool.fromArgs(args).getInt("port")
    } catch {
      case _: Exception =>
        System.err.println("No port set. Use default port 9000!")
        9000
    }

    // Streaming execution environment (DataStream API).
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Connect to the socket source; records are delimited by '\n'.
    // NOTE(review): "192.168.1.xx" looks like a placeholder host — replace
    // with the real address of the machine running `nc -l 9000`.
    val text = env.socketTextStream("192.168.1.xx", port, '\n')

    import org.apache.flink.api.scala._
    // Tokenize, key by word, apply a sliding window (size 2s, slide 1s),
    // then aggregate. reduce is equivalent to sum("count") here.
    val wordWithCount = text.flatMap(line => line.split("\\s")).map(w => WordWithCount(w, 1))
      .keyBy("word")                                    // group by word
      .timeWindow(Time.seconds(2), Time.seconds(1))     // window size, slide interval
      .reduce((a, b) => WordWithCount(a.word, a.count + b.count))

    // Parallelism 1 so console output is not interleaved across subtasks.
    wordWithCount.print().setParallelism(1)
    env.execute("Socket Window Count!")
  }
}
Java 版本:
package liangde.tech;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
/**
* 滑动窗口计算
* 通过socket模拟产生单词数据
* flink对数据进行统计计算
*
* 需要实现每隔一秒对最近两秒内的数据进行汇总计算
*/
public class SocketWindowWordCountJava {
public static void main(String[] args) throws Exception {
// 获取需要的端口号
int port;
try {
ParameterTool parameterTool = ParameterTool.fromArgs(args);
port = parameterTool.getInt("port");
}catch (Exception e){
System.out.print("No port set .Use default port 9000");
port = 9000;
}
//获取Flink的运行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
String hostname = "192.168.1.35";
String delimiter = "\n";
//链接socket获取输入的数据
DataStreamSource<String> text = env.socketTextStream(hostname,port,delimiter);
// a b c
// a 1
// b 1
// c 1
DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
String[] splits = value.split("\\s");
for(String word:splits){
out.collect(new WordWithCount(word,1));
}
}
}).keyBy("word")
.timeWindow(Time.seconds(2),Time.seconds(1)) //指定时间窗口大小为
.sum("count"); // 在这里使用sum或者reduce都可以
// .reduce(new ReduceFunction<WordWithCount>() {
// public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
// return new WordWithCount(a.word,a.count+b.count);
// }
// })
//把数据打印到控制台并且设置并行度
windowCounts.print().setParallelism(1);
//这行代码一定要实现,否则程序不执行
env.execute("Socket window count");
}
public static class WordWithCount{
public String word;
public long count;
public WordWithCount(){}
public WordWithCount(String word,long count){
this.word = word;
this.count = count;
}
@Override
public String toString(){
return "WordWithCount{" +
"word='" + word + '\''+
",count =" + count +
'}';
}
}
}
注意:流处理创建运行环境使用的 API 是 StreamExecutionEnvironment,算子的返回类型是 DataStream。
流处理测试时,先在服务器端执行 nc -l 9000 开启监听,然后即可在该终端输入单词进行测试。
pom.xml 如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>liangde.tech</groupId>
<artifactId>FlinkExample</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- All Flink artifacts below must share the same version (1.6.1) and the
     same Scala binary suffix (_2.11); mixing versions or Scala suffixes
     causes runtime classpath conflicts. -->
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
<dependencies>
<!-- Batch (DataSet) Java API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.6.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
<!-- Streaming (DataStream) Java API; mark `provided` when submitting to a
     cluster that already ships Flink on its classpath. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.6.1</version>
<!--<scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-scala -->
<!-- Batch (DataSet) Scala API; also pulls in scala-library transitively. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>1.6.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala -->
<!-- Streaming (DataStream) Scala API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.6.1</version>
</dependency>
<!-- Test-only dependencies -->
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalamock</groupId>
<artifactId>scalamock-scalatest-support_2.11</artifactId>
<version>3.2</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>