Exercise 1: Simulating word data through a socket
Scenario: every 1 second, compute an aggregate over the data received in the last 2 seconds.
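To make the window semantics concrete: a 2-second window sliding every 1 second produces the windows [0s, 2s), [1s, 3s), [2s, 4s), and so on, so each incoming word is counted in two consecutive windows and a fresh result is emitted once per second.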
package org.jy.data.yh.bigdata.streaming.flink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
/**
 * Streaming word count
 * Sliding-window computation
 * Words are simulated through a socket,
 * and Flink aggregates the data:
 * every 1 second, sum over the data received in the last 2 seconds.
 * Created by yanghong
 * [hadoop@centoshadoop1 ~]$ nc -l 9000  -- run this command first; otherwise the job fails to connect and exits
 */
public class SocketWindowWordCountJava {
public static void main( String[] args ) throws Exception {
// Read the port number to use
int port;
try{
ParameterTool parameterTool = ParameterTool.fromArgs(args);
port = parameterTool.getInt("port");
}catch (Exception e){
System.err.println("No port set. Using default port 9000 (Java)");
port = 9000;
}
// Get the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
String hostname = "centoshadoop1";
// Record delimiter for the socket stream
String delimiter = "\n";
// Connect to the socket and read the input data
DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);
DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
@Override
public void flatMap(String value, Collector<WordWithCount> collector) throws Exception {
String[] splits = value.split("\\s");
for(String word : splits){
collector.collect(new WordWithCount(word,1L));
}
}
}).keyBy("word")
.timeWindow(Time.seconds(2),Time.seconds(1)) // window size 2 seconds, sliding every 1 second
.sum("count"); // either sum or reduce works here (see the commented-out reduce below)
/*.reduce(new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
return new WordWithCount(a.word,a.count+b.count);
}
})*/
// Print the result to the console with parallelism 1
windowCounts.print().setParallelism(1);
// This line is required; without it the job never runs
env.execute("Socket window count");
}
public static class WordWithCount{
public String word;
public long count;
public WordWithCount(){}
public WordWithCount(String word,long count){
this.word = word;
this.count = count;
}
@Override
public String toString() {
return "WordWithCount{" +
"word='" + word + '\'' +
", count=" + count +
'}';
}
}
}
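For a quick sanity check, start nc -l 9000 on the host, launch the job, and type a few words into the nc session (hypothetical input; exact counts depend on which 2-second windows the words land in):

hello world hello

The job prints results in the WordWithCount.toString() format, for example:

WordWithCount{word='hello', count=2}
WordWithCount{word='world', count=1}

Because consecutive windows overlap by 1 second, each word typically shows up in two successive window results.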
Scala version
package com.yh.bigdata.flink.streaming

import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time

/**
 * Stream parsing
 * Requirement: every 1 second, aggregate the data received in the last 2 seconds
 * Analysis: words are simulated through a socket; a Flink program aggregates them
 */
object SocketWindowWordCount {
  def main(args: Array[String]): Unit = {
    // Read the socket port number (defaults to 9000 if --port is not given)
    val port = try {
      ParameterTool.fromArgs(args).getInt("port")
    } catch {
      case e: Exception =>
        System.err.println("No port set. Using default port 9000 (Scala)")
        9000 // default value
    }
    // Get the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Connect to the socket and read the input data
    val text = env.socketTextStream("centoshadoop1", port, '\n')
    // Flatten the lines into words, group, window, and sum.
    // Note: this implicit import is required, otherwise flatMap below does not compile
    import org.apache.flink.api.scala._
    val windowWordCount = text.flatMap(line => line.split("\\s")) // split each line into words
      .map(word => WordWithCount(word, 1)) // turn each word into (word, 1)
      .keyBy("word") // group by word
      .timeWindow(Time.seconds(2), Time.seconds(1)) // window size 2 seconds, sliding every 1 second
      .sum("count") // either sum or reduce works
      // .reduce((a, b) => WordWithCount(a.word, a.count + b.count))
    // Print to the console
    windowWordCount.print().setParallelism(1)
    // Run the job
    env.execute("Socket window count")
  }

  case class WordWithCount(word: String, count: Long)
}
================================================================================
Exercise 2: Batch (offline) processing: word count
Requirement: count the total occurrences of each word in a file and write the result to a text file.
package org.jy.data.yh.bigdata.batch.flink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
 * Batch (offline) processing: word count
 * Requirement: count the total occurrences of each word in a file and write the result to a text file.
 * Sample contents of the output file result:
 * hong 939
 * hello 2343
 * world 1404
 * hadoopyang 463
 * yang 476
 * hadoop 1416
 * bye 1903
 */
public class BatchWordCountJava {
public static void main(String[] args) throws Exception {
String inputPath = "D:/jar/inputPath"; // directory; create a text file under it containing the words to count
String outPath ="D:/jar/result"; // output goes to the result file under D:/jar; it does not need to exist beforehand
// Get the execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// Read the contents of the file
DataSource<String> text = env.readTextFile(inputPath);
DataSet<Tuple2<String,Integer>> counts = text.flatMap(new Tokenizer())
.groupBy(0).sum(1);
counts.writeAsCsv(outPath,"\n"," ").setParallelism(1); // parallelism 1 writes a single result file, one "word count" pair per line
env.execute("batch word count");
}
private static class Tokenizer implements FlatMapFunction<String,Tuple2<String,Integer>>{ // tokenizer: splits each line into words
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] tokens = value.toLowerCase().split("\\W+");
for(String token : tokens){
if(token.length() >0 ){
out.collect(new Tuple2<String,Integer>(token,1));
}
}
}
}
}
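As a concrete illustration (hypothetical contents), the text file under D:/jar/inputPath could hold lines such as:

hello world hello hadoop
bye hadoop yang

Running the job then writes one word-count pair per line to D:/jar/result, separated by a space, in the format shown in the Javadoc above.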
Scala version
package com.yh.bigdata.flink.batch

import org.apache.flink.api.scala.ExecutionEnvironment

/**
 * Batch word count
 */
object BatchWordCountScala {
  def main(args: Array[String]): Unit = {
    val inputPath = "d:/jar/inputPath"
    val outPath = "d:/jar/result"
    val env = ExecutionEnvironment.getExecutionEnvironment // this is a method call
    val text = env.readTextFile(inputPath)
    // Bring in the implicit conversions
    import org.apache.flink.api.scala._
    val counts = text.flatMap(_.toUpperCase().split("\\W+")) // split each line into words
      .filter(_.nonEmpty)
      .map((_, 1))
      .groupBy(0)
      .sum(1)
    counts.writeAsCsv(outPath, "\n", " ").setParallelism(1) // parallelism 1 writes a single result file
    env.execute("batch word count")
  }
}