hadoop spark flink
hadoop spark flink
hadoop spark flink
hadoop spark flink
hadoop spark flink
hadoop spark flink
1、代码实现
package flink.demo;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
 * Streaming word count over a text file using {@code Tuple2<String, Integer>} records.
 *
 * <p>Each input line is split on single spaces, every word is emitted as {@code (word, 1)},
 * the stream is keyed by tuple field 0 and tuple field 1 is summed, and the running result
 * is printed.
 */
public class WordCount0 {
    /**
     * Builds the job graph and executes it.
     *
     * @param args command-line arguments (not read by this method)
     * @throws Exception if building or executing the job fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Default parallelism of 1 for the operators of this job.
        env.setParallelism(1);
        DataStreamSource<String> inputDataStream = env.readTextFile("H:\\flink_demo\\flink_test\\src\\main\\resources\\wordcount.txt");
        SingleOutputStreamOperator<Tuple2<String, Integer>> resultDataStream = inputDataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String input, Collector<Tuple2<String, Integer>> collector) throws Exception {
                for (String word : input.split(" ")) {
                    // split(" ") yields "" for a blank line and between repeated or leading
                    // spaces; those tokens are not words and must not be counted.
                    if (!word.isEmpty()) {
                        collector.collect(new Tuple2<>(word, 1));
                    }
                }
            }
        }).keyBy(0) // tuple field 0: the word
                .sum(1); // tuple field 1: the count
        resultDataStream.print();
        env.execute();
    }
}
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/1a611cf59fa799533121bcfe041a7b29.png)
2、优化点一 - 使用面向对象
- 优化点:把每条数据封装成对象(POJO),当数据字段较多时,按字段名操作比按下标操作更方便、更易读。
2.1、自定义对象数据结构
/**
 * Mutable value holder pairing a word with its occurrence count.
 */
public class WordAndCount {
    private String word;
    private int count;

    /**
     * Creates an instance with {@code word == null} and {@code count == 0}.
     * NOTE(review): presumably kept so Flink can handle this type as a POJO for
     * {@code keyBy("word")} / {@code sum("count")} - confirm against Flink's POJO rules.
     */
    public WordAndCount() {
    }

    /**
     * Creates an instance holding the given word and count.
     *
     * @param word  the word
     * @param count the count associated with the word
     */
    public WordAndCount(String word, int count) {
        this.word = word;
        this.count = count;
    }

    /** @return the word, or {@code null} if none has been set */
    public String getWord() {
        return this.word;
    }

    /** @return the current count */
    public int getCount() {
        return this.count;
    }

    /** @param word the word to store */
    public void setWord(String word) {
        this.word = word;
    }

    /** @param count the count to store */
    public void setCount(int count) {
        this.count = count;
    }

    /** Renders as {@code WordAndCount{word='<word>', count=<count>}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("WordAndCount{");
        text.append("word='").append(word).append('\'');
        text.append(", count=").append(count);
        text.append('}');
        return text.toString();
    }
}
2.2、main方法实现业务逻辑
/**
 * Streaming word count over a text file using {@code WordAndCount} objects as records.
 *
 * <p>Lines are split on single spaces, each word becomes a {@code WordAndCount(word, 1)},
 * the stream is keyed by the {@code word} field and the {@code count} field is summed.
 */
public class WordCount {
    /**
     * Builds the job graph and executes it.
     *
     * @param args command-line arguments (not read by this method)
     * @throws Exception if building or executing the job fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Default parallelism of 1 for the operators of this job.
        env.setParallelism(1);
        DataStreamSource<String> inputDataStream = env.readTextFile("H:\\flink_demo\\flink_test\\src\\main\\resources\\wordcount.txt");
        SingleOutputStreamOperator<WordAndCount> resultData = inputDataStream.flatMap(new FlatMapFunction<String, WordAndCount>() {
            @Override
            public void flatMap(String line, Collector<WordAndCount> out) throws Exception {
                for (String word : line.split(" ")) {
                    // split(" ") yields "" for a blank line and between repeated or leading
                    // spaces; those tokens are not words and must not be counted.
                    if (!word.isEmpty()) {
                        out.collect(new WordAndCount(word, 1));
                    }
                }
            }
        }).keyBy("word").sum("count");
        resultData.print();
        env.execute();
    }
}
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/ef3bd92fa1cb701d73801a35aa7b72c1.png)
3、优化点二 - 抽取业务功能
3.1、自定义对象的数据结构
/**
 * A word together with the number of times it has been counted.
 * Both properties are readable and writable through accessors.
 */
public class WordAndCount {
    private String word;
    private int count;

    /** Leaves {@code word} as {@code null} and {@code count} as {@code 0}. */
    public WordAndCount() {
    }

    /**
     * @param word  initial word
     * @param count initial count
     */
    public WordAndCount(String word, int count) {
        this.count = count;
        this.word = word;
    }

    // --- word -----------------------------------------------------------

    /** @return the stored word */
    public String getWord() {
        return word;
    }

    /** @param word replacement word */
    public void setWord(String word) {
        this.word = word;
    }

    // --- count ----------------------------------------------------------

    /** @return the stored count */
    public int getCount() {
        return count;
    }

    /** @param count replacement count */
    public void setCount(int count) {
        this.count = count;
    }

    /** @return text of the form {@code WordAndCount{word='w', count=n}} */
    @Override
    public String toString() {
        return "WordAndCount{word='" + word + "', count=" + count + "}";
    }
}
3.2、抽取业务逻辑
/**
 * Splits a line on single spaces and emits one {@code WordAndCount(word, 1)} per word.
 * Empty tokens (from blank lines, or from leading or repeated spaces) are skipped.
 */
public static class SplitLine implements FlatMapFunction<String,WordAndCount>{
    /**
     * @param line one line of input text
     * @param out  collector receiving a {@code WordAndCount} with count 1 for each word
     * @throws Exception declared by the interface; this implementation throws none of its own
     */
    @Override
    public void flatMap(String line, Collector<WordAndCount> out) throws Exception {
        for (String word : line.split(" ")) {
            // "".split(" ") returns [""] and "a  b".split(" ") returns ["a", "", "b"];
            // without this guard the empty string would be counted as a word.
            if (!word.isEmpty()) {
                out.collect(new WordAndCount(word, 1));
            }
        }
    }
}
3.3、main方法实现
/**
 * Wires up and runs the word-count pipeline: read the text file, split each line with
 * {@code SplitLine}, key by {@code word}, sum {@code count}, and print the result.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception if building or executing the job fails
 */
public static void main(String[] args) throws Exception {
    final StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
    environment.setParallelism(1);

    final DataStreamSource<String> lines =
            environment.readTextFile("H:\\flink_demo\\flink_test\\src\\main\\resources\\wordcount.txt");
    final SingleOutputStreamOperator<WordAndCount> words = lines.flatMap(new SplitLine());

    // Aggregate per word and attach the print sink in one chain.
    words.keyBy("word")
            .sum("count")
            .print();

    environment.execute();
}
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/798a0a3c17f08fa02ab904a7c89e4dee.png)
4、优化点三 - 数据源传参
- 优化点:如果程序需要从外部传入参数,Flink 建议使用它提供的 ParameterTool 来解析。
4.1、自定义对象的数据结构
/**
 * Simple bean carrying a {@code word} and its {@code count}.
 */
public class WordAndCount {
    private String word;
    private int count;

    /** Creates an empty bean: {@code word} is {@code null}, {@code count} is {@code 0}. */
    public WordAndCount() {
    }

    /**
     * Creates a populated bean.
     *
     * @param word  value for the {@code word} property
     * @param count value for the {@code count} property
     */
    public WordAndCount(String word, int count) {
        this.word = word;
        this.count = count;
    }

    /** @param count new value of the {@code count} property */
    public void setCount(int count) {
        this.count = count;
    }

    /** @param word new value of the {@code word} property */
    public void setWord(String word) {
        this.word = word;
    }

    /** @return current value of the {@code count} property */
    public int getCount() {
        return count;
    }

    /** @return current value of the {@code word} property */
    public String getWord() {
        return word;
    }

    /** @return {@code WordAndCount{word='...', count=...}} built from the current values */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        sb.append("WordAndCount{");
        sb.append("word='");
        sb.append(word);
        sb.append('\'');
        sb.append(", count=");
        sb.append(count);
        sb.append('}');
        return sb.toString();
    }
}
4.2、抽取业务逻辑
/**
 * Turns each input line into {@code WordAndCount(word, 1)} records, one per
 * space-separated word. Empty tokens are dropped rather than counted.
 */
public static class SplitLine implements FlatMapFunction<String,WordAndCount>{
    /**
     * @param line one line of input text
     * @param out  collector receiving one {@code WordAndCount} with count 1 per word
     * @throws Exception declared by the interface; this implementation throws none of its own
     */
    @Override
    public void flatMap(String line, Collector<WordAndCount> out) throws Exception {
        for (String word : line.split(" ")) {
            // A blank line splits to [""], and leading or repeated spaces produce ""
            // entries; skip them so the empty string never shows up as a counted word.
            if (word.isEmpty()) {
                continue;
            }
            out.collect(new WordAndCount(word, 1));
        }
    }
}
4.3、main方法实现自定义参数传递
/**
 * Runs the word-count job on the text file named by the {@code path} program argument.
 *
 * @param args command-line arguments parsed by {@code ParameterTool.fromArgs}; the
 *             {@code path} key is required
 * @throws Exception if the {@code path} argument is missing, or if building or executing
 *                   the job fails
 */
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Default parallelism of 1 for the operators of this job.
    env.setParallelism(1);
    ParameterTool parameterTool = ParameterTool.fromArgs(args);
    // get("path") returns null when the argument is absent and the null would only be
    // rejected later by readTextFile; getRequired fails right here and names the key.
    String path = parameterTool.getRequired("path");
    DataStreamSource<String> dataStream = env.readTextFile(path);
    SingleOutputStreamOperator<WordAndCount> resultData = dataStream.flatMap(new SplitLine()).keyBy("word").sum("count");
    resultData.print();
    env.execute();
}
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/08a4eb8d454095e7b3fd045eee6a98fb.png)
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/f903714e10af00bac9c4e84af172f693.png)
5、生产环境最佳代码实践
5.1、pom文件配置
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>flinkdemo</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
    <!-- Language and library versions referenced by the dependency and plugin sections. -->
    <java.version>1.8</java.version>
    <!-- Used as the artifactId suffix of the Flink artifacts (e.g. flink-streaming-java_2.11). -->
    <scala.version>2.11</scala.version>
    <flink.version>1.9.3</flink.version>
    <parquet.version>1.10.0</parquet.version>
    <hadoop.version>2.7.3</hadoop.version>
    <fastjson.version>1.2.72</fastjson.version>
    <redis.version>2.9.0</redis.version>
    <mysql.version>5.1.35</mysql.version>
    <log4j.version>1.2.17</log4j.version>
    <slf4j.version>1.7.7</slf4j.version>
    <!-- Compiler settings. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Scope applied to every dependency via ${project.build.scope}. -->
    <project.build.scope>compile</project.build.scope>
    <!-- Written into the assembled jar's manifest as Main-Class. It must name the job's
         entry class; the entry class of this project is flink.demo.WordCount.
         NOTE(review): previously com.hainiu.Driver - restore it only if that class
         really exists in the project. -->
    <mainClass>flink.demo.WordCount</mainClass>
</properties>
<dependencies>
<!-- Every dependency below takes its version from a property in <properties> and its
     scope from ${project.build.scope} (set to "compile" there). -->
<!-- NOTE(review): the WordCount sources only import org.apache.flink.api.*,
     org.apache.flink.streaming.api.* and org.apache.flink.util.*; most of the entries
     below look unused by that code - confirm before trimming. -->

<!-- Logging: slf4j-log4j12 and log4j -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- Hadoop client and Flink's Hadoop compatibility artifact -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-hadoop-compatibility_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- Flink Java and Scala API artifacts (flink-java, flink-streaming-java, flink-scala,
     flink-streaming-scala) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- Flink runtime-web and RocksDB state backend artifacts -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- Flink connectors: HBase, Elasticsearch 5, Kafka 0.10, filesystem -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-hbase_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch5_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.10_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-filesystem_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- External store clients: MySQL connector and Jedis -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>${redis.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- Parquet: parquet-avro, parquet-hadoop and Flink's parquet artifact -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>${parquet.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet_${scala.version}</artifactId>
<version>${flink.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
<!-- JSON: fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
<scope>${project.build.scope}</scope>
</dependency>
</dependencies>
<build>
    <resources>
        <resource>
            <directory>src/main/resources</directory>
        </resource>
    </resources>
    <plugins>
        <!-- Builds the distributable archive during the package phase, using the
             descriptor at src/assembly/assembly.xml and ${mainClass} as manifest Main-Class. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <!-- Pinned explicitly, like the other plugins in this section: without a
                 version Maven warns and the build depends on whatever default it resolves.
                 NOTE(review): confirm the descriptor still assembles as expected with
                 this plugin version. -->
            <version>3.3.0</version>
            <configuration>
                <descriptors>
                    <descriptor>src/assembly/assembly.xml</descriptor>
                </descriptors>
                <archive>
                    <manifest>
                        <mainClass>${mainClass}</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <!-- Tests are skipped: skip=true, and every path is excluded as well. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.12</version>
            <configuration>
                <skip>true</skip>
                <forkMode>once</forkMode>
                <excludes>
                    <exclude>**/**</exclude>
                </excludes>
            </configuration>
        </plugin>
        <!-- Compiles with the Java level and source encoding defined in <properties>. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>${java.version}</source>
                <target>${java.version}</target>
                <encoding>${project.build.sourceEncoding}</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
</project>
5.2、自定义对象的数据结构
package flink.demo;
/**
 * Record type of the word-count stream: a word and the count attached to it.
 * The job code keys the stream by the {@code word} field and sums the {@code count} field.
 */
public class WordAndCount {
    private String word;
    private int count;

    /** Creates an instance with no word ({@code null}) and a count of {@code 0}. */
    public WordAndCount() {
    }

    /**
     * Creates an instance for the given word and count.
     *
     * @param word  the word
     * @param count the count for that word
     */
    public WordAndCount(String word, int count) {
        this.word = word;
        this.count = count;
    }

    /** @return the word held by this instance */
    public String getWord() {
        return this.word;
    }

    /** @return the count held by this instance */
    public int getCount() {
        return this.count;
    }

    /** @param word the word to hold */
    public void setWord(String word) {
        this.word = word;
    }

    /** @param count the count to hold */
    public void setCount(int count) {
        this.count = count;
    }

    /** @return a string of the form {@code WordAndCount{word='w', count=n}} */
    @Override
    public String toString() {
        String wordPart = "word='" + word + "'";
        String countPart = "count=" + count;
        return "WordAndCount{" + wordPart + ", " + countPart + "}";
    }
}
5.3、入口类实现
package flink.demo;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
 * Streaming word count whose input file is supplied through the {@code path} program
 * argument. Lines are split into words by {@link SplitLine}, keyed by {@code word}, and
 * the {@code count} field is summed; the running result is printed.
 */
public class WordCount {
    /**
     * Builds the job graph and executes it.
     *
     * @param args command-line arguments parsed by {@code ParameterTool.fromArgs}; the
     *             {@code path} key is required
     * @throws Exception if the {@code path} argument is missing, or if building or
     *                   executing the job fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Default parallelism of 1 for the operators of this job.
        env.setParallelism(1);
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        // get("path") returns null when the argument is absent and the null would only be
        // rejected later by readTextFile; getRequired fails right here and names the key.
        String path = parameterTool.getRequired("path");
        DataStreamSource<String> dataStream = env.readTextFile(path);
        SingleOutputStreamOperator<WordAndCount> resultData = dataStream.flatMap(new SplitLine()).keyBy("word").sum("count");
        resultData.print();
        env.execute();
    }

    /**
     * Splits a line on single spaces and emits one {@code WordAndCount(word, 1)} per word.
     * Empty tokens (from blank lines, or from leading or repeated spaces) are skipped.
     */
    public static class SplitLine implements FlatMapFunction<String,WordAndCount>{
        /**
         * @param line one line of input text
         * @param out  collector receiving one {@code WordAndCount} with count 1 per word
         * @throws Exception declared by the interface; this implementation throws none of its own
         */
        @Override
        public void flatMap(String line, Collector<WordAndCount> out) throws Exception {
            for (String word : line.split(" ")) {
                // "".split(" ") returns [""] and "a  b".split(" ") returns ["a", "", "b"];
                // without this guard the empty string would be counted as a word.
                if (!word.isEmpty()) {
                    out.collect(new WordAndCount(word, 1));
                }
            }
        }
    }
}
5.4、代码目录结构
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/9fdc990d814f9bc56111548af83165e9.png)