pom
The pom below contains quite a few dependencies; take only the ones you actually need.
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spring-boot-version>2.1.3.RELEASE</spring-boot-version>
<java-version>1.8</java-version>
<scala-version>2.11.8</scala-version>
<spark-version>2.1.0</spark-version>
<flink.version>1.12.1</flink.version>
<mybatis-spring-boot-version>2.0.0</mybatis-spring-boot-version>
<ali-druid-version>1.1.16</ali-druid-version>
<ali-fastjson-version>1.2.58.sec10</ali-fastjson-version>
<scala-maven-plugin>4.1.1</scala-maven-plugin>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--mybatis-->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>${mybatis-spring-boot-version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-scala -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-common -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-api-scala-bridge -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-planner -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka-0.10 -->
<!-- the Kafka 0.10 connector was dropped from Flink 1.12, so an older connector release is pinned below -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.10_2.11</artifactId>
<version>1.11.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.6</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>28.1-jre</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.4.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-connector-kafka_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-connector-hive-2.2.0_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark-version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark-version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark-version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark-version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>${ali-druid-version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${ali-fastjson-version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<!-- scala-maven-plugin is normally configured under <build><plugins> rather than listed as a dependency -->
<dependency>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${scala-maven-plugin}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework.boot/spring-boot-starter-data-redis -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
<version>2.0.4.RELEASE</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-pool2 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
<version>2.6.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.scalaj/scalaj-http -->
<dependency>
<groupId>org.scalaj</groupId>
<artifactId>scalaj-http_2.11</artifactId>
<version>2.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.databinder/dispatch-http -->
<dependency>
<groupId>net.databinder</groupId>
<artifactId>dispatch-http_2.9.1</artifactId>
<version>0.8.6</version>
</dependency>
</dependencies>
Next, create a POJO class to hold the data.
package myflink.pojo;
import java.util.Date;
/**
* @author huangqingshi
* @Date 2019-12-07
*/
public class Person {
private String name;
private int age;
private Date createDate;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
public Date getCreateDate() {
return createDate;
}
public void setCreateDate(Date createDate) {
this.createDate = createDate;
}
}
Create a writer task that pushes data into Kafka.
package myflink.kafka;
import com.alibaba.fastjson.JSON;
import myflink.pojo.Person;
import org.apache.commons.lang3.RandomUtils;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Date;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
/**
* @author huangqingshi
* @Date 2019-12-07
*/
public class KafkaWriter {
//list of local Kafka brokers
public static final String BROKER_LIST = "localhost:9092";
//Kafka topic
public static final String TOPIC_PERSON = "PERSON";
//key serializer: plain strings
public static final String KEY_SERIALIZER = "org.apache.kafka.common.serialization.StringSerializer";
//value serializer: plain strings
public static final String VALUE_SERIALIZER = "org.apache.kafka.common.serialization.StringSerializer";
public static void writeToKafka() throws Exception{
Properties props = new Properties();
props.put("bootstrap.servers", BROKER_LIST);
props.put("key.serializer", KEY_SERIALIZER);
props.put("value.serializer", VALUE_SERIALIZER);
KafkaProducer<String, String> producer = new KafkaProducer<>(props);
//build a Person object; append a random number to the name "hqs"
int randomInt = RandomUtils.nextInt(1, 100000);
Person person = new Person();
person.setName("hqs" + randomInt);
person.setAge(randomInt);
person.setCreateDate(new Date());
//serialize to JSON
String personJson = JSON.toJSONString(person);
//wrap it as a Kafka producer record
ProducerRecord<String, String> record = new ProducerRecord<String, String>(TOPIC_PERSON, null,
null, personJson);
//send (buffered by the producer)
producer.send(record);
System.out.println("Sending data to Kafka: " + personJson);
//flush immediately and release the producer (a new one is created on every call)
producer.flush();
producer.close();
}
public static void main(String[] args) {
while(true) {
try {
//write one record every three seconds
TimeUnit.SECONDS.sleep(3);
writeToKafka();
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
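The PERSON topic has to exist before KafkaWriter can produce to it (unless the broker auto-creates topics). The original article does not show how the topic is created; below is a minimal sketch, not part of the original code, that creates it with the Kafka AdminClient, assuming the kafka-clients library pulled in by the Flink Kafka connector is on the classpath and a single local broker.
package myflink.kafka;
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.AdminClientConfig;
import org.apache.kafka.clients.admin.NewTopic;
public class CreatePersonTopic {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaWriter.BROKER_LIST);
        try (AdminClient admin = AdminClient.create(props)) {
            //one partition, replication factor 1 -- enough for a single local broker
            NewTopic topic = new NewTopic(KafkaWriter.TOPIC_PERSON, 1, (short) 1);
            admin.createTopics(Collections.singletonList(topic)).all().get();
            System.out.println("topic " + KafkaWriter.TOPIC_PERSON + " is ready");
        }
    }
}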
Next, create a utility class for database connections. It uses Druid: configure the driver, URL, username and password, plus the connection pool settings (initial size, max active connections and min idle connections), then hand out connections from the pool.
package myflink.db;
import com.alibaba.druid.pool.DruidDataSource;
import java.sql.Connection;
/**
* @author huangqingshi
* @Date 2019-12-07
*/
public class DbUtils {
private static DruidDataSource dataSource;
public static synchronized Connection getConnection() throws Exception {
//initialize the pool only once; otherwise every call would create a new pool
if(dataSource == null) {
dataSource = new DruidDataSource();
dataSource.setDriverClassName("com.mysql.cj.jdbc.Driver");
dataSource.setUrl("jdbc:mysql://localhost:3306/testdb");
dataSource.setUsername("root");
dataSource.setPassword("root");
//set the initial size, max active connections and min idle connections
dataSource.setInitialSize(10);
dataSource.setMaxActive(50);
dataSource.setMinIdle(5);
}
//return a connection from the pool
return dataSource.getConnection();
}
}
Next, create a MySqlSink that extends RichSinkFunction and overrides its open, invoke and close methods. Flink calls open once before any records are written, then calls invoke for every element that reaches the sink, and finally calls close when the job shuts down. So the database connection is created in open, invoke performs the actual writes, and close releases the statement and connection. See the code below.
package myflink.sink;
import myflink.db.DbUtils;
import myflink.pojo.Person;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.Timestamp;
import java.util.List;
/**
* @author huangqingshi
* @Date 2019-12-07
*/
public class MySqlSink extends RichSinkFunction<List<Person>> {
private PreparedStatement ps;
private Connection connection;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//get a database connection and prepare the insert statement
connection = DbUtils.getConnection();
String sql = "insert into person(name, age, createDate) values (?, ?, ?)";
ps = connection.prepareStatement(sql);
}
@Override
public void close() throws Exception {
super.close();
//close and release resources (statement first, then connection)
if(ps != null) {
ps.close();
}
if(connection != null) {
connection.close();
}
}
@Override
public void invoke(List<Person> persons, Context context) throws Exception {
for(Person person : persons) {
ps.setString(1, person.getName());
ps.setInt(2, person.getAge());
ps.setTimestamp(3, new Timestamp(person.getCreateDate().getTime()));
ps.addBatch();
}
//write the whole batch in one call
int[] count = ps.executeBatch();
System.out.println("Rows successfully written to MySQL: " + count.length);
}
}
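The sink above assumes that a person table already exists in testdb; the original article does not show the DDL. A minimal helper that creates a table matching the INSERT statement is sketched below, reusing DbUtils; the column types are assumptions.
package myflink.db;
import java.sql.Connection;
import java.sql.Statement;
public class CreatePersonTable {
    public static void main(String[] args) throws Exception {
        String ddl = "CREATE TABLE IF NOT EXISTS person (" +
                "id BIGINT PRIMARY KEY AUTO_INCREMENT, " +
                "name VARCHAR(64), " +
                "age INT, " +
                "createDate TIMESTAMP)";
        //create the table through a pooled connection from DbUtils
        try (Connection connection = DbUtils.getConnection();
             Statement statement = connection.createStatement()) {
            statement.execute(ddl);
            System.out.println("person table is ready");
        }
    }
}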
Now create the job that reads from Kafka and sinks to the database: configure the Kafka consumer, read records from Kafka and transform them into Person objects (the transform step mentioned above), collect all records that arrive within a 5-second window, and finally sink the batch to MySQL.
package myflink;
import com.alibaba.fastjson.JSONObject;
import myflink.kafka.KafkaWriter;
import myflink.pojo.Person;
import myflink.sink.MySqlSink;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.shaded.guava18.com.google.common.collect.Lists;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;
import java.util.List;
import java.util.Properties;
/**
* @author huangqingshi
* @Date 2019-12-07
*/
public class DataSourceFromKafka {
public static void main(String[] args) throws Exception{
//create the streaming execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//Kafka consumer configuration
Properties prop = new Properties();
prop.put("bootstrap.servers", KafkaWriter.BROKER_LIST);
prop.put("zookeeper.connect", "localhost:2181");
prop.put("group.id", KafkaWriter.TOPIC_PERSON);
prop.put("key.serializer", KafkaWriter.KEY_SERIALIZER);
prop.put("value.serializer", KafkaWriter.VALUE_SERIALIZER);
prop.put("auto.offset.reset", "latest");
DataStreamSource<String> dataStreamSource = env.addSource(new FlinkKafkaConsumer010<String>(
KafkaWriter.TOPIC_PERSON,
new SimpleStringSchema(),
prop
))
//parallelism 1 keeps the console output in order and does not affect the result
.setParallelism(1);
//read from Kafka and convert each record into a Person object
DataStream<Person> dataStream = dataStreamSource.map(value -> JSONObject.parseObject(value, Person.class));
//collect everything received within a 5-second window
dataStream.timeWindowAll(Time.seconds(5L)).
apply(new AllWindowFunction<Person, List<Person>, TimeWindow>() {
@Override
public void apply(TimeWindow timeWindow, Iterable<Person> iterable, Collector<List<Person>> out) throws Exception {
List<Person> persons = Lists.newArrayList(iterable);
if(persons.size() > 0) {
System.out.println("5秒的总共收到的条数:" + persons.size());
out.collect(persons);
}
}
})
//sink to the database
.addSink(new MySqlSink());
//or print to the console instead
//.print();
env.execute("kafka 消费任务开始");
}
}
The Scala version of the same Flink processing job:
import com.alibaba.fastjson.JSON
import org.apache.flink.shaded.guava18.com.google.common.collect.Lists
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.boot.CommandLineRunner
import pj.streamservice.model.LogFactPoint
import pj.streamservice.service.MySqlSink
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import java.util.List
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.util.Collector
import scala.collection.JavaConverters._
class kafkaTomysql extends CommandLineRunner{
@Autowired private val kafkaStreamBuilder:KafkaStreamBuilder = null
/**
* @param args
* Read the log messages from Kafka and parse out the course-related fields
*/
override def run(args: String*): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
env.enableCheckpointing(5000)
val kafkaConsume: FlinkKafkaConsumer010[String] = kafkaStreamBuilder.kafkaConsume()
val stream = env.addSource(kafkaConsume) //data from the Kafka producer; no transformation is applied to it in this example
val dataStream = stream.map(line => JSON.parseObject(line, classOf[LogFactPoint]))
//collect everything received within a 5-second window
dataStream.timeWindowAll(Time.seconds(5L)).apply(new AllWindowFunction[LogFactPoint, List[LogFactPoint], TimeWindow]() {
override def apply(timeWindow: TimeWindow, input: Iterable[LogFactPoint], out: Collector[List[LogFactPoint]]): Unit = {
val logFactPoints:List[LogFactPoint] = Lists.newArrayList(input.asJava)
if (logFactPoints.size > 0) {
System.out.println("5秒的总共收到的条数:" + logFactPoints.size)
out.collect(logFactPoints)
}
}
}).addSink(new MySqlSink())
env.execute("kafka 消费任务开始");
}
}
KafkaStreamBuilder is a custom helper that builds the Kafka consumer; consumerConfig below is an injected configuration object holding the broker list, group id, topics and so on.
public FlinkKafkaConsumer010<String> kafkaConsume() {
Properties properties = new Properties();
properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, consumerConfig.getBootstrapServers());
properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, consumerConfig.getGroupId());
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, consumerConfig.getAutoOffsetReset());
properties.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, consumerConfig.getAutoCommit());
properties.setProperty(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG, "org.apache.kafka.clients.consumer.RangeAssignor");
String topics = consumerConfig.getTopics();
List<String> topicsSet = new ArrayList<String>(Arrays.asList(topics.split(",")));
FlinkKafkaConsumer010<String> myConsumer = new FlinkKafkaConsumer010<String>(topicsSet, new SimpleStringSchema(),
properties); //the topic list comes from the injected configuration
myConsumer.assignTimestampsAndWatermarks(new CustomWatermarkEmitter());
return myConsumer;
}
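The CustomWatermarkEmitter referenced above is not shown in the article. Below is a minimal sketch of such a punctuated watermark assigner; it assumes the JSON log record carries an epoch-millisecond field named timestamp and allows five seconds of out-of-orderness, both of which are assumptions to adjust to the real schema.
import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;
public class CustomWatermarkEmitter implements AssignerWithPunctuatedWatermarks<String> {
    private static final long MAX_OUT_OF_ORDERNESS = 5000L; //assumed 5s of allowed lateness
    @Override
    public long extractTimestamp(String element, long previousElementTimestamp) {
        //read the event time from the JSON record, falling back to processing time
        try {
            Long ts = JSON.parseObject(element).getLong("timestamp");
            return ts != null ? ts : System.currentTimeMillis();
        } catch (Exception e) {
            return System.currentTimeMillis();
        }
    }
    @Override
    public Watermark checkAndGetNextWatermark(String lastElement, long extractedTimestamp) {
        //emit a watermark that trails the latest event time by the allowed lateness
        return new Watermark(extractedTimestamp - MAX_OUT_OF_ORDERNESS);
    }
}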
The approach above batches the records so they can be processed and written to the database in bulk.
Reference
https://www.cnblogs.com/huangqingshi/p/12003453.html