Flink1.10.1_LogToKafkaToHdfs
1. Simulating the data
log_to_kafka
package com.log;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.io.*;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
public class CustomProducer {
    public static void main(String[] args) throws ExecutionException, InterruptedException, IOException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.1.162:9092"); // Kafka cluster broker list
        props.put("acks", "all");
        props.put("retries", 1); // number of retries on transient send failures
        props.put("linger.ms", 1); // how long to wait for more records before sending a batch
        props.put("batch.size", 16384); // batch size in bytes
        props.put("buffer.memory", 33554432); // RecordAccumulator buffer size
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        Producer<String, String> producer = new KafkaProducer<>(props);

        // Read the sample log file into a single string
        File file = new File("D:\\code\\log_to_kafka\\src\\main\\resources\\temop");
        String encoding = "UTF-8";
        StringBuilder result = new StringBuilder();
        FileInputStream fiStream = new FileInputStream(file);
        InputStreamReader inputStreamReader = new InputStreamReader(fiStream, encoding);
        BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
        String line;
        while ((line = bufferedReader.readLine()) != null) { // readLine reads one line at a time
            // lineSeparator() is "\r\n" on Windows, which is the delimiter the Flink job splits on
            result.append(line).append(System.lineSeparator());
        }
        bufferedReader.close();
        inputStreamReader.close();
        fiStream.close();

        // Send the whole file as one record every 5 seconds; kill the process to stop
        boolean flag = true;
        while (flag) {
            producer.send(new ProducerRecord<String, String>("pageLog", result.toString()));
            Thread.sleep(5000);
        }
        producer.close(); // never reached while flag stays true
    }
}
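The producer above sends the entire file as a single record every five seconds, which is why the Flink job in section 2 splits each message on \r\n. If one record per log line is preferred, a minimal sketch in Scala (the PerLineProducer object is hypothetical) could look like this:

import java.nio.file.{Files, Paths}
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import scala.collection.JavaConverters._

object PerLineProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "192.168.1.162:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // One record per line, so downstream consumers need no extra splitting
    Files.readAllLines(Paths.get("src/main/resources/temop")).asScala.foreach { line =>
      producer.send(new ProducerRecord[String, String]("pageLog", line))
    }
    producer.flush()
    producer.close()
  }
}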
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.kafka.test</groupId>
    <artifactId>log_to_kafka</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.4.1</version>
        </dependency>
    </dependencies>
</project>
temop (the file holds one compact JSON object per line, pretty-printed below for readability; the Flink job parses each \r\n-delimited line as a complete JSON object)
{
  "app_id": "4",
  "device_id": "103",
  "distinct_id": "bad691f2-95f3-43de-8804-23d8c6f50afb",
  "event_name": "-",
  "ip": "139.196.217.70",
  "last_event_name": "-",
  "last_page_id": "0",
  "next_event_name": "-",
  "next_page_id": "2",
  "page_id": "1",
  "server_time": "-",
  "uid": "768152"
}
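Given this record, the map step in section 2 reduces it to the CSV line 4,103,1,768152. A standalone sanity check of that reduction (a sketch; the ParseCheck object is hypothetical, and only the four fields the job uses are included):

import com.alibaba.fastjson.JSON

object ParseCheck {
  def main(args: Array[String]): Unit = {
    val line = """{"app_id":"4","device_id":"103","page_id":"1","uid":"768152"}"""
    val obj = JSON.parseObject(line)
    // Prints: 4,103,1,768152
    println(Seq("app_id", "device_id", "page_id", "uid").map(obj.getString).mkString(","))
  }
}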
2. Implementation
FromKafka
package kafka_source

import java.time.ZoneId
import java.util.Properties

import bean.PageLog
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

object FromKafka {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "192.168.1.162:9092")
    // Only the legacy 0.8 consumer needs zookeeper.connect; it is harmless here
    properties.setProperty("zookeeper.connect", "192.168.1.162:2181")
    properties.setProperty("group.id", "FromKafka001")
    // Redundant with SimpleStringSchema below, but kept from the original config
    properties.setProperty("key.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    val kafkaSource = new FlinkKafkaConsumer011[String]("pageLog", new SimpleStringSchema(), properties)
    val stream3: DataStream[String] = env.addSource(kafkaSource)

    // Each Kafka record carries the whole file, so split it into lines,
    // parse every line as JSON and reduce it to a CSV string
    val data: DataStream[String] = stream3.map { line =>
      val jsonArr: Array[String] = line.split("\r\n")
      val result = new StringBuilder()
      for (perJson <- jsonArr) {
        val logObject: JSONObject = JSON.parseObject(perJson)
        // Alternatively, map the fields into a PageLog bean (see the PageLog class below)
        val strPerJson = logObject.get("app_id").toString + "," +
          logObject.get("device_id").toString + "," +
          logObject.get("page_id").toString + "," +
          logObject.get("uid").toString
        result.append(strPerJson + "\r\n")
      }
      println(result.toString())
      result.toString()
    }

    // Write to HDFS; the NameNode RPC port is commonly 8020 (9000 on some setups)
    val hadoopSink = new BucketingSink[String]("hdfs://192.168.1.162:8020/kafkaTohdfs/")
    // Name buckets with the UTC+8 time format "yyyy-MM-dd--HH"
    hadoopSink.setBucketer(new DateTimeBucketer[String]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai")))
    // A new part file is rolled when either condition below is met:
    // condition 1: the part file reaches 100 MB
    hadoopSink.setBatchSize(1024 * 1024 * 100)
    // condition 2: 60 minutes have elapsed (the value is in milliseconds)
    hadoopSink.setBatchRolloverInterval(60 * 60 * 1000)
    // Empty pending prefix/suffix keep rolled files readable even without checkpointing
    // (BucketingSink only promotes pending files to finished on checkpoint completion)
    hadoopSink.setPendingPrefix("")
    hadoopSink.setPendingSuffix("")
    // Prefix for in-progress files
    hadoopSink.setInProgressPrefix(".")
    data.addSink(hadoopSink)

    // To write back to Kafka instead, the parameters are: target topic, serializer, Kafka config
    // data.addSink(new FlinkKafkaProducer010[String](bootstrap_servers, topicsDelay, new SimpleStringSchema()))

    env.execute()
  }
}
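BucketingSink has been deprecated since Flink 1.9; its replacement in 1.10 is StreamingFileSink, which finalizes part files on checkpoint completion (so checkpointing must be enabled). A minimal row-format sketch with the same path and rolling thresholds as above, meant as a drop-in replacement for the hadoopSink block in FromKafka:

import org.apache.flink.api.common.serialization.SimpleStringEncoder
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy

// StreamingFileSink moves part files to "finished" on checkpoints, so enable them
env.enableCheckpointing(60 * 1000)

val fileSink: StreamingFileSink[String] = StreamingFileSink
  .forRowFormat(new Path("hdfs://192.168.1.162:8020/kafkaTohdfs/"),
    new SimpleStringEncoder[String]("UTF-8"))
  .withRollingPolicy(
    DefaultRollingPolicy.builder()
      .withMaxPartSize(1024 * 1024 * 100) // roll at 100 MB
      .withRolloverInterval(60 * 60 * 1000) // or after 60 minutes
      .build())
  .build()

data.addSink(fileSink)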
PageLog
package bean

import scala.beans.BeanProperty

class PageLog {
  @BeanProperty var app_id: String = ""
  @BeanProperty var device_id: Int = 0
  @BeanProperty var page_id: String = ""
  @BeanProperty var uid: Int = 0

  override def toString = s"PageLog($app_id, $device_id, $page_id, $uid)"
}
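The commented-out alternative in FromKafka builds a PageLog by hand; because the class exposes @BeanProperty setters and a no-arg constructor, fastjson can generally populate it directly. A small sketch (the PageLogParseDemo object is hypothetical, and the input assumes one compact JSON object per line):

package bean

import com.alibaba.fastjson.JSON

object PageLogParseDemo {
  def main(args: Array[String]): Unit = {
    // fastjson fills the bean through its generated setters and coerces
    // numeric strings such as "103" into the Int fields
    val json = """{"app_id":"4","device_id":"103","page_id":"1","uid":"768152"}"""
    val pl: PageLog = JSON.parseObject(json, classOf[PageLog])
    println(pl) // PageLog(4, 103, 1, 768152)
  }
}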
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.flink.test</groupId>
    <artifactId>kafkaToHdfs</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <slf4j.version>1.7.30</slf4j.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.4</version>
        </dependency>
        <!-- Use the _2.12 build to match the Scala version of the other Flink artifacts above -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-filesystem_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.0</version>
        </dependency>
        <!-- HDFS -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.2.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-shaded-hadoop-2-uber -->
        <!-- provided: this uber jar is expected on the cluster classpath (e.g. in Flink's lib/) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-shaded-hadoop-2-uber</artifactId>
            <version>2.7.5-7.0</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- This plugin compiles the Scala sources into class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.4.6</version>
                <executions>
                    <execution>
                        <!-- Bound to Maven's compile phase -->
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>