解析CSV文件为java对象并上传到kafka
java 1.8
kafka 2.6.0
需求:解析csv格式的日志文件,并按照规定的key与json格式的value传到kafka的topic上,需要写一个input工具。
思路整理:
-
java处理csv文件,每行为单位保存到String变量中;
-
将每行的String类型变量进行截取并set到我们的实体类(当然也可以不映射,但是拼json的时候得手动拼接),并按照需求拼接对应的key;
-
每条记录上传到kafka上。
开整:
-
pom文件,其中比较重要的是csv的解析jar和阿里的json包;
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.huyue.flink</groupId>
  <artifactId>CSVKafkaToFlink</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>CSVKafkaToFlink</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- JSON 序列化 (declared once — the original listed fastjson twice) -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.56</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-clients -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-clients_2.12</artifactId>
      <version>1.11.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java_2.12</artifactId>
      <version>1.11.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-java</artifactId>
      <version>1.11.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-kafka_2.12</artifactId>
      <version>1.11.1</version>
    </dependency>
    <!-- 读取csv文件的必要jar -->
    <dependency>
      <groupId>net.sourceforge.javacsv</groupId>
      <artifactId>javacsv</artifactId>
      <version>2.0</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.9.10.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>2.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>2.6.0</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
-
编写对应的Bean实体类,由于我的实体类中字段比较多,我只贴出部分
public class ComponentSFCBean { private String xxx ; private String xxx; private Integer xx; private String xx; private long xx; ...字段 ...getter,setter方法 ...构造函数 ...toString()方法
-
编写kafka生产者
/** * */ package com.huyue.flink.kafka; import java.util.Date; import java.util.List; import java.util.Properties; import org.apache.kafka.clients.producer.ProducerRecord; import com.huyue.flink.pojo.TestBean; /** * @author Hu.Yue * */ public class KafkaProducer{ private final String topic; private final org.apache.kafka.clients.producer.KafkaProducer<String, String> producer; public KafkaProducer(String kafkaTopic) { this.topic = kafkaTopic; this.producer = new org.apache.kafka.clients.producer.KafkaProducer<>(createKafkaProperties()); } /** * @Author: Hu.Yue * @Title: uploadData * @Description: 上传key与value方法 * @param @param key * @param @param value * @return void * @throws */ public void uploadData(String key, String value) { ProducerRecord<String, String> producerRecord = new ProducerRecord<String, String>(topic, key, value); producer.send(producerRecord); } /** * @Author: Hu.Yue * @Title: createKafkaProperties * @Description: 配置 * @param @return * @return Properties * @throws */ private static Properties createKafkaProperties() { Properties properties = new Properties(); //集群地址 properties.put("bootstrap.servers", "localhost"); //安全机制 properties.put("sasl.jaas.config", "org.apache.kafka.common.security.scram.ScramLoginModule required username='xxx' password='xxx';"); properties.put("security.protocol", "SASL_PLAINTEXT"); properties.put("sasl.mechanism", "PLAIN"); //序列化 properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); //参数配置 properties.put("acks", "all"); properties.put("retries", 3); properties.put("batch.size", 65536); properties.put("linger.ms", 1); properties.put("buffer.memory", 33554432); properties.put("max.request.size", 10485760); return properties; } /** * @Author: Hu.Yue * @Title: closeClient * @Description: 关闭客户端 * @param * @return void * @throws */ public void closeClient() { producer.flush(); producer.close(); } }
-
编写主函数做调用提交;
/**
 * CSV -> Kafka loader: reads a CSV log file line by line, maps each row onto a
 * {@link ComponentSFCBean}, and publishes it as (key, JSON value) to a topic.
 */
package com.huyue.flink;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import com.alibaba.fastjson.JSONObject;
import com.huyue.flink.kafka.KafkaProducer;
import com.huyue.flink.pojo.ComponentSFCBean;

/**
 * @author Hu.Yue
 */
public class SendMessageApplication {

    /** Default topic, used by the legacy single-argument {@link #readCsv(String)}. */
    private static final String DEFAULT_TOPIC = "huyue-topic";

    /** A valid CSV row must provide at least this many columns (indices 0..21). */
    private static final int EXPECTED_COLUMNS = 22;

    /**
     * Entry point: wires the hard-coded file path and topic together.
     *
     * @param args unused
     * @throws IOException never thrown directly; kept for signature compatibility
     */
    public static void main(String[] args) throws IOException {
        // CSV文件地址
        String filePath = "E:\\提交\\文件.csv";
        // kafka topic — previously declared but never used (readCsv re-hardcoded
        // the topic); now passed through explicitly.
        String topic = DEFAULT_TOPIC;
        readCsv(filePath, topic);
    }

    /**
     * Backward-compatible overload: reads {@code file} and publishes to the
     * default topic.
     *
     * @param file CSV file path
     */
    public static void readCsv(String file) {
        readCsv(file, DEFAULT_TOPIC);
    }

    /**
     * Reads the CSV file and submits each row to Kafka.
     *
     * <p>Fixes over the original: the producer is created once for the whole
     * file instead of once per row; the {@code finally} block tested
     * {@code textFile == null} and therefore either threw NPE or never closed
     * the reader; a single malformed row no longer aborts the whole import.
     *
     * @param file  CSV file path
     * @param topic destination Kafka topic
     */
    public static void readCsv(String file, String topic) {
        File csv = new File(file);
        BufferedReader textFile = null;
        // One producer for the entire file — opening/closing a Kafka connection
        // per row (as before) is extremely expensive.
        KafkaProducer kProducer = new KafkaProducer(topic);
        try {
            // NOTE(review): FileReader uses the platform default charset —
            // confirm it matches the CSV's encoding (otherwise wrap an
            // InputStreamReader with an explicit Charset).
            textFile = new BufferedReader(new FileReader(csv));
            String lineData;
            while ((lineData = textFile.readLine()) != null) {
                // 1. 依次读取文档中的每一行
                System.out.println(lineData);
                // 2. 对单条数据进行字符串截取
                //    Naive split: assumes no quoted fields containing commas.
                String[] cols = lineData.split(",");
                try {
                    // 3. 依次对字段进行赋值
                    ComponentSFCBean one = toBean(cols);
                    // 4. 拼接Key
                    String key = buildKey(cols);
                    // 5. 将对象转化成json字符串
                    String value = JSONObject.toJSONString(one);
                    // 6. 提交到kafka
                    kProducer.uploadData(key, value);
                } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
                    // Skip malformed rows instead of aborting the whole import.
                    System.err.println("Skipping malformed CSV line: " + lineData);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: original condition was "textFile == null", which NPE'd
            // when opening failed and leaked the reader when it succeeded.
            if (textFile != null) {
                try {
                    textFile.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // 7. 关闭kafka连接(flush缓冲区)
            kProducer.closeClient();
        }
    }

    /**
     * Maps one split CSV row onto the bean, converting numeric columns.
     *
     * @param c row columns; must have at least {@link #EXPECTED_COLUMNS} entries
     * @return populated bean
     * @throws NumberFormatException          on non-numeric content in a numeric column
     * @throws ArrayIndexOutOfBoundsException on a short row
     */
    private static ComponentSFCBean toBean(String[] c) {
        if (c.length < EXPECTED_COLUMNS) {
            throw new ArrayIndexOutOfBoundsException(
                    "expected " + EXPECTED_COLUMNS + " columns, got " + c.length);
        }
        return new ComponentSFCBean(
                c[0],                       // fdate
                c[1],                       // idstation
                Integer.parseInt(c[2]),
                c[3],                       // cmodel
                c[4],                       // boardsn
                c[5],
                c[6],
                Long.parseLong(c[7]),
                Float.parseFloat(c[8]),
                Float.parseFloat(c[9]),
                Float.parseFloat(c[10]),
                Float.parseFloat(c[11]),
                Float.parseFloat(c[12]),
                Integer.parseInt(c[13]),
                Integer.parseInt(c[14]),
                c[15],
                c[16],
                c[17],
                Long.parseLong(c[18]),
                c[19],
                Float.parseFloat(c[20]),
                c[21]);
    }

    /**
     * Builds the Kafka record key: fdate + idstation + cmodel + boardsn.
     *
     * @param c row columns
     * @return concatenated key
     */
    private static String buildKey(String[] c) {
        return c[0] + c[1] + c[3] + c[4];
    }
}