Collecting files into Kafka

  Collect text data from files under specified directories and send it to Kafka.

package com.shenyuchong;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Date;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.serialization.StringSerializer;
/*
 * Purpose:
 *         Collect the contents of files under multiple directories and send them to Kafka;
 *         files are sent line by line;
 *         a notification can be issued once collection finishes;
 *         a file is renamed with a .COMPLETED suffix after it has been sent;
 *         only files with the configured suffixes (one or more) are collected;
 *         each line is checked against a regex and non-matching lines are discarded;
 *         lines can only be split by a delimiter (regex);
 *         the split fields are rejoined with a new delimiter before sending.
 *
 * Usage:
 *         Package with mvn package into a jar:
 *             file2kafka-2.0.jar
 *         Write a config file xxx.conf with content such as:
 *             ip=192.168.1.91
 *             threadnum=20
 *             port=9092
 *             topic=customertopic
 *             path=/home/ftpuser/customer
 *             includesuffix=txt
 *             lineregex=^#\d.*$
 *             delimiter=\s+
 *             noticeurl=http://192.168.1.92:6009/schedule/customer
 *             fieldsquence=id,name,score
 *         Run:
 *             java -jar file2kafka-2.0.jar xxx.conf
 *         Tip: schedule it with Linux crontab (re-collecting the same directory does not resend
 *         data, because completed files carry the .COMPLETED suffix).
 */
public class App {
    public static String fieldSquence = "";
    public static int    fieldNum = 0;
    public static String ip = "";
    public static String port = "";
    public static String path = "";
    public static String threadNum = "5";
    public static String topic = "";
    public static String lineRegex = "^.*$";
    public static String delimiter = "\\s+";
    public static String delimiter2 = "|||";
    public static String includeSuffix = "aSuffix,bSuffix";
    public static Pattern linePattern =null;
    public static Properties props =null;
    public static String noticeUrl;
    public static void main(String[] args) {
        /*
         * Exit immediately if no config file is supplied
         */
        if(args.length<1){
            System.err.println("Missing config file argument");
            System.exit(1);
        }
        try {
            BufferedReader br = new BufferedReader(new FileReader(new File(args[0])));
            String line="";
            while((line=br.readLine())!=null){
                line = line.replaceAll("\\s+", "");
                if(line.indexOf("=")!=-1){
                    String[] kv=line.split("=",2);    // split on the first '=' only, so values may contain '='
                    String k= kv[0];
                    String v= kv[1];
                    if (k.equals("port"))          port = v;            //kafka 端口
                    if (k.equals("ip"))            ip = v;              //kafka 主机地址
                    if (k.equals("topic"))         topic = v;           //kafka 主题
                    if (k.equals("fieldsquence"))  fieldSquence = v;    //字段序列,逗号隔开
                    if (k.equals("threadnum"))     threadNum = v;       //采集线程数
                    if (k.equals("path"))          path = v;            //采集的目录,多目录逗号隔开
                    if (k.equals("lineregex"))     lineRegex=v;         //行正则,不匹配的行数据丢弃
                    if (k.equals("delimiter"))     delimiter=v;         //字段分隔符
                    if (k.equals("delimiter2"))    delimiter2=v;        //重组分隔符(发送到Kafka)
                    if (k.equals("includesuffix")) includeSuffix=v;     //包含文件的后缀
                    if (k.equals("noticeurl"))     noticeUrl=v;         //采集完成通知的接口

                }
            }
            br.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        /*
         * Kafka producer configuration
         */
        props = new Properties();
        props.put("bootstrap.servers", ip+":"+port);
        props.put("acks", "all");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        /*
         * Split the field sequence on commas to get the expected field count
         */
        fieldNum = fieldSquence.split(",").length;
        /*
         * Compile the line-matching regex
         */
        linePattern= Pattern.compile(lineRegex);
        /*
         * Thread pool
         */
        ExecutorService es = Executors.newFixedThreadPool(Integer.valueOf(threadNum));
        /*
         * List the files under each configured path,
         * pick those matching includesuffix and call send(file),
         * one task per file (actual concurrency is capped by threadnum)
         */
        for(String path:path.split(",")){
            File dir=new File(path);
            File[] files = dir.listFiles();
            if(files==null) continue;    // skip paths that do not exist or are not directories
            for(final File file:files){
                for(String suffix:includeSuffix.split(",")){
                    if(file.getAbsolutePath().endsWith(suffix)){
                        es.submit(new Runnable() {
                            @Override
                            public void run() {
                                send(file);
                            }
                        });
                        break;    // avoid submitting the same file once per matching suffix
                    }
                }
            }
        }
        /*
         * Shut down the thread pool (no new tasks accepted; queued tasks still run)
         */
        es.shutdown();
        /*
         * Once the pool has terminated, notify the downstream service,
         * retrying until it accepts the request
         */
        boolean stop=false,noticed=false;
        try {
            while(!stop||!noticed){
                if (es.isTerminated()) { 
                    stop=true;
                } 
                Thread.sleep(2000);
                if(stop){
                    noticed = connectSuccess(noticeUrl);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /*
     * Read a file and send it to Kafka line by line;
     * rename the file with a .COMPLETED suffix once it has been sent
     */
    public static void send(File file){
        BufferedReader bf =null;
        StringBuffer sb = null;
        try {            
            bf = new BufferedReader(new FileReader(file));
            String line = null;
            Producer<String, String> producer = new KafkaProducer<>(props, new StringSerializer(), new StringSerializer());
            while((line = bf.readLine())!=null){
                line = line.trim();
                // discard lines that fail the regex or have too few fields
                if(!linePattern.matcher(line).matches()) continue;
                String[] fields = line.split(delimiter);
                if(fields.length<fieldNum) continue;
                sb = new StringBuffer();
                for(String fieldValue:fields)
                    sb.append(fieldValue).append(delimiter2);
                sb.append(file.getAbsolutePath());
                producer.send(new ProducerRecord<String, String>(topic, String.valueOf((new Date()).getTime()), sb.toString()),new Callback() {
                    @Override
                    public void onCompletion(RecordMetadata rm, Exception e) {
                        // rm may be null when the send failed, so only report the exception
                        if(e!=null) System.out.println("send fail, e:"+e.getMessage());
                    }
                });
            }
            producer.close();
        } catch (Exception e) {
            System.out.println(e.toString());
        }finally {
            if(bf!=null)
                try {
                    bf.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
        }
        file.renameTo(new File(file.getAbsolutePath()+".COMPLETED"));
    }
    /*
     * Request the given URL; return true if the service responds with HTTP 200
     */
    public static boolean connectSuccess(String path){
        try {
            URL url = new URL(path);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            if(con.getResponseCode()==200) return true;
        } catch (Exception e) {
            return false;
        }
        return false;
    }
}
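
The notification contract is deliberately simple: connectSuccess issues a plain HTTP GET and treats any 200 response as acceptance. For reference, here is a minimal sketch of a hypothetical receiving service built on the JDK's built-in com.sun.net.httpserver; the port and the /schedule/customer path mirror the noticeurl in the example config, and the class name NoticeEndpoint is invented for illustration:

import java.io.OutputStream;
import java.net.InetSocketAddress;
import com.sun.net.httpserver.HttpServer;

public class NoticeEndpoint {
    public static void main(String[] args) throws Exception {
        // Listen on the port named in noticeurl (6009 in the example config)
        HttpServer server = HttpServer.create(new InetSocketAddress(6009), 0);
        server.createContext("/schedule/customer", exchange -> {
            // Any 200 response tells the collector the notification was accepted
            byte[] body = "ok".getBytes("UTF-8");
            exchange.sendResponseHeaders(200, body.length);
            try (OutputStream os = exchange.getResponseBody()) {
                os.write(body);
            }
        });
        server.start();
    }
}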

 

 

  Write the configuration file customer2kafka.conf

ip=192.168.1.91
threadnum=20
port=9092
topic=customertopic
path=/home/ftpuser/customer
includesuffix=txt
lineregex=^#\d.*$
delimiter=\s+
noticeurl=http://192.168.1.92:6009/schedule/customer
fieldsquence=id,name,score
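
With this config, every matching line is split on whitespace, rejoined with the default delimiter2 (|||), and suffixed with the source file's absolute path. For a hypothetical line in a file /home/ftpuser/customer/scores.txt (file name invented for illustration):

#1 alice 90

the record value sent to customertopic would be:

#1|||alice|||90|||/home/ftpuser/customer/scores.txt

with the send timestamp in milliseconds as the record key. A line such as "name score" would be dropped because it does not match lineregex=^#\d.*$.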

  Package with mvn package, then run:

java -jar file2kafka-2.0.jar /opt/app/file2kafka/customer2kafka.conf
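
As suggested in the code comments, the jar can be scheduled with crontab; a sketch of an entry (the 5-minute interval and the jar path are assumptions):

*/5 * * * * java -jar /opt/app/file2kafka/file2kafka-2.0.jar /opt/app/file2kafka/customer2kafka.conf

To spot-check that records arrive, Kafka's stock console consumer can be pointed at the topic:

kafka-console-consumer.sh --bootstrap-server 192.168.1.91:9092 --topic customertopic --from-beginning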

  pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.shenyuchong</groupId>
<artifactId>file2kafka</artifactId>
<version>2.0</version>
<packaging>jar</packaging>

<name>file2kafka</name>
<url>http://maven.apache.org</url>

<properties>
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.0.0</version>
  </dependency>
</dependencies>
<build>
  <sourceDirectory>src</sourceDirectory>
  <plugins>
    <plugin>
      <artifactId>maven-assembly-plugin</artifactId>
      <configuration>
        <appendAssemblyId>false</appendAssemblyId>
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
        <archive>
          <manifest>
            <mainClass>com.shenyuchong.App</mainClass>
          </manifest>
        </archive>
      </configuration>
      <executions>
        <execution>
          <id>make-assembly</id>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>

</plugins>
</build>
</project>

Reposted from: https://www.cnblogs.com/shenyuchong/p/11454506.html
