Collecting Kafka data and writing it to HDFS in ORC format

1. Requirement: consume data from Kafka and write it to HDFS as ORC files. The Hive table over that HDFS location is stored as ORC, and once the written directories are bound as partitions the data can be queried (a partition-binding sketch follows the code below).

 

2. Trying existing tools

    1) Flume can collect from Kafka and write to HDFS, but it cannot write ORC files.

    2) Logstash can also write to HDFS, but likewise cannot write ORC.

    3) DataX: I have not used it, so I don't know whether it can write ORC.

3. Implement it myself: write a Kafka consumer and call Hive's ORC API (OrcSerde + OrcOutputFormat) to write the files.

package rongan.kafka;


import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import rongan.commos.PropertiesUtil;
import rongan.constants.Constans;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;

public class KafkaConsumer {
    private static Properties properties = PropertiesUtil.getProperties("commerce.properties");
    private static JobConf configuration = new JobConf();
    private static FileSystem fs = null;
    private static FSDataOutputStream outputStream = null;
    private static Path writePath = null;
    private static String hdfsBasicPath = properties.getProperty(Constans.HDFS_PATH);
    private static OrcSerde serde = new OrcSerde();
    private static OutputFormat outputFormat = new OrcOutputFormat();
    private static StructObjectInspector inspector =
            (StructObjectInspector) ObjectInspectorFactory
                    .getReflectionObjectInspector(RsdTornadoEvent.class,
                            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
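
    // Note: this class is itself named KafkaConsumer, which is why the Kafka client
    // class has to be referenced by its fully qualified name throughout.
    // The reflection-based StructObjectInspector above maps each String field of
    // RsdTornadoEvent to one ORC column, in declaration order; OrcSerde.serialize()
    // later uses it to turn a POJO instance into a row the RecordWriter can persist.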


    public static void main(String[] args) throws IOException {
        //1. Create the Kafka consumer
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = getConsumer();
        //2. Get the HDFS FileSystem
        fs = getFileSystem();
        //3. Record the current time, used as the file-rolling baseline
        Long lastTime = System.currentTimeMillis();
        //4. Build the full write path (hdfsPath/yyyy/MM/dd)
        String totalPath = getTotalPath(lastTime);
        System.out.println(totalPath);
        //5. Create a Path object for it
        writePath = new Path(totalPath);
        //6. Open the ORC RecordWriter
        RecordWriter write = getWriter(writePath);
        //7. Start pulling data
        startCollect(consumer, lastTime, write);
    }
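
    // Caveat (not in the original): nothing closes the consumer or the writer on
    // shutdown. An abrupt stop can leave the last ORC file without its footer and
    // therefore unreadable; a shutdown hook closing both would be a minimal fix.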

    /**
     * Poll Kafka in a loop and write each record as one ORC row,
     * rolling to a new output file every 12 minutes.
     * @param consumer
     * @param lastTime
     * @param write
     * @throws IOException
     */
    private static void startCollect(org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer, Long lastTime, RecordWriter write) throws IOException {
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(2000);
            System.out.println("Starting to poll " + new Date());
            int count = 0;
            for (ConsumerRecord<String, String> record : records) {
                count++;
                // Roll to a new output file every 12 minutes (720000 ms); the check runs
                // per record, so rolling only happens while data is flowing
                if (System.currentTimeMillis() - lastTime > 720000) {
                    write.close(Reporter.NULL);
                    System.out.println("Rolling the output file " + new Date().toString());
                    // The current time becomes the new roll baseline
                    Long currentTime = System.currentTimeMillis();
                    // Build the new path and writer
                    String newPath = getTotalPath(currentTime);
                    writePath = new Path(newPath);
                    write = getWriter(writePath);
                    lastTime = currentTime;
                }

                String[] values = record.value().split("\t");
                System.out.println(record.value());
                // RsdTornadoEvent has 34 fields (values[0]..values[33]); skip lines
                // with fewer than 34 tab-separated columns
                if (values.length < 34) {
                    continue;
                }
                write.write(NullWritable.get(), serde.serialize(new RsdTornadoEvent(values[0], values[1], values[2], values[3], values[4], values[5], values[6],
                        values[7], values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15], values[16], values[17], values[18], values[19], values[20], values[21],
                        values[22], values[23], values[24], values[25], values[26], values[27], values[28], values[29], values[30], values[31], values[32], values[33]), inspector));
            }

            System.out.println("Poll finished " + new Date() + ", fetched " + count + " records");

        }
    }

    private static RecordWriter getWriter(Path writePath) {
        try {
            if (fs.exists(writePath)) {
                System.out.println(writePath.toString() + " already exists");
            } else {
                System.out.println(writePath.toString() + " does not exist, creating it");
                fs.mkdirs(writePath);
            }
            // The file inside the directory is named by the current epoch millis
            return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Get the HDFS FileSystem
     * @return
     */
    private static FileSystem getFileSystem() {
        try {
            // Connect to HDFS at the address from the properties file
            fs = FileSystem.get(new URI(properties.getProperty(Constans.HDFS_ADDRESS)), configuration);
            return fs;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Build the Kafka consumer
     * @return
     */
    private static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> getConsumer() {
        // Consumer configuration
        Properties properties1 = new Properties();
        properties1.put("bootstrap.servers", properties.getProperty(Constans.KAFKA_BROKER_LIST));
        properties1.put("group.id", "getEsEvent");
        // The zookeeper.* entries below are leftovers from the old ZooKeeper-based
        // consumer API (ConsumerConnector); the new KafkaConsumer used here talks to
        // the brokers directly and simply ignores them
        properties1.put("zookeeper.session.timeout.ms", "1000");
        properties1.put("zookeeper.sync.time.ms", "250");
        properties1.put("auto.commit.interval.ms", "1000");
        properties1.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties1.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // Create the consumer and subscribe to the event topic
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = new org.apache.kafka.clients.consumer.KafkaConsumer<>(properties1);
        consumer.subscribe(Arrays.asList("t_rsd_tornado_event"));
        return consumer;
    }


    private static String timeTransform(Long timeInMills) {
        // Format the timestamp as yyyy-MM-dd-HHmm
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HHmm");
        return sdf.format(new Date(timeInMills));
    }

    /**
     * Build the directory part of the path from the formatted date.
     * @param date
     * @return
     */
    private static String getDirectoryFromDate(String date) {
        // date is yyyy-MM-dd-HHmm, so split("-") gives ["yyyy", "MM", "dd", "HHmm"]
        String[] directories = date.split("-");
        // yyyy/MM/dd
        String directory = directories[0] + "/" + directories[1] + "/" + directories[2];
        return directory;
    }

    /**
     * Extract the HHmm part of the formatted date, intended as a file-name prefix.
     * @param date
     * @return
     */
    private static String getFileName(String date) {
        // date is yyyy-MM-dd-HHmm; the HHmm part is at index 3, not index 2 (the day)
        String[] dateSplit = date.split("-");
        String fileName = dateSplit[3];
        return fileName;
    }

    /**
     * Build the full directory path for the given timestamp.
     * @param lastTime
     * @return
     */
    private static String getTotalPath(Long lastTime) {
        // Format the time as yyyy-MM-dd-HHmm
        String formatDate = timeTransform(lastTime);
        // Extract the directory part (yyyy/MM/dd)
        String directory = getDirectoryFromDate(formatDate);
        // The HHmm file name is extracted but currently unused: getWriter names each
        // file by System.currentTimeMillis() instead
        String fileName = getFileName(formatDate);
        // Full path: hdfsBasicPath + yyyy/MM/dd
        String totalPath = hdfsBasicPath + directory;

        return totalPath;
    }

    /**
     * POJO mirroring the Hive table schema; one String field per ORC column.
     */
    static class RsdTornadoEvent implements Writable {
        String id;
        String device_Id;
        String src_Obj;
        String dest_Obj;
        String src_Ip;
        String dest_Ip;
        String src_Mac;
        String dest_Mac;
        String protocol;
        String app_Layer_Protocol;
        String src_Domain;

        String dest_Domain;
        String ip_Version;
        String src_Port;
        String dest_Port;
        String packet_Size;

        String package_Data;
        String payload;
        String sig_Id;
        String signame;
        String match_Point;

        String match_Data;
        String action;
        String incident_Level;
        String incident_Time;
        String risk_level;

        String incident_Type;
        String active;
        String lastUpdate_Time;
        String lastUpdate_User;
        String create_Time;

        String creator;
        String data_From;
        String send_Time;

        public RsdTornadoEvent() {
        }

        public RsdTornadoEvent(String id, String device_Id, String src_Obj, String dest_Obj, String src_Ip, String dest_Ip, String src_Mac, String dest_Mac, String protocol, String app_Layer_Protocol, String src_Domain, String dest_Domain, String ip_Version, String src_Port, String dest_Port, String packet_Size, String package_Data, String payload, String sig_Id, String signame, String match_Point, String match_Data, String action, String incident_Level, String incident_Time, String risk_level, String incident_Type, String active, String lastUpdate_Time, String lastUpdate_User, String create_Time, String creator, String data_From, String send_Time) {
            this.id = id;
            this.device_Id = device_Id;
            this.src_Obj = src_Obj;
            this.dest_Obj = dest_Obj;
            this.src_Ip = src_Ip;
            this.dest_Ip = dest_Ip;
            this.src_Mac = src_Mac;
            this.dest_Mac = dest_Mac;
            this.protocol = protocol;
            this.app_Layer_Protocol = app_Layer_Protocol;
            this.src_Domain = src_Domain;
            this.dest_Domain = dest_Domain;
            this.ip_Version = ip_Version;
            this.src_Port = src_Port;
            this.dest_Port = dest_Port;
            this.packet_Size = packet_Size;
            this.package_Data = package_Data;
            this.payload = payload;
            this.sig_Id = sig_Id;
            this.signame = signame;
            this.match_Point = match_Point;
            this.match_Data = match_Data;
            this.action = action;
            this.incident_Level = incident_Level;
            this.incident_Time = incident_Time;
            this.risk_level = risk_level;
            this.incident_Type = incident_Type;
            this.active = active;
            this.lastUpdate_Time = lastUpdate_Time;
            this.lastUpdate_User = lastUpdate_User;
            this.create_Time = create_Time;
            this.creator = creator;
            this.data_From = data_From;
            this.send_Time = send_Time;
        }

        // OrcSerde reads the fields reflectively through the ObjectInspector, so these
        // Writable methods are never actually called
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }
}
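
4. Binding the partitions. To satisfy the query part of the requirement, the Hive table over the output location must be stored as ORC and have each written yyyy/MM/dd directory added as a partition. Below is a minimal sketch over Hive JDBC; the table name t_rsd_tornado_event_orc, the partition column dt, and the HiveServer2 address are illustrative assumptions rather than part of the original setup.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class BindPartition {
    public static void main(String[] args) throws Exception {
        // Hive JDBC driver; requires hive-jdbc on the classpath
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://hiveserver:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // Assumes a table created as:
            //   CREATE TABLE t_rsd_tornado_event_orc (...) PARTITIONED BY (dt STRING) STORED AS ORC
            // Point one partition at a directory the consumer wrote (paths are placeholders)
            stmt.execute("ALTER TABLE t_rsd_tornado_event_orc ADD IF NOT EXISTS "
                    + "PARTITION (dt='20200101') LOCATION '/data/rsd/2020/01/01'");
        }
    }
}

To check that the produced files are valid ORC, they can be read back with the same API; a quick sketch follows (the file path comes in as a program argument), and hive --orcfiledump <path> gives the same information from the command line.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

public class OrcCheck {
    public static void main(String[] args) throws Exception {
        // Open one file produced by the consumer
        Reader reader = OrcFile.createReader(new Path(args[0]),
                OrcFile.readerOptions(new Configuration()));
        System.out.println("rows = " + reader.getNumberOfRows());
        // Iterate the rows; each one is a struct with 34 string columns
        RecordReader rows = reader.rows();
        Object row = null;
        while (rows.hasNext()) {
            row = rows.next(row);
            System.out.println(row);
        }
        rows.close();
    }
}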

 

Hello everyone, I've run into a problem. When my Flume agent writes files to HDFS the throughput is low, roughly 1 GB every 3 minutes. Testing separately on the same cluster, a plain hdfs put reaches about 8 GB per minute, and a file sink manages about 1 GB per minute. The log shows no errors at all; the only thing I found at DEBUG level is that committing each block takes nearly 20 seconds. Can any expert help me analyze the cause? My configuration:

client.sources = r1
client.channels = c1
client.sinks = k1
client.sources.r1.type = spooldir
client.sources.r1.spoolDir = /var/data/tmpdata
client.sources.r1.fileSuffix = .COMPLETED
client.sources.r1.deletePolicy = never
client.sources.r1.batchSize = 500
client.sources.r1.channels = c1
client.channels.c1.type = memory
client.channels.c1.capacity = 1000000
client.channels.c1.transactionCapacity = 50000
client.channels.c1.keep-alive = 3
client.sinks.k1.type = hdfs
client.sinks.k1.hdfs.path = /flume/events/%Y%m%d/%H
client.sinks.k1.hdfs.useLocalTimeStamp = true
client.sinks.k1.hdfs.rollInterval = 3600
client.sinks.k1.hdfs.rollSize = 1000000000
client.sinks.k1.hdfs.rollCount = 0
client.sinks.k1.hdfs.batchSize = 500
client.sinks.k1.hdfs.callTimeout = 30000
client.sinks.k1.hdfs.fileType = DataStream
client.sinks.k1.channel = c1

The log only shows the config-file poller checking for changes every 30 seconds, for example:

12 Aug 2015 16:14:24,739 DEBUG [conf-file-poller-0] (org.apache.flume.node.PollingPropertiesFileConfigurationProvider$FileWatcherRunnable.run:126) - Checking file:../conf/flume-client.conf for changes
12 Aug 2015 16:14:54,740 DEBUG [conf-file-poller-0] (org.apache.flume.node.PollingPropertiesFileConfigurationProvider$FileWatcherRunnable.run:126) - Checking file:../conf/flume-client.conf for changes
(the same line repeats every 30 seconds)

Nothing in the log looks wrong; it is just slow.
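
Not a diagnosis, but with this configuration the first knob to look at is the sink's hdfs.batchSize of 500: the sink flushes to HDFS after every 500 events, so small events mean many tiny flushes. A hedged sketch of standard hdfs-sink parameters worth trying (the values are illustrative):

# Flush fewer, larger batches to HDFS (the configuration above uses 500)
client.sinks.k1.hdfs.batchSize = 10000
# More worker threads for HDFS IO operations (default 10)
client.sinks.k1.hdfs.threadsPoolSize = 20
# Don't stall on the replication pipeline when blocks are under-replicated
client.sinks.k1.hdfs.minBlockReplicas = 1

Running a second HDFS sink off the same channel is another common way to parallelize the writes, since each sink drains the channel single-threaded.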