Collecting Kafka data and writing it to HDFS in ORC format

1. Requirement: collect data from Kafka and write it to HDFS as ORC files. The corresponding Hive table is stored in ORC format, and once its partitions are bound to the written directories the data can be queried. (Binding a partition is sketched after the full code at the end of this post.)

 

2. Evaluating existing collectors

    1) Flume can collect the data and write it to HDFS, but it cannot write ORC files.

    2) Logstash can also write to HDFS, but it cannot write ORC files either.

    3) DataX: I have not used it, so I do not know whether it can write ORC.

3. So I implemented it myself: a Kafka consumer that calls Hive's ORC API (OrcSerde and OrcOutputFormat) to write ORC files. The core is only a handful of calls, shown in the short sketch below; the complete consumer follows after it.
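
Before the full consumer, here is the ORC-writing part on its own: build a StructObjectInspector for a plain Java class by reflection, serialize each row with OrcSerde, and hand the result to the RecordWriter returned by OrcOutputFormat. This is only a minimal sketch; the TwoFieldRow class and the /tmp output path are made-up placeholders, and the full consumer below uses exactly the same calls against the real table model.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrcWriteSketch {

    // Placeholder row type; the field order defines the ORC struct layout.
    static class TwoFieldRow implements Writable {
        String id;
        String payload;

        TwoFieldRow(String id, String payload) {
            this.id = id;
            this.payload = payload;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        FileSystem fs = FileSystem.get(conf);

        // Inspector built by reflection from the row class.
        StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory
                .getReflectionObjectInspector(TwoFieldRow.class,
                        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        OrcSerde serde = new OrcSerde();

        // The output file name is a placeholder; point it at your own HDFS directory.
        OutputFormat outputFormat = new OrcOutputFormat();
        RecordWriter writer = outputFormat.getRecordWriter(
                fs, conf, "/tmp/orc-sketch/part-00000", Reporter.NULL);

        writer.write(NullWritable.get(),
                serde.serialize(new TwoFieldRow("1", "hello"), inspector));
        writer.close(Reporter.NULL);
    }
}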

package rongan.kafka;


import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import rongan.commos.PropertiesUtil;
import rongan.constants.Constans;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;

public class KafkaConsumer {
    private static Properties properties = PropertiesUtil.getProperties("commerce.properties");
    private static JobConf configuration = new JobConf();
    private static FileSystem fs = null;
    private static FSDataOutputStream outputStream = null;
    private static Path writePath = null;
    private static String hdfsBasicPath = properties.getProperty(Constans.HDFS_PATH);
    private static OrcSerde serde = new OrcSerde();
    private static OutputFormat outputFormat = new OrcOutputFormat();
    private static StructObjectInspector inspector =
            (StructObjectInspector) ObjectInspectorFactory
                    .getReflectionObjectInspector(RsdTornadoEvent.class,
                            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);


    public static void main(String[] args) throws IOException {
        //1. Create the Kafka consumer
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = getConsumer();
        //2. Get the HDFS FileSystem
        fs = getFileSystem();
        //3. Record the current time, used later for rolling to a new directory
        Long lastTime = System.currentTimeMillis();
        //4. Build the full write path (hdfsBasicPath/yyyy/MM/dd)
        String totalPath = getTotalPath(lastTime);
        System.out.println(totalPath);
        //5. Create the Path object for that directory
        writePath = new Path(totalPath);
        RecordWriter write = null;
        //6. Create the ORC RecordWriter
        write = getWriter(writePath);
        //7. Start pulling data
        startCollect(consumer, lastTime, write);


    }

    /**
     * Poll records from Kafka and write them out as ORC rows, rolling to a new
     * time-based directory every 12 minutes.
     * @param consumer the Kafka consumer
     * @param lastTime timestamp of the last roll, in milliseconds
     * @param write    the current ORC RecordWriter
     * @throws IOException
     */
    private static void startCollect(org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer, Long lastTime, RecordWriter write) throws IOException {
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(2000);
            System.out.println("开始拉取数据 " + new Date());
            int count = 0;
            for (ConsumerRecord<String, String> record : records) {
                count++;
                if (System.currentTimeMillis() - lastTime > 720000) {
                    // More than 12 minutes (720000 ms) since the last roll:
                    // close the current writer and start a new file in a fresh directory.
                    write.close(Reporter.NULL);
                    System.out.println("Rolling to a new directory " + new Date().toString());
                    // Current time
                    Long currentTime = System.currentTimeMillis();
                    // New directory path
                    String newPath = getTotalPath(currentTime);
                    writePath = new Path(newPath);
                    write = getWriter(writePath);
                    lastTime = currentTime;
                }

                String[] values = record.value().split("\t");
                System.out.println(record.value());
                if (values.length < 34) {
                    // The constructor below reads values[0] through values[33] (34 fields); skip short records.
                    continue;
                }
                write.write(NullWritable.get(), serde.serialize(new RsdTornadoEvent(values[0], values[1], values[2], values[3], values[4], values[5], values[6],
                        values[7], values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15], values[16], values[17], values[18], values[19], values[20], values[21],
                        values[22], values[23], values[24], values[25], values[26], values[27], values[28], values[29], values[30], values[31], values[32], values[33]), inspector));
            }

            System.out.println("本次拉取完毕 " + new Date() + "拉取" + count + "条");

        }
    }

    private static RecordWriter getWriter(Path writePath) {
        try {
            if (fs.exists(writePath)) {
                System.out.println(writePath.toString() + " already exists");
                return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
            } else {
                System.out.println(writePath.toString() + " does not exist, creating it");
                fs.mkdirs(writePath);
                return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Get the HDFS FileSystem.
     * @return the FileSystem, or null if it cannot be created
     */
    private static FileSystem getFileSystem() {
        try {
            // Connect to the HDFS file system configured in the properties file
            fs = FileSystem.get(new URI(properties.getProperty(Constans.HDFS_ADDRESS)), configuration);
            return fs;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Create the Kafka consumer.
     * @return a consumer subscribed to the event topic
     */
    private static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> getConsumer() {
        // Consumer configuration
        Properties properties1 = new Properties();
        properties1.put("bootstrap.servers", properties.getProperty(Constans.KAFKA_BROKER_LIST));
        properties1.put("group.id", "getEsEvent");
        // The zookeeper.* settings below are only meaningful for the old high-level
        // consumer; the new KafkaConsumer used here ignores them.
        properties1.put("zookeeper.session.timeout.ms", "1000");
        properties1.put("zookeeper.sync.time.ms", "250");
        properties1.put("auto.commit.interval.ms", "1000");
        properties1.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties1.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // The old high-level API would create a consumer connector (ConsumerConnector)
        // that connects to the ZK cluster, gets the assigned partitions, creates a
        // message stream (MessageStream) per partition, and iterates over the streams
        // to read each message. Kept here only for reference:
        //  ConsumerConnector consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(properties));
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = new org.apache.kafka.clients.consumer.KafkaConsumer<>(properties1);
        consumer.subscribe(Arrays.asList("t_rsd_tornado_event"));
        return consumer;
    }


    // Unused stub; it is never called anywhere in this class.
    private static void save(String log) {
        try {
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static String timeTransform(Long timeInMills) {
        Date time = new Date(timeInMills);
        String formatDate = "";
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HHmm");
            formatDate = sdf.format(time);
        } catch (Exception e) {
            e.printStackTrace();
        }

        return formatDate;
    }

    /**
     * Build the directory part of the path from the formatted date.
     * @param date a date string in yyyy-MM-dd-HHmm format
     * @return the directory, e.g. yyyy/MM/dd
     */
    private static String getDirectoryFromDate(String date) {
        // date.split("-") -> ["yyyy", "MM", "dd", "HHmm"]
        String[] directories = date.split("-");
        // yyyy/MM/dd
        String directory = directories[0] + "/" + directories[1] + "/" + directories[2];
        return directory;
    }

    /**
     * Extract the HHmm segment of the formatted date, intended as a file name.
     * (Its result is not actually used: getTotalPath only builds the directory,
     * and the writer names files with System.currentTimeMillis().)
     * @param date a date string in yyyy-MM-dd-HHmm format
     * @return the HHmm segment
     */
    private static String getFileName(String date) {
        String[] dateSplit = date.split("-");
        // HHmm is the fourth segment
        String fileName = dateSplit[3];
        return fileName;
    }

    /**
     * Build the full write path for the given timestamp.
     * @param lastTime timestamp in milliseconds
     * @return hdfsBasicPath + yyyy/MM/dd
     */
    private static String getTotalPath(Long lastTime) {
        // Format the timestamp (yyyy-MM-dd-HHmm)
        String formatDate = timeTransform(lastTime);
        // Directory part (yyyy/MM/dd)
        String directory = getDirectoryFromDate(formatDate);
        // File name part (HHmm); currently unused
        String fileName = getFileName(formatDate);
        // Full path: hdfsBasicPath + yyyy/MM/dd
        String totalPath = hdfsBasicPath + directory;

        return totalPath;
    }

    /**
     * POJO matching the Hive table schema; the field order must match the
     * column order of the ORC table.
     */
    static class RsdTornadoEvent implements Writable {
        String id;
        String device_Id;
        String src_Obj;
        String dest_Obj;
        String src_Ip;
        String dest_Ip;
        String src_Mac;
        String dest_Mac;
        String protocol;
        String app_Layer_Protocol;
        String src_Domain;

        String dest_Domain;
        String ip_Version;
        String src_Port;
        String dest_Port;
        String packet_Size;

        String package_Data;
        String payload;
        String sig_Id;
        String signame;
        String match_Point;

        String match_Data;
        String action;
        String incident_Level;
        String incident_Time;
        String risk_level;

        String incident_Type;
        String active;
        String lastUpdate_Time;
        String lastUpdate_User;
        String create_Time;

        String creator;
        String data_From;
        String send_Time;

        public RsdTornadoEvent() {
        }

        public RsdTornadoEvent(String id, String device_Id, String src_Obj, String dest_Obj, String src_Ip, String dest_Ip, String src_Mac, String dest_Mac, String protocol, String app_Layer_Protocol, String src_Domain, String dest_Domain, String ip_Version, String src_Port, String dest_Port, String packet_Size, String package_Data, String payload, String sig_Id, String signame, String match_Point, String match_Data, String action, String incident_Level, String incident_Time, String risk_level, String incident_Type, String active, String lastUpdate_Time, String lastUpdate_User, String create_Time, String creator, String data_From, String send_Time) {
            this.id = id;
            this.device_Id = device_Id;
            this.src_Obj = src_Obj;
            this.dest_Obj = dest_Obj;
            this.src_Ip = src_Ip;
            this.dest_Ip = dest_Ip;
            this.src_Mac = src_Mac;
            this.dest_Mac = dest_Mac;
            this.protocol = protocol;
            this.app_Layer_Protocol = app_Layer_Protocol;
            this.src_Domain = src_Domain;
            this.dest_Domain = dest_Domain;
            this.ip_Version = ip_Version;
            this.src_Port = src_Port;
            this.dest_Port = dest_Port;
            this.packet_Size = packet_Size;
            this.package_Data = package_Data;
            this.payload = payload;
            this.sig_Id = sig_Id;
            this.signame = signame;
            this.match_Point = match_Point;
            this.match_Data = match_Data;
            this.action = action;
            this.incident_Level = incident_Level;
            this.incident_Time = incident_Time;
            this.risk_level = risk_level;
            this.incident_Type = incident_Type;
            this.active = active;
            this.lastUpdate_Time = lastUpdate_Time;
            this.lastUpdate_User = lastUpdate_User;
            this.create_Time = create_Time;
            this.creator = creator;
            this.data_From = data_From;
            this.send_Time = send_Time;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }
}
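
Once the ORC files are on HDFS under the yyyy/MM/dd directories, the Hive table still needs partitions pointing at those directories before the data shows up in queries. Below is a minimal sketch of binding one partition over Hive JDBC; the JDBC URL, user, table name, partition column (dt) and location are illustrative assumptions and must be replaced with your own table definition. It assumes the table was created PARTITIONED BY a date column and STORED AS ORC, with columns in the same order as the fields of RsdTornadoEvent.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class AddPartitionSketch {
    public static void main(String[] args) throws Exception {
        // Hive JDBC driver (from the hive-jdbc dependency).
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // HiveServer2 URL, user and database are placeholders.
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://hive-server:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // Point a partition of the (assumed) ORC table at the directory the
            // consumer wrote, e.g. <hdfs.path>/2019/01/01 for 2019-01-01.
            stmt.execute("ALTER TABLE t_rsd_tornado_event ADD IF NOT EXISTS "
                    + "PARTITION (dt='2019-01-01') "
                    + "LOCATION '/data/rsd_tornado_event/2019/01/01'");
        }
    }
}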

 

