Collecting Kafka data and writing it to HDFS in ORC format

1. Requirement: collect data from Kafka and write it to HDFS as ORC files. The corresponding Hive table is stored in ORC format, and once its partitions are bound to the written directories the data can be queried. (Binding a partition is sketched after the full code at the end of this post.)

 

2. Evaluating existing collectors

    1) Flume can collect the data and write it to HDFS, but it cannot write ORC files.

    2) Logstash can also write to HDFS, but it cannot write ORC files either.

    3) DataX: I have not used it, so I do not know whether it can write ORC.

3. So I implemented it myself: a Kafka consumer that calls Hive's ORC API (OrcSerde and OrcOutputFormat) to write ORC files. The core is only a handful of calls, shown in the short sketch below; the complete consumer follows after it.
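
Before the full consumer, here is the ORC-writing part on its own: build a StructObjectInspector for a plain Java class by reflection, serialize each row with OrcSerde, and hand the result to the RecordWriter returned by OrcOutputFormat. This is only a minimal sketch; the TwoFieldRow class and the /tmp output path are made-up placeholders, and the full consumer below uses exactly the same calls against the real table model.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrcWriteSketch {

    // Placeholder row type; the field order defines the ORC struct layout.
    static class TwoFieldRow implements Writable {
        String id;
        String payload;

        TwoFieldRow(String id, String payload) {
            this.id = id;
            this.payload = payload;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        FileSystem fs = FileSystem.get(conf);

        // Inspector built by reflection from the row class.
        StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory
                .getReflectionObjectInspector(TwoFieldRow.class,
                        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        OrcSerde serde = new OrcSerde();

        // The output file name is a placeholder; point it at your own HDFS directory.
        OutputFormat outputFormat = new OrcOutputFormat();
        RecordWriter writer = outputFormat.getRecordWriter(
                fs, conf, "/tmp/orc-sketch/part-00000", Reporter.NULL);

        writer.write(NullWritable.get(),
                serde.serialize(new TwoFieldRow("1", "hello"), inspector));
        writer.close(Reporter.NULL);
    }
}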

package rongan.kafka;


import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import rongan.commos.PropertiesUtil;
import rongan.constants.Constans;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;

public class KafkaConsumer {
    private static Properties properties = PropertiesUtil.getProperties("commerce.properties");
    private static JobConf configuration = new JobConf();
    private static FileSystem fs = null;
    private static FSDataOutputStream outputStream = null;
    private static Path writePath = null;
    private static String hdfsBasicPath = properties.getProperty(Constans.HDFS_PATH);
    private static OrcSerde serde = new OrcSerde();
    private static OutputFormat outputFormat = new OrcOutputFormat();
    private static StructObjectInspector inspector =
            (StructObjectInspector) ObjectInspectorFactory
                    .getReflectionObjectInspector(RsdTornadoEvent.class,
                            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);


    public static void main(String[] args) throws IOException {
        //1. Create the Kafka consumer
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = getConsumer();
        //2. Get the HDFS FileSystem
        fs = getFileSystem();
        //3. Record the current time, used later for rolling to a new directory
        Long lastTime = System.currentTimeMillis();
        //4. Build the full write path (hdfsBasicPath/yyyy/MM/dd)
        String totalPath = getTotalPath(lastTime);
        System.out.println(totalPath);
        //5. Create the Path object for that directory
        writePath = new Path(totalPath);
        RecordWriter write = null;
        //6. Create the ORC RecordWriter
        write = getWriter(writePath);
        //7. Start pulling data
        startCollect(consumer, lastTime, write);


    }

    /**
     * Poll records from Kafka and write them out as ORC rows, rolling to a new
     * time-based directory every 12 minutes.
     * @param consumer the Kafka consumer
     * @param lastTime timestamp of the last roll, in milliseconds
     * @param write    the current ORC RecordWriter
     * @throws IOException
     */
    private static void startCollect(org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer, Long lastTime, RecordWriter write) throws IOException {
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(2000);
            System.out.println("开始拉取数据 " + new Date());
            int count = 0;
            for (ConsumerRecord<String, String> record : records) {
                count++;
                if (System.currentTimeMillis() - lastTime > 720000) {
                    // More than 12 minutes (720000 ms) since the last roll:
                    // close the current writer and start a new file in a fresh directory.
                    write.close(Reporter.NULL);
                    System.out.println("Rolling to a new directory " + new Date().toString());
                    // Current time
                    Long currentTime = System.currentTimeMillis();
                    // New directory path
                    String newPath = getTotalPath(currentTime);
                    writePath = new Path(newPath);
                    write = getWriter(writePath);
                    lastTime = currentTime;
                }

                String[] values = record.value().split("\t");
                System.out.println(record.value());
                if (values.length < 34) {
                    // The constructor below reads values[0] through values[33] (34 fields); skip short records.
                    continue;
                }
                write.write(NullWritable.get(), serde.serialize(new RsdTornadoEvent(values[0], values[1], values[2], values[3], values[4], values[5], values[6],
                        values[7], values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15], values[16], values[17], values[18], values[19], values[20], values[21],
                        values[22], values[23], values[24], values[25], values[26], values[27], values[28], values[29], values[30], values[31], values[32], values[33]), inspector));
            }

            System.out.println("本次拉取完毕 " + new Date() + "拉取" + count + "条");

        }
    }

    private static RecordWriter getWriter(Path writePath) {
        try {
            if (fs.exists(writePath)) {
                System.out.println(writePath.toString() + " already exists");
                return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
            } else {
                System.out.println(writePath.toString() + " does not exist, creating it");
                fs.mkdirs(writePath);
                return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Get the HDFS FileSystem.
     * @return the FileSystem, or null if it cannot be created
     */
    private static FileSystem getFileSystem() {
        try {
            // Connect to the HDFS file system configured in the properties file
            fs = FileSystem.get(new URI(properties.getProperty(Constans.HDFS_ADDRESS)), configuration);
            return fs;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Create the Kafka consumer.
     * @return a consumer subscribed to the event topic
     */
    private static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> getConsumer() {
        // Consumer configuration
        Properties properties1 = new Properties();
        properties1.put("bootstrap.servers", properties.getProperty(Constans.KAFKA_BROKER_LIST));
        properties1.put("group.id", "getEsEvent");
        // The zookeeper.* settings below are only meaningful for the old high-level
        // consumer; the new KafkaConsumer used here ignores them.
        properties1.put("zookeeper.session.timeout.ms", "1000");
        properties1.put("zookeeper.sync.time.ms", "250");
        properties1.put("auto.commit.interval.ms", "1000");
        properties1.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties1.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // The old high-level API would create a consumer connector (ConsumerConnector)
        // that connects to the ZK cluster, gets the assigned partitions, creates a
        // message stream (MessageStream) per partition, and iterates over the streams
        // to read each message. Kept here only for reference:
        //  ConsumerConnector consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(properties));
        org.apache.kafka.clients.consumer.KafkaConsumer<String, String> consumer = new org.apache.kafka.clients.consumer.KafkaConsumer<>(properties1);
        consumer.subscribe(Arrays.asList("t_rsd_tornado_event"));
        return consumer;
    }


    // Unused stub; it is never called anywhere in this class.
    private static void save(String log) {
        try {
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static String timeTransform(Long timeInMills) {
        Date time = new Date(timeInMills);
        String formatDate = "";
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HHmm");
            formatDate = sdf.format(time);
        } catch (Exception e) {
            e.printStackTrace();
        }

        return formatDate;
    }

    /**
     * Build the directory part of the path from the formatted date.
     * @param date a date string in yyyy-MM-dd-HHmm format
     * @return the directory, e.g. yyyy/MM/dd
     */
    private static String getDirectoryFromDate(String date) {
        // date.split("-") -> ["yyyy", "MM", "dd", "HHmm"]
        String[] directories = date.split("-");
        // yyyy/MM/dd
        String directory = directories[0] + "/" + directories[1] + "/" + directories[2];
        return directory;
    }

    /**
     * Extract the HHmm segment of the formatted date, intended as a file name.
     * (Its result is not actually used: getTotalPath only builds the directory,
     * and the writer names files with System.currentTimeMillis().)
     * @param date a date string in yyyy-MM-dd-HHmm format
     * @return the HHmm segment
     */
    private static String getFileName(String date) {
        String[] dateSplit = date.split("-");
        // HHmm is the fourth segment
        String fileName = dateSplit[3];
        return fileName;
    }

    /**
     * Build the full write path for the given timestamp.
     * @param lastTime timestamp in milliseconds
     * @return hdfsBasicPath + yyyy/MM/dd
     */
    private static String getTotalPath(Long lastTime) {
        // Format the timestamp (yyyy-MM-dd-HHmm)
        String formatDate = timeTransform(lastTime);
        // Directory part (yyyy/MM/dd)
        String directory = getDirectoryFromDate(formatDate);
        // File name part (HHmm); currently unused
        String fileName = getFileName(formatDate);
        // Full path: hdfsBasicPath + yyyy/MM/dd
        String totalPath = hdfsBasicPath + directory;

        return totalPath;
    }

    /**
     * POJO matching the Hive table schema; the field order must match the
     * column order of the ORC table.
     */
    static class RsdTornadoEvent implements Writable {
        String id;
        String device_Id;
        String src_Obj;
        String dest_Obj;
        String src_Ip;
        String dest_Ip;
        String src_Mac;
        String dest_Mac;
        String protocol;
        String app_Layer_Protocol;
        String src_Domain;

        String dest_Domain;
        String ip_Version;
        String src_Port;
        String dest_Port;
        String packet_Size;

        String package_Data;
        String payload;
        String sig_Id;
        String signame;
        String match_Point;

        String match_Data;
        String action;
        String incident_Level;
        String incident_Time;
        String risk_level;

        String incident_Type;
        String active;
        String lastUpdate_Time;
        String lastUpdate_User;
        String create_Time;

        String creator;
        String data_From;
        String send_Time;

        public RsdTornadoEvent() {
        }

        public RsdTornadoEvent(String id, String device_Id, String src_Obj, String dest_Obj, String src_Ip, String dest_Ip, String src_Mac, String dest_Mac, String protocol, String app_Layer_Protocol, String src_Domain, String dest_Domain, String ip_Version, String src_Port, String dest_Port, String packet_Size, String package_Data, String payload, String sig_Id, String signame, String match_Point, String match_Data, String action, String incident_Level, String incident_Time, String risk_level, String incident_Type, String active, String lastUpdate_Time, String lastUpdate_User, String create_Time, String creator, String data_From, String send_Time) {
            this.id = id;
            this.device_Id = device_Id;
            this.src_Obj = src_Obj;
            this.dest_Obj = dest_Obj;
            this.src_Ip = src_Ip;
            this.dest_Ip = dest_Ip;
            this.src_Mac = src_Mac;
            this.dest_Mac = dest_Mac;
            this.protocol = protocol;
            this.app_Layer_Protocol = app_Layer_Protocol;
            this.src_Domain = src_Domain;
            this.dest_Domain = dest_Domain;
            this.ip_Version = ip_Version;
            this.src_Port = src_Port;
            this.dest_Port = dest_Port;
            this.packet_Size = packet_Size;
            this.package_Data = package_Data;
            this.payload = payload;
            this.sig_Id = sig_Id;
            this.signame = signame;
            this.match_Point = match_Point;
            this.match_Data = match_Data;
            this.action = action;
            this.incident_Level = incident_Level;
            this.incident_Time = incident_Time;
            this.risk_level = risk_level;
            this.incident_Type = incident_Type;
            this.active = active;
            this.lastUpdate_Time = lastUpdate_Time;
            this.lastUpdate_User = lastUpdate_User;
            this.create_Time = create_Time;
            this.creator = creator;
            this.data_From = data_From;
            this.send_Time = send_Time;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }
}
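
Once the ORC files are on HDFS under the yyyy/MM/dd directories, the Hive table still needs partitions pointing at those directories before the data shows up in queries. Below is a minimal sketch of binding one partition over Hive JDBC; the JDBC URL, user, table name, partition column (dt) and location are illustrative assumptions and must be replaced with your own table definition. It assumes the table was created PARTITIONED BY a date column and STORED AS ORC, with columns in the same order as the fields of RsdTornadoEvent.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class AddPartitionSketch {
    public static void main(String[] args) throws Exception {
        // Hive JDBC driver (from the hive-jdbc dependency).
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // HiveServer2 URL, user and database are placeholders.
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://hive-server:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // Point a partition of the (assumed) ORC table at the directory the
            // consumer wrote, e.g. <hdfs.path>/2019/01/01 for 2019-01-01.
            stmt.execute("ALTER TABLE t_rsd_tornado_event ADD IF NOT EXISTS "
                    + "PARTITION (dt='2019-01-01') "
                    + "LOCATION '/data/rsd_tornado_event/2019/01/01'");
        }
    }
}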

 

