sparkstreaming读取kafka流，按自身json数据自带的业务时间，增量写入hdfs对应日期目录

最新推荐文章于 2023-04-07 00:10:03 发布

SWWingceltis

最新推荐文章于 2023-04-07 00:10:03 发布

阅读量572

点赞数

文章标签：大数据

本文链接：https://blog.csdn.net/hhfff123/article/details/105629131

版权

package com.weshare.bigdata.ods.handler;

import com.alibaba.fastjson.JSONObject;
import com.weshare.bigdata.entity.ClusterEnvirEntity;
import com.weshare.bigdata.facility.ClusterEnvirFacility;
import com.weshare.bigdata.ods.constant.DetailConstant;
import com.weshare.bigdata.ods.utils.DateUtils;
import com.weshare.dataframework.spark.DfSparkSession;
import com.weshare.dataframework.spark.entity.SparkApplication;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.IOException;
import java.util.*;

/**
 * Spark Streaming job that reads JSON records from Kafka, tags each record with a
 * "ClassifyDate" derived from the business timestamp carried in the record itself, and
 * writes the records to HDFS keyed by that value (the key is passed to
 * {@code RDDMultipleTextOutputFormat} as the output key).
 *
 * <p>Kafka offsets are committed manually, and only after a non-empty batch has been
 * written, so a failed batch is re-read rather than lost.
 */
public class KafkaDataClassifyBySelf {
    private static final Logger logger = LoggerFactory.getLogger(KafkaDataClassifyBySelf.class);
    /** Output key used for records that reach the write step without a "ClassifyDate". */
    private static final String UNCLASSIFIED_DIR = "error_content";
    static JavaInputDStream<ConsumerRecord<String, String>> directStream;
    static String hdfspath="/user/admin/FA_OFFLINE/";
    static String classifyTbl;
    static String kuduMaster;
    static String bootstrap;

    /**
     * Builds the streaming context, subscribes to the Kafka topic and starts the job.
     *
     * @param args unused
     */
    public static void main(String[] args){

        //System.setProperty("hadoop.home.dir","etc/dtconf/bin");
        SparkApplication application = new SparkApplication();
        application.setAppName("DemoStreaming");
        application.setSerializer("org.apache.spark.serializer.KryoSerializer");
        SparkSession sparkSession = DfSparkSession.bulid(application);

        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkSession.sparkContext());
        JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.seconds(600));

        // Kudu master address and the name of the cut-off time configuration table.
        ClusterEnvirFacility clusterEnvirFacility = new ClusterEnvirFacility();
        ClusterEnvirEntity environmentInfo = clusterEnvirFacility.getEnvironmentInfo(sparkSession);
        kuduMaster=environmentInfo.getKuduMaster();
        classifyTbl="impala::config.bus_calibration_time";
        // Kafka broker list.
        bootstrap=environmentInfo.getBootstrap();

        Map<String, Object> kafkaMap = new HashMap<>();
        kafkaMap.put("bootstrap.servers",bootstrap);
        kafkaMap.put("key.deserializer", StringDeserializer.class);
        kafkaMap.put("value.deserializer", StringDeserializer.class);
        // Consumer group id.
        kafkaMap.put("group.id", "KafkaDataClassifyData");
        // Where to start when the group has no committed offset ("latest" or "earliest").
        kafkaMap.put("auto.offset.reset", "earliest");
        // Auto-commit is off: offsets are committed explicitly after each batch is written.
        kafkaMap.put("enable.auto.commit", "false");

        // The cut-off times are loaded once at start-up and shipped to the executors.
        Map<String, String> regularTime = KafkaDataClassifyBySelf.getRegularTime(sparkSession,kuduMaster,classifyTbl);
        Broadcast<Map<String, String>> broadcast = javaSparkContext.broadcast(regularTime);

        Collection<String> topics = new HashSet<>(Arrays.asList("data-mysql".split(",")));

        directStream = KafkaUtils.createDirectStream(
                jssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.Subscribe(topics, kafkaMap)
        );

        directStream.foreachRDD(rdd -> {
            // Offset ranges must be read from the original Kafka RDD, before any transformation.
            OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
            if (rdd.isEmpty()) {
                return;
            }

            JavaRDD<String> classified = rdd.map(
                    record -> dataClassify(record.value().replace("\r\n", ""), broadcast.value()));
            JavaPairRDD<String, String> keyedByDate =
                    classified.mapToPair(KafkaDataClassifyBySelf::toDatePair);
            keyedByDate.saveAsHadoopFile(hdfspath, String.class, String.class, RDDMultipleTextOutputFormat.class);

            // Commit only after the write above returned, so an exception leaves the offsets untouched.
            ((CanCommitOffsets) directStream.inputDStream()).commitAsync(offsetRanges);
        });

        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the caller / JVM shutdown logic can observe it.
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while awaiting streaming termination", e);
        }
    }

    /**
     * Splits a classified JSON record into (output key, payload).
     *
     * <p>The "ClassifyDate" entry is removed from the payload and used as the key. Records
     * without that entry ({@link #dataClassify} adds none when "content" is absent) are keyed
     * with {@link #UNCLASSIFIED_DIR} instead of failing the whole batch with a
     * {@link NullPointerException}.
     */
    private static Tuple2<String, String> toDatePair(String json) {
        JSONObject jsonObject = JSONObject.parseObject(json);
        Object classifyDate = jsonObject.remove("ClassifyDate");
        String key = classifyDate == null ? UNCLASSIFIED_DIR : classifyDate.toString();
        return new Tuple2<>(key, jsonObject.toJSONString());
    }

    /**
     * Adds a "ClassifyDate" entry to a raw record and returns the record re-serialised as JSON.
     *
     * <p>The record's "content" object supplies "Database", "Table", "Type", "Timestamp" and
     * (for column-split tables) "Data". The value chosen is:
     * <ul>
     *   <li>{@code "delete"} when Type is DELETE;</li>
     *   <li>a date derived from Timestamp and the matching cut-off time in {@code map} when the
     *       database, the table, or the row's PRODUCT_CODE is listed in {@code DetailConstant};</li>
     *   <li>{@code "error_productcode"} when a column-split table row has a missing or unlisted
     *       PRODUCT_CODE;</li>
     *   <li>{@code "non_account"} otherwise.</li>
     * </ul>
     * When "content" is absent the record is returned without a "ClassifyDate".
     *
     * @param s   raw JSON record
     * @param map cut-off times keyed by database name, table name or product code
     * @return the record as JSON, with "ClassifyDate" added where applicable
     */
    public static String dataClassify(String s,Map<String, String> map){
        JSONObject jsonObject = JSONObject.parseObject(s);
        Object content = jsonObject.get("content");
        if (content == null) {
            return jsonObject.toJSONString();
        }

        JSONObject contentJson = JSONObject.parseObject(content.toString());
        // Locale.ROOT keeps the case mapping independent of the JVM's default locale.
        String dbName = contentJson.getString("Database").toLowerCase(Locale.ROOT);
        String timestamp = contentJson.getString("Timestamp");
        String tabName = contentJson.getString("Table").toLowerCase(Locale.ROOT);
        String type = contentJson.getString("Type").toUpperCase(Locale.ROOT);

        if ("DELETE".equals(type)) {
            // Deleted rows go to their own "delete" bucket.
            jsonObject.put("ClassifyDate","delete");
        } else if (DetailConstant.DB_SPILTD_DB.contains(dbName)) {
            jsonObject.put("ClassifyDate", resolveClassifyDate(timestamp, map.get("ccs")));
        } else if (DetailConstant.TAB_SPILTD_TAB.contains(tabName)) {
            jsonObject.put("ClassifyDate", resolveClassifyDate(timestamp, map.get(tabName)));
        } else if (DetailConstant.TAB_SPILTD_COL.contains(tabName)) {
            Object data = contentJson.get("Data");
            JSONObject colJson = data == null ? null : JSONObject.parseObject(data.toString());
            String productCode = colJson == null ? null : colJson.getString("PRODUCT_CODE");
            if (productCode != null && DetailConstant.VAL_SPILTD_COL.contains(productCode)) {
                String proTime = map.get(productCode.toUpperCase(Locale.ROOT));
                jsonObject.put("ClassifyDate", resolveClassifyDate(timestamp, proTime));
            } else {
                // Product code missing or not in the split list: treated as bad data.
                jsonObject.put("ClassifyDate","error_productcode");
            }
        } else {
            // Not one of the core sources.
            jsonObject.put("ClassifyDate","non_account");
        }
        return jsonObject.toJSONString();
    }

    /**
     * Picks the date for a timestamp relative to a cut-off time: the timestamp's own date when
     * {@code DateUtils.dateCompare} is true, otherwise the value from
     * {@code DateUtils.getLastDateByTimeStamp}.
     * NOTE(review): presumably "last date" means the previous day — confirm in DateUtils.
     */
    private static Object resolveClassifyDate(String timestamp, String cutoffTime) {
        return DateUtils.dateCompare(timestamp, cutoffTime)
                ? DateUtils.getDateByTimeStamp(timestamp)
                : DateUtils.getLastDateByTimeStamp(timestamp);
    }

    /**
     * Loads the cut-off time configuration from Kudu into a single map.
     *
     * <p>Rows of type "DB" are keyed by {@code db_name}, "TBL" by {@code tbl_name} and "COL"
     * by {@code col_value}; later types overwrite earlier ones on key collision.
     *
     * @param sparkSession session used to read the table
     * @param kuduMaster   Kudu master address
     * @param tbl_name     Kudu table name
     * @return map from database name / table name / column value to its "time" value
     */
    public static Map<String,String> getRegularTime(SparkSession sparkSession,String kuduMaster,String tbl_name){
        HashMap<String, String> regularTimeMap = new HashMap<>();
        Map<String, String> kuduProperties = new HashMap<>();
        kuduProperties.put("kudu.master", kuduMaster);
        kuduProperties.put("kudu.table", tbl_name);
        Dataset<Row> allRow = sparkSession.read().options(kuduProperties).format("kudu").load();

        putTimes(allRow, "db_name", "DB", regularTimeMap);
        putTimes(allRow, "tbl_name", "TBL", regularTimeMap);
        putTimes(allRow, "col_value", "COL", regularTimeMap);
        return regularTimeMap;
    }

    /** Copies (keyColumn, time) pairs of the rows with the given "type" into {@code target}. */
    private static void putTimes(Dataset<Row> allRow, String keyColumn, String type, Map<String, String> target) {
        List<Tuple2<String, String>> rows = allRow.select(functions.col(keyColumn),
                functions.col("time"))
                .where(functions.col("type").equalTo(type))
                .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
                .toJavaRDD().collect();
        for (Tuple2<String, String> row : rows) {
            target.put(row._1, row._2);
        }
    }
}

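// Custom output formats: RDDMultipleTextOutputFormat routes each record into a sub-directory named after its key (the classify date), and AppendTextOutputFormat appends to an existing file instead of overwriting it.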

/**
 * Multiple-output text format that places every record under a sub-directory named after
 * its key and delegates the actual writing to {@link AppendTextOutputFormat}.
 */
public class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat<String, String> {
    /** Delegate that produces the record writers; created on first use. */
    private AppendTextOutputFormat theTextOutputFormat = null;

    /**
     * Builds the relative output file name for a record.
     *
     * @param key   record key, used as the directory name
     * @param value record value (not used for naming)
     * @param name  default leaf file name supplied by the framework
     * @return {@code key/name}
     */
    @Override
    public String generateFileNameForKeyValue(String key, String value, String name) {
        return String.join("/", key, name);
    }

    /**
     * Returns a writer for the given relative file name, obtained from the delegate format.
     */
    @Override
    protected RecordWriter getBaseRecordWriter(FileSystem fs, JobConf job, String name, Progressable progressable) throws IOException {
        AppendTextOutputFormat delegate = this.theTextOutputFormat;
        if (delegate == null) {
            delegate = new AppendTextOutputFormat();
            this.theTextOutputFormat = delegate;
        }
        return delegate.getRecordWriter(fs, job, name, progressable);
    }
}

/**
 * Text output format that appends to an already existing output file instead of
 * overwriting it, and writes only the record value (one per line).
 *
 * <p>If the target file already exists under the job output path it is opened with
 * {@code FileSystem.append}; otherwise a new file is created at the task output path.
 * NOTE(review): {@code append} must be supported by the underlying file system, and the
 * append branch writes straight to the job output path rather than the task path — confirm
 * this is acceptable with task retries or speculative execution enabled.
 */
public class AppendTextOutputFormat extends TextOutputFormat<Text,Text> {

    /**
     * Line-oriented record writer that emits the value followed by a newline and drops the key.
     */
    protected static class MyLineRecordWriter<K, V> implements RecordWriter<K, V> {
        private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
        protected DataOutputStream out;
        // Kept for constructor compatibility; the key and separator are intentionally not written.
        private final byte[] keyValueSeparator;

        /**
         * @param out               stream the records are written to; closed by {@link #close}
         * @param keyValueSeparator separator text (currently not written to the output)
         */
        public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.keyValueSeparator = keyValueSeparator.getBytes(StandardCharsets.UTF_8);
        }

        public MyLineRecordWriter(DataOutputStream out) {
            this(out, "\t");
        }

        /** Writes a {@link Text} from its backing bytes, anything else as UTF-8 of toString(). */
        private void writeObject(Object o) throws IOException {
            if (o instanceof Text) {
                Text to = (Text)o;
                // Text's backing array may be longer than its content; honour getLength().
                this.out.write(to.getBytes(), 0, to.getLength());
            } else {
                this.out.write(o.toString().getBytes(StandardCharsets.UTF_8));
            }
        }

        /**
         * Writes one line containing only the value. A record whose key and value are both
         * null/NullWritable is skipped; a record with only a key produces an empty line.
         */
        @Override
        public synchronized void write(K key, V value) throws IOException {
            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (nullKey && nullValue) {
                return;
            }
            if (!nullValue) {
                this.writeObject(value);
            }
            this.out.write(NEWLINE);
        }

        @Override
        public synchronized void close(Reporter reporter) throws IOException {
            this.out.close();
        }
    }

    /**
     * Creates a writer for {@code name}, appending when the file already exists in the job
     * output directory.
     *
     * <p>With compression enabled the codec's default extension is part of the file name for
     * both the existence check and the create call, so a compressed file written by an
     * earlier run is found and appended to.
     * NOTE(review): appending a fresh compressed stream assumes the codec can read
     * concatenated streams — confirm for codecs other than the gzip default.
     *
     * @param ignored  unused; the file system is resolved from the output path
     * @param job      job configuration
     * @param name     output file name relative to the output directory
     * @param progress progress callback passed to file creation
     * @return a writer that emits one value per line
     * @throws IOException if the file cannot be opened
     */
    @Override
    public RecordWriter getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
        String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "\t");
        if (!getCompressOutput(job)) {
            FSDataOutputStream fileOut = openForAppendOrCreate(job, name, progress);
            return new MyLineRecordWriter<>(fileOut, keyValueSeparator);
        }

        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job);
        FSDataOutputStream fileOut = openForAppendOrCreate(job, name + codec.getDefaultExtension(), progress);
        return new MyLineRecordWriter<>(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
    }

    /**
     * Opens {@code fileName} for appending if it exists under the job output path, otherwise
     * creates it at the task output path.
     */
    private static FSDataOutputStream openForAppendOrCreate(JobConf job, String fileName, Progressable progress) throws IOException {
        Path taskFile = FileOutputFormat.getTaskOutputPath(job, fileName);
        FileSystem fs = taskFile.getFileSystem(job);
        Path existingFile = new Path(FileOutputFormat.getOutputPath(job), fileName);
        if (fs.exists(existingFile)) {
            return fs.append(existingFile);
        }
        return fs.create(taskFile, progress);
    }
}