package com.weshare.bigdata.ods.handler; import com.alibaba.fastjson.JSONObject; import com.weshare.bigdata.entity.ClusterEnvirEntity; import com.weshare.bigdata.facility.ClusterEnvirFacility; import com.weshare.bigdata.ods.constant.DetailConstant; import com.weshare.bigdata.ods.utils.DateUtils; import com.weshare.dataframework.spark.DfSparkSession; import com.weshare.dataframework.spark.entity.SparkApplication; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; import org.apache.hadoop.util.Progressable; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.*; import org.apache.spark.streaming.Durations; import org.apache.spark.streaming.api.java.JavaInputDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.apache.spark.streaming.kafka010.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; import java.io.IOException; import java.util.*; public class KafkaDataClassifyBySelf { private static Logger logger = LoggerFactory.getLogger(KafkaDataClassifyBySelf.class); static JavaInputDStream<ConsumerRecord<String, String>> 
directStream; static String hdfspath="/user/admin/FA_OFFLINE/"; static String classifyTbl; static String kuduMaster; static String bootstrap; public static void main(String[] args){ //System.setProperty("hadoop.home.dir","etc/dtconf/bin"); SparkApplication application = new SparkApplication(); application.setAppName("DemoStreaming"); application.setSerializer("org.apache.spark.serializer.KryoSerializer"); SparkSession sparkSession = DfSparkSession.bulid(application); JavaSparkContext javaSparkContext = new JavaSparkContext(sparkSession.sparkContext()); JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.seconds(600)); //设置kudumaster和时间切片配置表 ClusterEnvirFacility clusterEnvirFacility = new ClusterEnvirFacility(); ClusterEnvirEntity environmentInfo = clusterEnvirFacility.getEnvironmentInfo(sparkSession); kuduMaster=environmentInfo.getKuduMaster(); classifyTbl="impala::config.bus_calibration_time"; //获取kafka节点信息 bootstrap=environmentInfo.getBootstrap(); HashMap<String, Object> kafkaMap = new HashMap<>(); //Kafka服务监听端口 kafkaMap.put("bootstrap.servers",bootstrap); kafkaMap.put("key.deserializer", StringDeserializer.class); kafkaMap.put("value.deserializer", StringDeserializer.class); //消费者ID,随意指定 kafkaMap.put("group.id", "KafkaDataClassifyData"); //指定从latest(最新,其他版本的是largest这里不行)earliest(最早)处开始读取数据 kafkaMap.put("auto.offset.reset", "earliest"); //如果true,consumer定期地往zookeeper写入每个分区的offset kafkaMap.put("enable.auto.commit", "false"); Map<String, String> regularTime = KafkaDataClassifyBySelf.getRegularTime(sparkSession,kuduMaster,classifyTbl); ClassTag<Object> mapApply = ClassTag$.MODULE$.apply(Map.class); Broadcast<Object> broadcast = sparkSession.sparkContext().broadcast(regularTime, mapApply); String[] kafkaTopicsSplited = "data-mysql".split(","); Collection<String> topics = new HashSet<>(); for (String kafkaTopic : kafkaTopicsSplited) { topics.add(kafkaTopic); } directStream = KafkaUtils.createDirectStream( jssc, 
LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topics, kafkaMap) ); directStream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() { @Override public void call(JavaRDD<ConsumerRecord<String, String>> s) throws Exception { //获取批次数据的offset OffsetRange[] offsetRanges = ((HasOffsetRanges) s.rdd()).offsetRanges(); if (!s.isEmpty()){ //遍历批次kafka数据 JavaRDD<String> value = s.map(new Function<ConsumerRecord<String, String>, String>() { @Override public String call(ConsumerRecord<String, String> v1) throws Exception { Map<String, String> map = (Map<String, String>) broadcast.value(); return KafkaDataClassifyBySelf.dataClassify(v1.value().replaceAll("\r\n",""), map); } }); JavaPairRDD<String, String> javaPairRDD = value.mapToPair(new PairFunction<String, String, String>() { @Override public Tuple2<String, String> call(String s) throws Exception { JSONObject jsonObject = JSONObject.parseObject(s); String classifyDate = jsonObject.get("ClassifyDate").toString(); jsonObject.remove("ClassifyDate"); return new Tuple2<String, String>(classifyDate, jsonObject.toJSONString()); } }); javaPairRDD.saveAsHadoopFile(hdfspath, String.class, String.class, RDDMultipleTextOutputFormat.class); //kafka自身维护offset ((CanCommitOffsets) directStream.inputDStream()).commitAsync(offsetRanges); } } }); jssc.start(); try { jssc.awaitTermination(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static String dataClassify(String s,Map<String, String> map){ JSONObject jsonObject = JSONObject.parseObject(s); Object content = jsonObject.get("content"); if (content!=null){ JSONObject contentJson = JSONObject.parseObject(content.toString()); String dbName = contentJson.getString("Database").toLowerCase(); String timestamp = contentJson.getString("Timestamp"); String tabName = contentJson.getString("Table").toLowerCase(); String type = contentJson.getString("Type").toUpperCase(); if (!"DELETE".equals(type)){ if 
(DetailConstant.DB_SPILTD_DB.contains(dbName)){ String ccsTime = map.get("ccs"); boolean flag = DateUtils.dateCompare(timestamp, ccsTime); if (flag){ jsonObject.put("ClassifyDate",DateUtils.getDateByTimeStamp(timestamp)); }else { jsonObject.put("ClassifyDate",DateUtils.getLastDateByTimeStamp(timestamp)); } }else if (DetailConstant.TAB_SPILTD_TAB.contains(tabName)){ String tableTime = map.get(tabName); boolean flag = DateUtils.dateCompare(timestamp, tableTime); if (flag){ jsonObject.put("ClassifyDate",DateUtils.getDateByTimeStamp(timestamp)); }else { jsonObject.put("ClassifyDate",DateUtils.getLastDateByTimeStamp(timestamp)); } }else if (DetailConstant.TAB_SPILTD_COL.contains(tabName)){ Object data = contentJson.get("Data"); JSONObject colJson = JSONObject.parseObject(data.toString()); if (DetailConstant.VAL_SPILTD_COL.contains(colJson.get("PRODUCT_CODE").toString())){ String proTime = map.get(colJson.get("PRODUCT_CODE").toString().toUpperCase()); boolean flag = DateUtils.dateCompare(timestamp, proTime); if (flag){ jsonObject.put("ClassifyDate",DateUtils.getDateByTimeStamp(timestamp)); }else { jsonObject.put("ClassifyDate",DateUtils.getLastDateByTimeStamp(timestamp)); } }else { //产品id不在产品切分列表里的错误数据 jsonObject.put("ClassifyDate","error_productcode"); } }else { //非核心的数据(目前为催收的数据) jsonObject.put("ClassifyDate","non_account"); //logger.error("不是核心的数据"); } }else { //类型为删除的数据放入delete目录下 jsonObject.put("ClassifyDate","delete"); } } return jsonObject.toJSONString(); } public static Map<String,String> getRegularTime(SparkSession sparkSession,String kuduMaster,String tbl_name){ HashMap<String, String> regularTimeMap = new HashMap<>(); Map<String, String> kuduProperties = new HashMap<>(); kuduProperties.put("kudu.master", kuduMaster); kuduProperties.put("kudu.table", tbl_name); Dataset<Row> allRow = sparkSession.read().options(kuduProperties).format("kudu").load(); List<Tuple2<String, String>> dbTime = allRow.select(functions.col("db_name"), functions.col("time")) 
.where(functions.col("type").equalTo("DB")) .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .toJavaRDD().collect(); dbTime.forEach(f->{ regularTimeMap.put(f._1,f._2); }); List<Tuple2<String, String>> tblTime = allRow.select(functions.col("tbl_name"), functions.col("time")) .where(functions.col("type").equalTo("TBL")) .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .toJavaRDD().collect(); tblTime.forEach(f->{ regularTimeMap.put(f._1,f._2); }); List<Tuple2<String, String>> colTime = allRow.select(functions.col("col_value"), functions.col("time")) .where(functions.col("type").equalTo("COL")) .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .toJavaRDD().collect(); colTime.forEach(f->{ regularTimeMap.put(f._1,f._2); }); return regularTimeMap; } }
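// Custom output formats: RDDMultipleTextOutputFormat writes each key into its own sub-directory (one per classification date), and AppendTextOutputFormat appends new data to an existing output file instead of replacing it.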
public class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat<String, String> { private AppendTextOutputFormat theTextOutputFormat = null; public String generateFileNameForKeyValue(String key, String value, String name) { //输出格式 /ouput/key/key.csv return key + "/"+name; } @Override protected RecordWriter getBaseRecordWriter(FileSystem fs, JobConf job, String name, Progressable progressable) throws IOException { if (this.theTextOutputFormat == null) { this.theTextOutputFormat = new AppendTextOutputFormat(); } return this.theTextOutputFormat.getRecordWriter(fs, job, name, progressable); } }
public class AppendTextOutputFormat extends TextOutputFormat<Text,Text> { protected static class MyLineRecordWriter<K, V> implements RecordWriter<K, V> { private static final byte[] NEWLINE; protected DataOutputStream out; private final byte[] keyValueSeparator; public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) { this.out = out; this.keyValueSeparator = keyValueSeparator.getBytes(StandardCharsets.UTF_8); } public MyLineRecordWriter(DataOutputStream out) { this(out, "\t"); } private void writeObject(Object o) throws IOException { if (o instanceof Text) { Text to = (Text)o; this.out.write(to.getBytes(), 0, to.getLength()); } else { this.out.write(o.toString().getBytes(StandardCharsets.UTF_8)); } } public synchronized void write(K key, V value) throws IOException { boolean nullKey = key == null || key instanceof NullWritable; boolean nullValue = value == null || value instanceof NullWritable; if (!nullKey || !nullValue) { /*if (!nullKey) { this.writeObject(key); } if (!nullKey && !nullValue) { this.out.write(this.keyValueSeparator); }*/ if (!nullValue) { this.writeObject(value); } this.out.write(NEWLINE); } } public synchronized void close(Reporter reporter) throws IOException { this.out.close(); } static { NEWLINE = "\n".getBytes(StandardCharsets.UTF_8); } } @Override public RecordWriter getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException { boolean isCompressed = getCompressOutput(job); String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "\t"); if (!isCompressed) { Path file = FileOutputFormat.getTaskOutputPath(job, name); FileSystem fs = file.getFileSystem(job); Path newFile = new Path(FileOutputFormat.getOutputPath(job), name); FSDataOutputStream fileOut = null; if (fs.exists(newFile)) { //存在,追加写 fileOut = fs.append(newFile); } else { fileOut = fs.create(file, progress); } return new AppendTextOutputFormat.MyLineRecordWriter(fileOut, keyValueSeparator); } else 
{ Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job); Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension()); FileSystem fs = file.getFileSystem(job); Path newFile = new Path(FileOutputFormat.getOutputPath(job), name); FSDataOutputStream fileOut = null; if (fs.exists(newFile)) { //存在,追加写 fileOut = fs.append(newFile); } else { fileOut = fs.create(file, progress); } return new AppendTextOutputFormat.MyLineRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator); } } }