Spark Streaming consumes the Kafka data in one-minute batches, filters out the records belonging to each partition, and writes them to the corresponding HDFS directory. The stock TextOutputFormat writes a new file per batch and therefore still produces a large number of small files, so a custom AppendTextOutputFormat is defined that appends to an existing file instead. On its own it writes everything into a single directory, so MultipleOutputFormat is also extended to write into multiple directories, laid out like the partitions of a Hive partitioned table.
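For example (the table name and partition column here are only illustrative), a record belonging to the 2020-06-01 partition should end up under /user/hive/warehouse/ods_realtime_bl/dt=2020-06-01/, with each micro-batch appending to the part files inside that directory.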
import java.io.DataOutputStream

import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.mapred.lib.MultipleOutputFormat
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf, RecordWriter, TextOutputFormat}
import org.apache.hadoop.util.{Progressable, ReflectionUtils}

// Text output format that appends to an existing file instead of creating a new one
class AppendTextOutputFormat extends TextOutputFormat[Any, Any] {
  override def getRecordWriter(ignored: FileSystem, job: JobConf, iname: String, progress: Progressable): RecordWriter[Any, Any] = {
    val isCompressed: Boolean = FileOutputFormat.getCompressOutput(job)
    val keyValueSeparator: String = job.get("mapreduce.output.textoutputformat.separator", "\t")
    // Allow the output file name to be overridden through the "filename" job property
    val name = job.get("filename", iname)
    if (!isCompressed) {
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The file already exists in the final output directory: append to it
        fs.append(newFile)
      } else {
        // First write: create the file under the task output path
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](fileOut, keyValueSeparator)
    } else {
      val codecClass: Class[_ <: CompressionCodec] = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
      // Instantiate the configured codec
      val codec: CompressionCodec = ReflectionUtils.newInstance(codecClass, job)
      // Build the file name including the codec's extension
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name + codec.getDefaultExtension)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The compressed file already exists: append to it
        fs.append(newFile)
      } else {
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator)
    }
  }
}
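The "filename" property read above is not a standard Hadoop setting; it is a custom hook that pins the output file name so that every batch appends to the same file. A minimal standalone sketch, assuming a SparkContext sc and a pair RDD pairs: RDD[(Text, Text)] (the path and file name below are placeholders); in this pipeline, however, AppendTextOutputFormat is driven through the multi-directory format defined next:

// Standalone use of AppendTextOutputFormat (illustrative only)
val conf = new JobConf(sc.hadoopConfiguration)
conf.set("filename", "realtime_bl")  // pin the file name so every batch appends to the same file
pairs.coalesce(1)                    // HDFS append allows only a single writer per file
  .saveAsHadoopFile("/tmp/append_demo",
    classOf[Text], classOf[Text], classOf[AppendTextOutputFormat], conf)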
// Multi-directory output with append: the key determines the target directory
class RDDMultipleAppendTextOutputFormat extends MultipleOutputFormat[Any, Any] {
  private var theTextOutputFormat: AppendTextOutputFormat = null

  // Build the output path as <key>/<task file name>, i.e. one directory per key
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key + "/" + name

  // Delegate the actual writing to the appending text output format
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf, name: String, arg3: Progressable): RecordWriter[Any, Any] = {
    if (this.theTextOutputFormat == null) {
      this.theTextOutputFormat = new AppendTextOutputFormat()
    }
    this.theTextOutputFormat.getRecordWriter(fs, job, name, arg3)
  }

  // Reset the key to NullWritable so only the value is written to the file;
  // the key is used solely to build the directory path
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}
In the Spark Streaming job:
import org.apache.hadoop.io.Text
import org.apache.spark.streaming.dstream.InputDStream

// Parse the Kafka messages and write them to HDFS
def kafka2HDFS(stream: InputDStream[(String, String)]): Unit = {
  val lines = stream.map(_._2) // keep only the message value
  lines.filter(_.contains("_REALTIME_BL"))
    .map(line => parserLine(line)) // turn each raw line into a (partition-path, record) pair
    .foreachRDD(rdd => rdd.saveAsHadoopFile("/user/hive/warehouse/",
      classOf[Text], classOf[Text], classOf[RDDMultipleAppendTextOutputFormat]))
}
Note that after new partition directories and files have been written, the Hive partition metadata has to be refreshed before the new data becomes visible to queries.
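For example, with Spark SQL (assuming a SparkSession named spark; the table name and partition value are placeholders):

// Refresh Hive partition metadata after new partition directories appear on HDFS
spark.sql("MSCK REPAIR TABLE ods_realtime_bl")
// or register a single partition explicitly:
spark.sql("ALTER TABLE ods_realtime_bl ADD IF NOT EXISTS PARTITION (dt='2020-06-01')")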