Spark Streaming: appending writes to multiple HDFS directories

The Spark Streaming job consumes Kafka data in one-minute batches, filters out the records belonging to each partition, and writes them to the corresponding HDFS directory. The stock TextOutputFormat creates a new file for every task of every batch, which produces a large number of small files, so a custom AppendTextOutputFormat is defined that appends to an already existing file instead. On its own, however, it still writes everything into a single directory, so MultipleOutputFormat is extended to write to multiple directories, laid out the same way as a Hive partitioned table.
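For example, with a base output path of /user/hive/warehouse/ and a partition key such as realtime_bl/dt=2020-01-01 (the table and partition names here are illustrative, not from the original job), the files end up laid out like a Hive partitioned table:

/user/hive/warehouse/realtime_bl/dt=2020-01-01/part-00000
/user/hive/warehouse/realtime_bl/dt=2020-01-02/part-00000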

import java.io.DataOutputStream

import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf, RecordWriter, TextOutputFormat}
import org.apache.hadoop.mapred.lib.MultipleOutputFormat
import org.apache.hadoop.util.{Progressable, ReflectionUtils}

// Text output format that appends to an existing file instead of creating a new one
class AppendTextOutputFormat extends TextOutputFormat[Any, Any] {
  override def getRecordWriter(ignored: FileSystem, job: JobConf, iname: String, progress: Progressable): RecordWriter[Any, Any] = {
    val isCompressed: Boolean = FileOutputFormat.getCompressOutput(job)
    val keyValueSeparator: String = job.get("mapreduce.output.textoutputformat.separator", "\t")
    // Allow the output file name to be overridden via the "filename" property
    val name = job.get("filename", iname)
    if (!isCompressed) {
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The file already exists in the output directory: append to it
        fs.append(newFile)
      } else {
        // First write: create the file under the task output path
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](fileOut, keyValueSeparator)
    } else {
      val codecClass: Class[_ <: CompressionCodec] = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
      // Instantiate the configured compression codec
      val codec: CompressionCodec = ReflectionUtils.newInstance(codecClass, job)
      // Build the file name, including the codec's extension
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name + codec.getDefaultExtension)

      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The compressed file already exists: append to it
        fs.append(newFile)
      } else {
        // First write: create the file under the task output path
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator)
    }
  }

}
// Output format that both splits records into per-key directories and appends to existing files
class RDDMultipleAppendTextOutputFormat extends MultipleOutputFormat[Any, Any] {
  private var theTextOutputFormat: AppendTextOutputFormat = null

  // Use the record key as the partition directory, so files end up under <output>/<key>/<name>
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key + "/" + name

  // Delegate the actual writing to the appending text output format
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf, name: String, progress: Progressable): RecordWriter[Any, Any] = {
    if (this.theTextOutputFormat == null) {
      this.theTextOutputFormat = new AppendTextOutputFormat()
    }
    this.theTextOutputFormat.getRecordWriter(fs, job, name, progress)
  }

  // Drop the key so that only the value is written to the output file
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}
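Since generateFileNameForKeyValue uses the record key as the partition directory, the parserLine function referenced in the streaming code below is expected to emit a (partition-directory, line) pair. A minimal sketch, assuming a "|"-delimited message with a timestamp in the first field; the delimiter, field position, and the table name "realtime_bl" are hypothetical placeholders, not part of the original job:

import org.apache.hadoop.io.Text

// Sketch of parserLine: turn one raw message into a (partition-directory, line) pair.
// The message layout and the "realtime_bl/dt=..." directory are illustrative assumptions.
def parserLine(line: String): (Text, Text) = {
  val fields = line.split("\\|", -1)
  val dt = fields(0).take(10)                     // e.g. "2020-01-01"
  (new Text(s"realtime_bl/dt=$dt"), new Text(line))
}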

In the Spark Streaming job:

// Parse the Kafka messages and write them to HDFS
def kafka2HDFS(stream: InputDStream[(String, String)]): Unit = {
  val lines = stream.map(_._2)   // keep only the message value
  lines.filter(_.contains("_REALTIME_BL"))
    .map(line => parserLine(line))
    .foreachRDD(rdd => rdd.saveAsHadoopFile("/user/hive/warehouse/", classOf[Text], classOf[Text],
      classOf[RDDMultipleAppendTextOutputFormat]))
}
Note that after new partition directories are written to HDFS, the Hive partition metadata has to be refreshed (for example with ALTER TABLE ... ADD PARTITION or MSCK REPAIR TABLE) before the new data becomes visible to queries.
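One way to do this from the driver is through Spark's HiveContext; a small sketch, where the table name "realtime_bl" and the partition column "dt" are hypothetical placeholders:

import org.apache.spark.sql.hive.HiveContext

// Register a freshly written partition with the Hive metastore so that it is queryable.
// Table name and partition column are illustrative assumptions, not from the original job.
def addHivePartition(hiveContext: HiveContext, dt: String): Unit = {
  hiveContext.sql(s"ALTER TABLE realtime_bl ADD IF NOT EXISTS PARTITION (dt='$dt')")
}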

