Spark Streaming consumes the Kafka data in one-minute batches, filters out the records belonging to each partition, and writes them to the corresponding HDFS directory. The stock TextOutputFormat writes a new file per batch and therefore still produces a large number of small files, so a custom AppendTextOutputFormat is defined that appends to an existing file instead. On its own it writes everything into a single directory, so MultipleOutputFormat is also extended to write into multiple directories, laid out like the partitions of a Hive partitioned table.
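For example (the table name and partition column here are only illustrative), a record belonging to the 2020-06-01 partition should end up under /user/hive/warehouse/ods_realtime_bl/dt=2020-06-01/, with each micro-batch appending to the part files inside that directory.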
import java.io.DataOutputStream

import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.mapred.lib.MultipleOutputFormat
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf, RecordWriter, TextOutputFormat}
import org.apache.hadoop.util.{Progressable, ReflectionUtils}

// Text output format that appends to an existing file instead of creating a new one
class AppendTextOutputFormat extends TextOutputFormat[Any, Any] {
  override def getRecordWriter(ignored: FileSystem, job: JobConf, iname: String, progress: Progressable): RecordWriter[Any, Any] = {
    val isCompressed: Boolean = FileOutputFormat.getCompressOutput(job)
    val keyValueSeparator: String = job.get("mapreduce.output.textoutputformat.separator", "\t")
    // Allow the output file name to be overridden through the "filename" job property
    val name = job.get("filename", iname)
    if (!isCompressed) {
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The file already exists in the final output directory: append to it
        fs.append(newFile)
      } else {
        // First write: create the file under the task output path
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](fileOut, keyValueSeparator)
    } else {
      val codecClass: Class[_ <: CompressionCodec] = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
      // Instantiate the configured codec
      val codec: CompressionCodec = ReflectionUtils.newInstance(codecClass, job)
      // Build the file name including the codec's extension
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name + codec.getDefaultExtension)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // The compressed file already exists: append to it
        fs.append(newFile)
      } else {
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator)
    }
  }
}
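The "filename" property read above is not a standard Hadoop setting; it is a custom hook that pins the output file name so that every batch appends to the same file. A minimal standalone sketch, assuming a SparkContext sc and a pair RDD pairs: RDD[(Text, Text)] (the path and file name below are placeholders); in this pipeline, however, AppendTextOutputFormat is driven through the multi-directory format defined next:

// Standalone use of AppendTextOutputFormat (illustrative only)
val conf = new JobConf(sc.hadoopConfiguration)
conf.set("filename", "realtime_bl")  // pin the file name so every batch appends to the same file
pairs.coalesce(1)                    // HDFS append allows only a single writer per file
  .saveAsHadoopFile("/tmp/append_demo",
    classOf[Text], classOf[Text], classOf[AppendTextOutputFormat], conf)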
// Multi-directory output with append: the key determines the target directory
class RDDMultipleAppendTextOutputFormat extends MultipleOutputFormat[Any, Any] {
  private var theTextOutputFormat: AppendTextOutputFormat = null

  // Build the output path as <key>/<task file name>, i.e. one directory per key
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key + "/" + name

  // Delegate the actual writing to the appending text output format
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf, name: String, arg3: Progressable): RecordWriter[Any, Any] = {
    if (this.theTextOutputFormat == null) {
      this.theTextOutputFormat = new AppendTextOutputFormat()
    }
    this.theTextOutputFormat.getRecordWriter(fs, job, name, arg3)
  }

  // Reset the key to NullWritable so only the value is written to the file;
  // the key is used solely to build the directory path
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}
In the Spark Streaming job:
import org.apache.hadoop.io.Text
import org.apache.spark.streaming.dstream.InputDStream

// Parse the Kafka messages and write them to HDFS
def kafka2HDFS(stream: InputDStream[(String, String)]): Unit = {
  val lines = stream.map(_._2) // keep only the message value
  lines.filter(_.contains("_REALTIME_BL"))
    .map(line => parserLine(line)) // turn each raw line into a (partition-path, record) pair
    .foreachRDD(rdd => rdd.saveAsHadoopFile("/user/hive/warehouse/",
      classOf[Text], classOf[Text], classOf[RDDMultipleAppendTextOutputFormat]))
}
Note that after new partition directories and files have been written, the Hive partition metadata has to be refreshed before the new data becomes visible to queries.
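For example, with Spark SQL (assuming a SparkSession named spark; the table name and partition value are placeholders):

// Refresh Hive partition metadata after new partition directories appear on HDFS
spark.sql("MSCK REPAIR TABLE ods_realtime_bl")
// or register a single partition explicitly:
spark.sql("ALTER TABLE ods_realtime_bl ADD IF NOT EXISTS PARTITION (dt='2020-06-01')")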