最近工作遇到一个小问题：隔壁部门非要输出 GBK 格式的 Hadoop 文件。虽然这个要求很奇怪，但仔细研究后发现并没有那么容易。翻了翻书，对 implicit 这个关键字理解得更深入了一些，于是编写了这个小工具类：可以通过 sc.gbkTextFile(path) 读取 GBK 文件，通过 RDD[String].saveAsGBKTextFile(path) 写入 GBK 文件。
import java.io.{DataOutputStream, IOException}
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io._
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred._
import org.apache.hadoop.util.{Progressable, ReflectionUtils}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
/**
 * Utilities for reading and writing GBK-encoded text files with Spark.
 *
 * Importing the implicit classes in this object enables
 * `sc.gbkTextFile(path)` on a [[SparkContext]] and
 * `rdd.saveAsGBKTextFile(path)` on an `RDD[String]`.
 */
object RDDTextEncoding {

  /** Implicit value class adding `saveAsGBKTextFile` to `RDD[String]`. */
  implicit class GBKSaver(val rdd: RDD[String]) extends AnyVal {

    /**
     * Saves the RDD as a Hadoop text file encoded in GBK.
     *
     * Keys are `NullWritable`, so [[GBKTextOutputFormat]] emits only the
     * string values, one per line.
     *
     * @param path output directory (any Hadoop-supported URI)
     */
    def saveAsGBKTextFile(path: String): Unit = {
      rdd
        .map(row => (NullWritable.get(), row))
        .saveAsHadoopFile[GBKTextOutputFormat[NullWritable, String]](path)
    }
  }

  /** Implicit value class adding `gbkTextFile` to [[SparkContext]]. */
  implicit class GBKReader(val sc: SparkContext) extends AnyVal {

    /**
     * Reads a GBK-encoded text file as an `RDD[String]` of Unicode strings.
     *
     * @param path          input path (any Hadoop-supported URI)
     * @param minPartitions minimum number of partitions; defaults to 1,
     *                      matching the previously hard-coded value
     */
    def gbkTextFile(path: String, minPartitions: Int = 1): RDD[String] = {
      sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], minPartitions)
        // Decode Text's raw bytes with GBK instead of its default UTF-8 view.
        // Use (getBytes, 0, getLength): Text's backing array may be longer
        // than the valid data.
        .map { case (_, text) => new String(text.getBytes, 0, text.getLength, "GBK") }
    }
  }

  /**
   * An `OutputFormat` that writes records as GBK-encoded text lines.
   *
   * Mirrors Hadoop's `TextOutputFormat` semantics — each record becomes
   * `key TAB value` followed by a newline, with a null/`NullWritable` key
   * or value (and its separator) omitted — but encodes with GBK rather
   * than UTF-8.
   */
  class GBKTextOutputFormat[K, V] extends FileOutputFormat[K, V] {
    val encoding = "GBK"

    // Inner type parameters renamed WK/WV so they do not shadow the outer K/V.
    class LineRecordWriter[WK, WV](var out: DataOutputStream) extends RecordWriter[WK, WV] {
      private val keyValueSeparator = "\t".getBytes(encoding)
      private val newline = "\n".getBytes(encoding)

      // Encode any object's string form with the configured charset.
      @throws[IOException]
      private def writeObject(o: Any): Unit = out.write(o.toString.getBytes(encoding))

      /**
       * Writes one record following TextOutputFormat conventions.
       *
       * Fix: the previous implementation only handled the (null key,
       * non-null value) case and silently dropped every record that had a
       * real key. Records whose key AND value are both null-like are still
       * skipped entirely (no blank line).
       */
      @throws[IOException]
      override def write(key: WK, value: WV): Unit = {
        val skipKey = key == null || key.isInstanceOf[NullWritable]
        val skipValue = value == null || value.isInstanceOf[NullWritable]
        if (!skipKey || !skipValue) {
          if (!skipKey) writeObject(key)
          if (!skipKey && !skipValue) out.write(keyValueSeparator)
          if (!skipValue) writeObject(value)
          out.write(newline)
        }
      }

      @throws[IOException]
      override def close(reporter: Reporter): Unit = out.close()
    }

    /**
     * Opens the task output file (honoring the job's compression settings,
     * Gzip by default) and returns a GBK line writer over it.
     */
    override def getRecordWriter(ignored: FileSystem, job: JobConf, name: String, progress: Progressable): RecordWriter[K, V] = {
      if (FileOutputFormat.getCompressOutput(job)) {
        val codecClass = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
        val codec = ReflectionUtils.newInstance(codecClass, job)
        // Append the codec's extension (e.g. ".gz") to the part-file name.
        val file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
        val fileOut = file.getFileSystem(job).create(file, progress)
        new LineRecordWriter[K, V](new DataOutputStream(codec.createOutputStream(fileOut)))
      } else {
        val file = FileOutputFormat.getTaskOutputPath(job, name)
        val fileOut = file.getFileSystem(job).create(file, progress)
        new LineRecordWriter[K, V](fileOut)
      }
    }
  }
}