Reading and writing GBK-encoded Hadoop files with Spark

I ran into a small problem at work recently: the team next door insists on Hadoop files in GBK encoding. The requirement sounds odd, but it also turned out to be less trivial than expected, since Hadoop's Text and Spark's textFile/saveAsTextFile are hard-wired to UTF-8. Working through it (and flipping through the books again) deepened my understanding of Scala's implicit keyword. The small utility object below lets you call sc.gbkTextFile(path) to read GBK files, and rdd.saveAsGBKTextFile(path) on an RDD[String] to write them.

import java.io.{DataOutputStream, IOException}
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io._
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred._
import org.apache.hadoop.util.{Progressable, ReflectionUtils}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object RDDTextEncoding {

  // Implicit value class: adds saveAsGBKTextFile to RDD[String].
  implicit class GBKSaver(val rdd: RDD[String]) extends AnyVal {
    def saveAsGBKTextFile(path: String): Unit = {
      // Pair each line with a NullWritable key; GBKTextOutputFormat skips null keys on write.
      rdd.map(row => (NullWritable.get(), row))
        .saveAsHadoopFile[GBKTextOutputFormat[NullWritable, String]](path)
    }
  }

  // Implicit value class: adds gbkTextFile to SparkContext.
  implicit class GBKReader(val sc: SparkContext) extends AnyVal {
    def gbkTextFile(path: String): RDD[String] = {
      // Decode each Text record's bytes as GBK rather than the UTF-8 that Text assumes.
      // Text reuses its backing array, so only the first getLength bytes are valid.
      sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 1)
        .map(p => new String(p._2.getBytes, 0, p._2.getLength, "GBK"))
    }
  }

  // Custom OutputFormat that writes each record as a GBK-encoded line.
  class GBKTextOutputFormat[K, V] extends FileOutputFormat[K, V] {
    val encoding = "GBK"

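    // A trimmed-down copy of Hadoop's TextOutputFormat.LineRecordWriter that
    // encodes every line with GBK instead of UTF-8.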
    class LineRecordWriter[K, V](var out: DataOutputStream) extends RecordWriter[K, V] {
      @throws[IOException]
      def writeObject(o: Any): Unit = {
        out.write(o.toString.getBytes(encoding))
      }

      @throws[IOException]
      override def write(key: K, value: V): Unit = {
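        // Unlike TextOutputFormat, only pairs with a null/NullWritable key and a
        // non-null value are written; everything else is silently dropped.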
        val nullKey = key == null || key.isInstanceOf[NullWritable]
        val nullValue = value == null || value.isInstanceOf[NullWritable]
        if (nullKey && !nullValue) {
          writeObject(value)
          out.write("\n".getBytes(encoding))
        }
      }

      @throws[IOException]
      override def close(reporter: Reporter): Unit = {
        out.close()
      }
    }

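    // Same flow as TextOutputFormat.getRecordWriter: honor the job's compression
    // settings, then wrap the resulting stream in the GBK-aware LineRecordWriter.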
    override def getRecordWriter(ignored: FileSystem, job: JobConf, name: String, progress: Progressable): RecordWriter[K, V] = {
      if (FileOutputFormat.getCompressOutput(job)) {
        val codecClass = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
        val codec = ReflectionUtils.newInstance(codecClass, job)
        val file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
        val fs = file.getFileSystem(job)
        val fileOut = fs.create(file, progress)
        new LineRecordWriter[K, V](new DataOutputStream(codec.createOutputStream(fileOut)))
      } else {
        val file = FileOutputFormat.getTaskOutputPath(job, name)
        val fs = file.getFileSystem(job)
        val fileOut = fs.create(file, progress)
        new LineRecordWriter[K, V](fileOut)
      }
    }
  }

}
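
A minimal usage sketch (the sample strings and the /tmp/gbk-out path are made up; it assumes a live SparkContext named sc). Importing the object's members brings both implicit classes into scope, so the compiler rewrites rdd.saveAsGBKTextFile(path) into new GBKSaver(rdd).saveAsGBKTextFile(path); and because the wrappers extend AnyVal, no wrapper object is allocated at runtime:

import RDDTextEncoding._

// Write: GBKSaver adds saveAsGBKTextFile to any RDD[String].
val lines = sc.parallelize(Seq("你好,世界", "第二行"))  // sample data (hypothetical)
lines.saveAsGBKTextFile("/tmp/gbk-out")

// Read back: GBKReader adds gbkTextFile to SparkContext.
val decoded = sc.gbkTextFile("/tmp/gbk-out")
decoded.collect().foreach(println)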

 
