Spark decodes text files as UTF-8 by default, so reading a GBK-encoded file produces garbled characters. The reason is that SparkContext.textFile maps each Hadoop Text record through toString, which always decodes UTF-8. The code below works around this by reading the raw bytes via hadoopFile and decoding each line as GBK explicitly.
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.rdd.RDD
val input = Utils.Basepath + "/viewLog/in/" // input path for the user view logs
val conf = new SparkConf().setAppName("GbkFileReader") // an application name is required; "GbkFileReader" is a placeholder
val context = new SparkContext(conf)
// Read raw records via the Hadoop API; Text.getBytes returns the backing byte
// array, of which only the first getLength bytes are valid, so decode exactly
// that range as GBK instead of letting Text.toString decode UTF-8.
val inputRdd: RDD[String] = context.hadoopFile(input, classOf[TextInputFormat],
    classOf[LongWritable], classOf[Text])
  .map(pair => new String(pair._2.getBytes, 0, pair._2.getLength, "GBK"))
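A quick usage sketch to verify the decoding. The take/println calls and context.stop() below are illustrative additions, not part of the original snippet:

inputRdd.take(5).foreach(println) // print a few decoded lines; GBK Chinese text should now display correctly
context.stop() // shut down the SparkContext when finished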