rdd写压缩
/**
 * Writes a DataFrame as gzip-compressed text.
 *
 * Each row is turned into one line of text by joining its field values with
 * `sep` (via `mkString`); the resulting lines are then saved under `path`
 * with `GzipCodec` as the compression codec.
 *
 * @param df   the DataFrame to write
 * @param path output location handed to `saveAsTextFile`
 * @param sep  field separator placed between the values of a row
 */
def saveDFAsTxtGz(df: DataFrame, path: String, sep: String): Unit = {
  val lines = df.rdd.map(_.mkString(sep))
  lines.saveAsTextFile(path, classOf[GzipCodec])
}
dataframe写orc压缩
// Write the DataFrame as ORC to `output`, overwriting any existing data and
// requesting zlib compression through the "orc.compress" option.
// String literals must use plain ASCII double quotes; typographic quotes do not compile.
df.write.mode(SaveMode.Overwrite)
  .option("orc.compress", "zlib")
  .orc(output)
Spark禁用打_SUCCESS标记
// Write the DataFrame as zlib-compressed ORC to `output` (overwrite mode) and
// turn off "mapreduce.fileoutputcommitter.marksuccessfuljobs" so that the
// _SUCCESS marker is not produced for this write.
// String literals must use plain ASCII double quotes; typographic quotes do not compile.
df.write.mode(SaveMode.Overwrite)
  .option("orc.compress", "zlib")
  .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", false)
  .orc(output)
隐式转换导入冲突处理（下面两个导入功能等价，同时导入可能产生隐式歧义，二选一即可）
import spark.sqlContext.implicits._
import spark.implicits._
Java/Scala 集合转换处理（JavaConversions 为隐式转换且已废弃，推荐使用 JavaConverters 的 .asScala / .asJava 显式转换）
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
函数包
import org.apache.spark.sql.functions._