By default, Spark names its output files part-00000, part-00001, and so on, one file per partition, and saveAsTextFile gives you no direct control over those names.
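As a quick illustration of the default naming (a minimal sketch; the output path /tmp/default-out is just a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

object DefaultFileNames {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("DefaultFileNames").setMaster("local[2]"))
    /** Two partitions -> the output directory gets part-00000 and part-00001 (plus a _SUCCESS marker) */
    sc.parallelize(Seq("a", "b", "c", "d"), 2).saveAsTextFile("/tmp/default-out")
    sc.stop()
  }
}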
To pick the file names yourself, subclass MultipleTextOutputFormat and write the RDD with saveAsHadoopFile. Full example:
import org.apache.spark._
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

/** Step 1:
  * Define a class that extends MultipleTextOutputFormat and overrides
  * generateFileNameForKeyValue to control how output file names are
  * generated; here each output file is named after the RDD key,
  * with a .csv suffix.
  */
class MyTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    /** File-naming rule: <key>.csv */
    key.asInstanceOf[String] + ".csv"
  }
}

/**
  * Test the naming rule
  */
object MyOutPutHdfsFileName {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SplitTest").setMaster("local[2]")
    val sc = new SparkContext(conf)
    /** Step 2: build some test data */
    val rdd = sc.parallelize(List(
      ("id", "salemoney", "saleQty", "time"),
      ("1011", "450", "3", "2020-01-13 00:00:05"),
      ("1012", "496", "20", "2020-01-13 10:20:15"),
      ("1013", "498", "8", "2020-01-13 10:20:15"),
      ("1014", "400", "9", "2020-01-13 10:20:15")))
    /** Step 3: make the key the desired file name and join the value fields
      * with commas so each record becomes one CSV line; HashPartitioner(1)
      * forces a single partition so everything lands in one file */
    rdd.map(r => ("liucf-file-name-test", r._1 + "," + r._2 + "," + r._3 + "," + r._4))
      .partitionBy(new HashPartitioner(1))
      /** Step 4: call saveAsHadoopFile with the output directory, the key and
        * value classes, and our MyTextOutputFormat as the output format */
      .saveAsHadoopFile("/data/lucf", classOf[String], classOf[String], classOf[MyTextOutputFormat])
    sc.stop()
  }
}
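One detail worth knowing about the file contents: the underlying TextOutputFormat writes each record as key<TAB>value, so the key "liucf-file-name-test" also appears at the start of every line. If you want each line to contain only the CSV value, MultipleTextOutputFormat also lets you override generateActualKey. A minimal sketch of that variant (the class name MyCsvOnlyOutputFormat is made up for illustration, not part of the original test):

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

/** Same file naming as above, but the key is dropped from the file contents. */
class MyCsvOnlyOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.asInstanceOf[String] + ".csv"

  /** Returning NullWritable makes TextOutputFormat write only the value,
    * without the "key<TAB>" prefix. */
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}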
Test result:
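Given the naming rule above (a single partition and a constant key), the /data/lucf output directory should end up with one file named liucf-file-name-test.csv, plus the usual _SUCCESS marker, instead of the default part-00000.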