import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by maokm on 2017/7/20.
*/
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
/**
 * Output format that writes each (key, value) pair to a file named after the key,
 * instead of the default `part-NNNNN` files.
 *
 * `generateFileNameForKeyValue` receives the record's key and value plus `name`,
 * the default per-reducer file name (e.g. "part-00000"); the returned string is
 * used as the output file name inside the target directory.
 */
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    // toString is identical to asInstanceOf[String] for String keys (the only case
    // in this job) but avoids a ClassCastException if keys of another type appear.
    key.toString
}
object test {
  /**
   * Word-count demo: counts words in a small in-memory array and writes one
   * output file per word via [[RDDMultipleTextOutputFormat]].
   *
   * @param args optional first argument overrides the output directory
   *             (defaults to "E://out", preserving the original behavior)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      val outputPath = args.headOption.getOrElse("E://out")
      val arr = Array("hadoop", "hadoop", "spark", "hadoop", "hello", "work", "spark")
      val words = sc.parallelize(arr)
      // NOTE: each array element is already a single word, so split(",") is a
      // no-op here; it is kept so comma-separated input lines would also work.
      val counts = words.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
      counts.saveAsHadoopFile(outputPath, classOf[String], classOf[Integer], classOf[RDDMultipleTextOutputFormat])
    } finally {
      // Always release Spark resources, even if the job above throws
      // (e.g. when the output directory already exists).
      sc.stop()
    }
  }
}
// The generateFileNameForKeyValue method of RDDMultipleTextOutputFormat takes three
// parameters: key and value are the record's key and value, and name is the default
// per-reducer file name (the reducer's partition number, e.g. "part-00000").
// Resulting output-directory file layout: