// 1. Converting an RDD to a PairRDD
package test.rddtest
import org.apache.spark.{SparkConf, SparkContext}
object RDD2PairRDDDemo {
/** Tags each element of a partition with its partition index, for inspecting
  * how Spark distributed the data across partitions.
  *
  * @param index partition ID supplied by `mapPartitionsWithIndex`
  * @param iter  elements of this partition
  * @return an iterator of "[partID:i, val: x]" strings, one per element
  */
def myfunc1(index: Int, iter: Iterator[(String)]) : Iterator[String] = {
  // Map lazily over the iterator; the original `toList ... .iterator`
  // materialized the whole partition in memory for no benefit.
  iter.map(x => s"[partID:$index, val: $x]")
}
/** Pair-RDD counterpart of `myfunc1`: tags each (Int, String) pair of a
  * partition with its partition index.
  *
  * @param index partition ID supplied by `mapPartitionsWithIndex`
  * @param iter  key/value pairs of this partition
  * @return an iterator of "[partID:i, val: (k,v)]" strings, one per pair
  */
def myfunc2(index:Int,iter:Iterator[(Int,String)]):Iterator[String]={
  // Map lazily instead of materializing the partition via `toList`,
  // keeping the implementation consistent with `myfunc1`.
  iter.map(x => s"[partID:$index, val: $x]")
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("pair RDD").setMaster("local")
val sc = new SparkContext(conf)
val SingleRDD = sc.parallelize(List("scala","python","java","Spark","hadoop"),2)
SingleRDD.mapPartitionsWithIndex(myfunc1).collect.foreach(println)
//[partID:0, val: scala]
//[p