filter:
Data file: in/sample.txt
Example: find the lines in sample.txt that contain "zks".
aa bb cc aa aa aa dd dd ee ee ee ee
ff aa bb zks
ee kks
ee zz zks
Scala
package nj.zb.kb09
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Filter {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("filter")
val sc = new SparkContext(conf)
val rdd: RDD[String] = sc.textFile("in/sample.txt")
rdd.filter(x=>x.contains("zks")).collect.foreach(println)
}
}
java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 14:08
* @Description:
**/
public class FilterJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[1]").setAppName("filter");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("in/sample.txt");
/*JavaRDD<String> filterRDD = lines.filter(new Function<String, Boolean>() {
public Boolean call(String v1) throws Exception {
return v1.contains("zks");
}
});*/
//the lambda expression shorthand can be used instead
JavaRDD<String> filterRDD = lines.filter( (v1) -> v1.contains("zks") );
List<String> collect = filterRDD.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
Output:
ff aa bb zks
ee zz zks
map:
map() takes a function, applies it to every element of the RDD, and uses the function's return value as the corresponding element of the result RDD.
map is a one-to-one relationship: one input element produces exactly one output element.
Scala:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object MapDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[1]").setAppName("mapdemo")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.textFile("in/sample.txt")
val rdd2: RDD[Array[String]] = rdd1.map(x=>x.split("\\s+"))
rdd2.collect.foreach(println)
//Output:
//[Ljava.lang.String;@b2f4ece
//[Ljava.lang.String;@7e1f584d
//[Ljava.lang.String;@7dff6d05
//[Ljava.lang.String;@45d64d27
rdd2.collect.foreach(_.foreach(println))
//Output (one word per line): aa bb cc aa aa aa dd dd ee ee ee ee ff aa bb zks ee kks ee zz zks
}
}
java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class MapJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[1]").setAppName("map");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        // map is one-to-one: each line becomes one Iterable<String> holding its words
        JavaRDD<Iterable<String>> mapRdd = lines.map(new Function<String, Iterable<String>>() {
            @Override
            public Iterable<String> call(String v1) throws Exception {
                String[] split = v1.split("\\s+");
                return Arrays.asList(split);
            }
        });
        // print each line's word list as a whole, e.g. [ff, aa, bb, zks]
        Iterator<Iterable<String>> listIterator = mapRdd.collect().iterator();
        while (listIterator.hasNext()) {
            System.out.println(listIterator.next());
        }
        // print the words one by one
        List<Iterable<String>> collect = mapRdd.collect();
        for (Iterable<String> it : collect) {
            Iterator<String> wordIterator = it.iterator();
            while (wordIterator.hasNext()) {
                System.out.println(wordIterator.next());
            }
        }
    }
}
flatMap:
Sometimes we want to generate multiple output elements from a single element; the operation that does this is called flatMap().
The function passed to flatMap is applied to every element, and for each element it returns an iterator over the elements it produces.
For example, splitting the data into words:
scala:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object FlatMapDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[1]").setAppName("flatmapdemo")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.textFile("in/sample.txt")
val rdd2: RDD[String] = rdd1.flatMap(_.split("\\s+"))
rdd2.collect.foreach(println)
}
}
java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 14:08
* @Description:
**/
public class FlatMapJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("flatMap");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("in/sample.txt");
JavaRDD<String> stringJavaRDD = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
String[] split = s.split("\\s+");
Iterator<String> iterator = Arrays.asList(split).iterator();
return iterator;
}
});
List<String> collect = stringJavaRDD.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
distinct:
Return a new RDD containing the distinct elements in this RDD.
distinct is used for deduplication: the RDD we build may contain duplicate elements, and distinct removes them. Note that this method involves a shuffle, so it is expensive.
scala:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object DistinctDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[1]").setAppName("distinctDemo")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.parallelize(List("aa","bb","cc","dd","aa","bb"))
rdd1.distinct().collect.foreach(println)
}
}
java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 16:59
* @Description:
**/
public class DistinctJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("distinct");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("aa","bb","cc","dd","aa","bb","dd"));
JavaRDD<String> distinct = rdd1.distinct();
List<String> collect = distinct.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
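Because distinct shuffles the data, the number of output partitions can also be passed explicitly. A minimal Scala sketch (the partition count 4 is just an illustrative value):
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object DistinctPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("distinctPartitions")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[String] = sc.parallelize(List("aa", "bb", "cc", "dd", "aa", "bb"))
    // distinct(numPartitions) controls how many partitions the shuffle produces
    val deduped: RDD[String] = rdd.distinct(4)
    println(deduped.getNumPartitions) // 4
    deduped.collect.foreach(println)
  }
}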
union
Merges two RDDs into one; union simply concatenates them and does not deduplicate.
scala:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object UnionDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("union")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.parallelize(List("aa", "bb", "cc"))
val rdd2: RDD[String] = sc.parallelize(List("A", "B", "C"))
val union: RDD[String] = rdd1.union(rdd2)
union.collect.foreach(println)
}
}
java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 18:22
* @Description:
**/
public class UnionJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("union");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("aa", "bb", "cc"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("A", "B", "C"));
JavaRDD<String> union = rdd1.union(rdd2);
List<String> collect = union.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
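Since union keeps duplicates, chain a distinct if the merged result needs to be unique (this adds a shuffle). A minimal Scala sketch:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object UnionDistinctDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("unionDistinct")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd1: RDD[String] = sc.parallelize(List("aa", "bb", "cc"))
    val rdd2: RDD[String] = sc.parallelize(List("aa", "B", "C"))
    // union keeps duplicates: "aa" appears twice in the result
    rdd1.union(rdd2).collect.foreach(println)
    // chain distinct() to remove them
    rdd1.union(rdd2).distinct().collect.foreach(println)
  }
}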
intersection
RDD1.intersection(RDD2) returns the intersection of the two RDDs, with duplicates removed.
intersection needs to shuffle the data, so it is relatively expensive.
scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object IntersectionDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("intersection")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.parallelize(List("aa", "aa" ,"bb", "cc"))
val rdd2: RDD[String] = sc.parallelize(List("aa", "aa", "C"))
val intersection: RDD[String] = rdd1.intersection(rdd2)
intersection.collect.foreach(println) // Output: aa
}
}
java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 18:22
* @Description:
**/
public class IntersectionJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("intersection");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("aa", "bb", "cc"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("aa", "bb", "C"));
JavaRDD<String> intersection = rdd1.intersection(rdd2);
List<String> collect = intersection.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
subtract
RDD1.subtract(RDD2) returns the elements that appear in RDD1 but not in RDD2, without deduplication.
scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object SubtractDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("substract")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.parallelize(List("aa", "aa", "cc"))
val rdd2: RDD[String] = sc.parallelize(List("A", "B", "cc"))
val subtract: RDD[String] = rdd1.subtract(rdd2)
subtract.collect.foreach(println) // Output: aa aa
}
}
java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 18:22
* @Description:
**/
public class SubtractJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("subtract");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("aa", "aa", "cc"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("cc", "B", "C"));
JavaRDD<String> subtract = rdd1.subtract(rdd2);
List<String> collect = subtract.collect();
for (String s : collect) {
System.out.println(s);
}
}
}
cartesian
RDD1.cartesian(RDD2) returns the Cartesian product of RDD1 and RDD2; this is very expensive.
scala:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CartesianDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cartesian")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd1: RDD[String] = sc.parallelize(List("aa","bb","cc"))
val rdd2: RDD[Int] = sc.parallelize(List(1,2,3,4,5))
val cartesian: RDD[(String, Int)] = rdd1.cartesian(rdd2)
cartesian.collect.foreach(println)
}
}
java:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 18:57
* @Description:
**/
public class CartesianJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("cartesian");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("aa", "bb", "cc"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("aa", "cc"));
JavaPairRDD<String, String> cartesian = rdd1.cartesian(rdd2);
List<Tuple2<String, String>> collect = cartesian.collect();
for (Tuple2<String, String> stringStringTuple2 : collect) {
System.out.println(stringStringTuple2);
}
}
}
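The size of the result is the product of the two input sizes, which is why the cost grows so quickly. A minimal Scala sketch that checks the count:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CartesianSizeDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cartesianSize")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd1: RDD[String] = sc.parallelize(List("aa", "bb", "cc"))
    val rdd2: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5))
    // 3 elements x 5 elements = 15 pairs
    println(rdd1.cartesian(rdd2).count()) // 15
  }
}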
MapToPair
Create a pair RDD using the first word of each line as the key and 1 as the value.
Scala version:
Scala has no mapToPair function; in the Scala version a plain map is enough.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object MapToPairDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("maptopair")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd: RDD[String] = sc.textFile("in/sample.txt")
val rdd1: RDD[(String, Int)] = rdd.map(x=>(x.split("\\s+")(0),1))
rdd1.collect.foreach(println)
}
}
Java version:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/7 20:15
* @Description:
**/
public class MapToPairJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("maptopair");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> stringJavaRDD = sc.textFile("in/sample.txt");
JavaPairRDD<String, Integer> pairRdd = stringJavaRDD.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
String[] s1 = s.split("\\s+");
return new Tuple2<>(s1[0], 1);
}
});
List<Tuple2<String, Integer>> collect = pairRdd.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
System.out.println(stringIntegerTuple2);
}
}
}
Output:
(aa,1)
(ff,1)
(ee,1)
(ee,1)
flatMapToPair
mapToPair is one-to-one: one element in, one element out. flatMapToPair can return multiple elements for a single input; it is equivalent to doing flatMap first and then mapToPair.
Scala version:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object FlatMapToPair {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("flatmaptopair")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val rdd: RDD[String] = sc.textFile("in/sample.txt")
val flatmaptopairRdd: RDD[(String, Int)] = rdd.flatMap(_.split("\\s+")).map((_,1))
flatmaptopairRdd.collect.foreach(println)
}
}
Java version:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/9 17:03
* @Description:
**/
public class FlatMapToPairJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("flatmaptopair");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> stringJavaRDD = sc.textFile("in/sample.txt");
JavaPairRDD<String, Integer> flatMapToPairRdd = stringJavaRDD.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
String[] split = s.split("\\s+");
List<Tuple2<String, Integer>> list = new ArrayList<>();
for (int i = 0; i < split.length; i++) {
String key = split[i];
list.add(new Tuple2<>(key, 1));
}
return list.iterator();
}
});
List<Tuple2<String, Integer>> collect = flatMapToPairRdd.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
System.out.println(stringIntegerTuple2);
}
}
}
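Since flatMapToPair emits one (word, 1) pair per word, a word count only needs a reduceByKey on top of it. A minimal Scala sketch over the same in/sample.txt (the counts shown are what this sample file would give):
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCountDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordcount")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[String] = sc.textFile("in/sample.txt")
    // flatMap + map((_, 1)) is the Scala equivalent of flatMapToPair;
    // reduceByKey then sums the 1s per word
    val wordCounts: RDD[(String, Int)] = rdd.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    wordCounts.collect.foreach(println) // e.g. (aa,5), (ee,6), (zks,2)
  }
}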
CombineByKey:
def combineByKey[C](
createCombiner: V => C,
mergeValue: (C, V) => C,
mergeCombiners: (C, C) => C): RDD[(K, C)] = self.withScope {
combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners)(null)
}
createCombiner: combineByKey() walks through all the elements of a partition, so each element's key has either not been seen before or is the same as the key of some earlier element. If the key is new, combineByKey() uses the createCombiner() function to create the initial value of the accumulator for that key.
(x: ScoreDetail) => (x.score, 1),
mergeValue: if the key has already been seen while processing the current partition, the mergeValue() function merges the accumulator's current value for that key with the new value.
(acc1: (Int, Int), x: ScoreDetail) => (acc1._1 + x.score, acc1._2 + 1),
mergeCombiners: since each partition is processed independently, the same key can end up with several accumulators. If two or more partitions have an accumulator for the same key, the user-supplied mergeCombiners() function merges the per-partition results.
(acc2: (Int, Int), acc3: (Int, Int)) => (acc2._1 + acc3._1, acc2._2 + acc3._2)
Data before combineByKey:
(lisi,ScoreDetail(lisi,Math,91))
(zhangsan,ScoreDetail(zhangsan,Math,99))
(lisi,ScoreDetail(lisi,English,89))
(zhangsan,ScoreDetail(zhangsan,English,97))
(wangwu,ScoreDetail(wangwu,Math,91))
(zhaoliu,ScoreDetail(zhaoliu,Math,83))
(wangwu,ScoreDetail(wangwu,English,94))
(zhaoliu,ScoreDetail(zhaoliu,English,90))
The combineByKey call:
val stuScoreInfoRdd: RDD[(String /*student name*/, (Int /*total score*/, Int /*number of subjects*/))] = scoreWithKeyRDD.combineByKey(
(x: ScoreDetail) => (x.score, 1),
(acc1: (Int, Int), x: ScoreDetail) => (acc1._1 + x.score, acc1._2 + 1),
(acc2: (Int, Int), acc3: (Int, Int)) => (acc2._1 + acc3._1, acc2._2 + acc3._2)
)
println("----------------")
stuScoreInfoRdd.collect.foreach(println)
Output:
(zhangsan,(196,2))
(zhaoliu,(173,2))
(wangwu,(185,2))
(lisi,(180,2))
Full Scala code:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CombineByKey {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("combinebykey")
val sc: SparkContext = SparkContext.getOrCreate(conf)
case class ScoreDetail(name: String, subject: String, score: Int)
val scores = List(
ScoreDetail("zhangsan","Math",99),
ScoreDetail("zhangsan","English",97),
ScoreDetail("lisi","Math",91),
ScoreDetail("lisi","English",89),
ScoreDetail("wangwu","Math",91),
ScoreDetail("wangwu","English",94),
ScoreDetail("zhaoliu","Math",83),
ScoreDetail("zhaoliu","English",90)
)
val rdd: RDD[ScoreDetail] = sc.parallelize(scores)
val combineByKeyRdd: RDD[(String, (Int, Int))] = rdd.map(x => (x.name, x)).combineByKey(
(x: ScoreDetail) => (x.score, 1),
(acc1: (Int, Int), a: ScoreDetail) => ((acc1._1 + a.score), acc1._2 + 1),
(acc2: (Int, Int), acc3: (Int, Int)) => (acc2._1 + acc3._1, acc2._2 + acc3._2)
)
combineByKeyRdd.collect.foreach(println)
}
}
Output:
(zhangsan,(196,2))
(zhaoliu,(173,2))
(wangwu,(185,2))
(lisi,(180,2))
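Since each value is a (total score, number of subjects) pair, a mapValues on the combineByKey result yields the average per student, which is also what the Java version below prints. A minimal sketch, continuing from combineByKeyRdd above:
// continuing from combineByKeyRdd above: average = total score / number of subjects
val avgRdd: RDD[(String, Double)] = combineByKeyRdd.mapValues { case (total, count) => total.toDouble / count }
avgRdd.collect.foreach(println) // e.g. (zhangsan,98.0), (lisi,90.0)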
Java version:
ScoreDetailJava must implement Serializable!
import java.io.Serializable;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/9 19:10
* @Description:
**/
public class ScoreDetailJava implements Serializable {
String name;
String subject;
Integer score;
public ScoreDetailJava(String name, String subject, Integer score) {
this.name = name;
this.subject = subject;
this.score = score;
}
}
CombineByKeyJava:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/11/9 19:08
* @Description:
**/
public class CombineByKeyJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("combinebykey");
JavaSparkContext sc = new JavaSparkContext(conf);
ArrayList<ScoreDetailJava> arr = new ArrayList<>();
arr.add(new ScoreDetailJava("zhangsan","Math",80));
arr.add(new ScoreDetailJava("zhangsan","English",98));
arr.add(new ScoreDetailJava("lisi","Chinese",85));
arr.add(new ScoreDetailJava("lisi","English",99));
JavaRDD<ScoreDetailJava> rdd = sc.parallelize(arr);
JavaPairRDD<String, ScoreDetailJava> pairRdd = rdd.mapToPair(new PairFunction<ScoreDetailJava, String, ScoreDetailJava>() {
@Override
public Tuple2<String, ScoreDetailJava> call(ScoreDetailJava scoreDetailJava) throws Exception {
return new Tuple2<>(scoreDetailJava.name, scoreDetailJava);
}
});
JavaPairRDD<String, Tuple2<Integer, Integer>> combineByKeyRdd = pairRdd.combineByKey(
new Function<ScoreDetailJava, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(ScoreDetailJava v1) throws Exception {
return new Tuple2<>(v1.score, 1);
}
}, new Function2<Tuple2<Integer, Integer>, ScoreDetailJava, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, ScoreDetailJava v2) throws Exception {
return new Tuple2<>(v1._1 + v2.score, v1._2 + 1);
}
}, new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, Tuple2<Integer, Integer> v2) throws Exception {
return new Tuple2<>(v1._1 + v2._1, v1._2 + v2._2);
}
}
);
List<Tuple2<String, Tuple2<Integer, Integer>>> collect = combineByKeyRdd.collect();
for (Tuple2<String, Tuple2<Integer, Integer>> tp2 : collect) {
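//average score = total score / number of subjects (integer division here)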
System.out.println(tp2._1+" : "+tp2._2._1/tp2._2._2);
}
}
}