package com.lyzx.spark.streaming;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.junit.Test;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * Exercises the individual Spark operators from the Java API.
 * Since Java is a statically typed language, lambda expressions are used wherever
 * possible to keep the code concise.
 * Note: flatMap and mapPartitions below return collections rather than iterators,
 * which matches the Spark 1.x Java API.
 */
public class Day01 {
    /**
     * Demonstrates the map and mapToPair operators.
     * map: a mapping operator that transforms every element of the RDD into another
     *      format/type, i.e. JavaRDD[M] => JavaRDD[N]; the function is called once per element.
     * mapToPair: also a mapping operator; it maps an RDD into a [K,V]-format RDD.
     * mapPartitions: calls the mapping function once per partition, an optimization over map.
     * mapPartitionsWithIndex: like mapPartitions, but also passes the index of each partition.
     * @param ctx
     */
    public void testMap(JavaSparkContext ctx){
        JavaRDD<Integer> rdd = ctx.parallelize(Arrays.asList(1,2,3,4,5,6,7,8,9,10));
        rdd.map(x -> x + 100)
           .mapToPair(x -> new Tuple2<>(x, x))
           .mapPartitions(itr -> {
               List<String> result = new ArrayList<>();
               while (itr.hasNext()){
                   Tuple2 t = itr.next();
                   result.add(t._1 + "-" + t._2);
               }
               return result;
           })
           .mapPartitionsWithIndex((index, itr) -> {
               List<String> result = new ArrayList<>();
               System.out.println("===" + index);
               while (itr.hasNext()){
                   String item = itr.next();
                   System.out.print(" " + item);
                   result.add(item);
               }
               return result.iterator();
           }, true)
           .foreach(x -> System.out.println(x));
    }
    /**
     * Demonstrates the filter operator.
     * The predicate is evaluated for every element and returns a boolean:
     * if it returns true the element is kept, otherwise it is dropped.
     * @param ctx
     */
    public void testFilter(JavaSparkContext ctx){
        JavaRDD<Integer> rdd = ctx.parallelize(Arrays.asList(1,2,3,4,5,6,7,8,9,10));

        // x is each element of the RDD; the predicate decides whether to keep it
        JavaPairRDD<Integer,Integer> pairRDD = rdd.filter(x -> x > 5)
                                                   .mapToPair(x -> new Tuple2<>(x, x + 1000));

        pairRDD.foreach(x -> System.out.println(x));
    }
    /**
     * Demonstrates the flatMap operator: first map, then flatten.
     * The map step below returns the words of each line,
     * and each word then becomes its own record.
     * @param ctx
     */
    public void testFlatMap(JavaSparkContext ctx){
        JavaRDD<String> rdd = ctx.parallelize(Arrays.asList("hello java","hello c++","hello c","hello java","hello 鸣人 "," hello 雏田"));
        List<Tuple2<String,Integer>> items = rdd.flatMap(x -> Arrays.asList(x.trim().split(" ")))
                                                .mapToPair(x -> new Tuple2<>(x, 1))
                                                .reduceByKey((x, y) -> x + y)
                                                .take(3);

        Iterator<Tuple2<String,Integer>> itr = items.iterator();
        while (itr.hasNext()){
            System.out.println(itr.next());
        }
    }
    /**
     * sample: random sampling operator.
     * withReplacement: whether sampled elements are put back (true means with replacement).
     * fraction: the fraction to sample, e.g. 0.2 means roughly 20%.
     * seed: the random seed, useful for drawing the same elements while debugging.
     * @param ctx
     */
    public void testSample(JavaSparkContext ctx){
        String[] table = {"A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"};
        JavaRDD<String> rdd = ctx.parallelize(Arrays.asList(table));
        JavaRDD<String> sampleRdd = rdd.sample(false, 0.2);
        sampleRdd.foreach(x -> System.out.println(x));
    }
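    /**
     * A minimal sketch (not in the original file): the three-argument sample overload takes
     * the seed mentioned in the comment above, so the same elements are drawn on every run,
     * which helps while debugging. The seed value 42L is an arbitrary example.
     */
    public void testSampleWithSeed(JavaSparkContext ctx){
        JavaRDD<String> rdd = ctx.parallelize(Arrays.asList("A","B","C","D","E","F","G","H","I","J"));
        // sample(withReplacement, fraction, seed): reproducible ~20% sample
        rdd.sample(false, 0.2, 42L)
           .foreach(x -> System.out.println(x));
    }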
    /**
     * An RDD can also be treated as a set.
     * union: union of the two RDDs, without deduplication
     * intersection: intersection
     * subtract: set difference
     * cartesian: Cartesian product
     * @param ctx
     */
    public void testSetOperate(JavaSparkContext ctx){
        JavaRDD<Integer> r1 = ctx.parallelize(Arrays.asList(1,2,3,4,5,6,7,8,9,10));
        JavaRDD<Integer> r2 = ctx.parallelize(Arrays.asList(10,11,12,13,14,15,16,17,18,19));

        // union keeps duplicates; distinct() removes them afterwards
        r1.union(r2)
          .distinct()
          .foreach(x -> System.out.print(x + " "));
        System.out.println("=======================");

        // intersection
        r1.intersection(r2)
          .foreach(x -> System.out.print(x + "="));

        System.out.println("=======================");
        // difference: r1 - r2
        r1.subtract(r2)
          .foreach(x -> System.out.print(x + " <> "));

        // Cartesian product
        System.out.println("=======================");
        r1.cartesian(r2)
          .foreach(x -> System.out.print(x + "|"));
    }
    /**
     * groupByKey: groups by key, putting all values with the same key into one group;
     * each output record is a tuple of the key and an Iterable holding its values.
     * Note: this operator only applies to [K,V]-format RDDs.
     * For a [K,V]-format RDD (JavaPairRDD) the generic types must be specified,
     * otherwise the element passed to foreach is typed as Object rather than Tuple2.
     *
     * JavaPairRDD[K,V] => JavaPairRDD[K,Iterable<V>]
     * e.g.:
     *   rdd1: [(1,1),(2,2),(3,3),(1,1),(2,2)]
     *   rdd2 = rdd1.groupByKey()
     *   rdd2: [(1,[1,1]),(2,[2,2]),(3,[3])]
     * The result is a tuple whose key is the shared key and whose value is the
     * Iterable of all values that had that key.
     * @param ctx
     */
    public void testGroupByKey(JavaSparkContext ctx){
        List<Tuple2<String,Integer>> pairData = Arrays.asList(new Tuple2<>("A",1),
                new Tuple2<>("B",2),
                new Tuple2<>("C",3),
                new Tuple2<>("B",2),
                new Tuple2<>("C",3));

        JavaPairRDD<String,Integer> r1 = ctx.parallelizePairs(pairData);
        r1.groupByKey()
          .foreach(x -> {
              System.out.println(x._1);
              Iterable<Integer> itr = x._2;
              for (Integer i : itr){
                  System.out.print(i + " ");
              }
              System.out.println();
          });
    }
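    /**
     * Illustrative sketch (not part of the original file): shows the raw-type pitfall
     * described in the testGroupByKey comment. Without generic parameters on JavaPairRDD,
     * the element passed to foreach is typed as Object, so an explicit cast is needed
     * before the tuple fields can be used.
     */
    public void testGroupByKeyRawType(JavaSparkContext ctx){
        JavaPairRDD rawPairRdd = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("A",1),
                new Tuple2<>("B",2)));
        rawPairRdd.groupByKey()
                  .foreach(x -> {
                      // x is only known to be an Object here; cast to Tuple2 to reach _1/_2
                      Tuple2 t = (Tuple2) x;
                      System.out.println(t._1 + " -> " + t._2);
                  });
    }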
    /**
     * reduceByKey: aggregates the values of each key.
     * The reduce function takes two arguments of the same type (T t1, T t2) and returns a T,
     * i.e. the values sharing a key are folded pairwise into a single result.
     * For example with (x, y) -> x + y: the first value is assigned to x and the second to y,
     * their sum becomes the new x, the third value becomes y, and so on.
     * @param ctx
     */
    public void testReduceByKey(JavaSparkContext ctx){
        JavaRDD<String> rdd = ctx.parallelize(Arrays.asList("A","B","D","Z","X","Y","A","B","D","Z","X","Y","A","B","D","A","B","D","A","B","D","A","B","D"));
        rdd.mapToPair(x -> new Tuple2<>(x, 1))
           .reduceByKey((x, y) -> x + y)
           .sortByKey(false)
           .foreach(x -> System.out.println(x));
    }
    /**
     * join: similar to an equality inner join in SQL.
     * Values that share a key are paired into a tuple; this example prints:
     *   (B,(2,B2))
     *   (B,(2,B2))
     *   (C,(3,C2))
     *   (C,(3,C2))
     * @param ctx
     */
    public void testJoin(JavaSparkContext ctx){
        JavaPairRDD<String,Integer> r1 = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("A",1),
                new Tuple2<>("B",2),
                new Tuple2<>("C",3),
                new Tuple2<>("A",1),
                new Tuple2<>("B",2),
                new Tuple2<>("C",3)
        ));

        JavaPairRDD<String,String> r2 = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("B","B2"),
                new Tuple2<>("C","C2"),
                new Tuple2<>("D","D2")
        ));

        r1.join(r2)
          .foreach(x -> System.out.println(x));
    }
    /**
     * cogroup: a join-like operator that first groups the values of each key within each RDD.
     * In the example below:
     *   the values for key B in r1 are grouped into an Iterable, e.g. [B, Iterable<200,500>],
     *   the values for key B in r2 are grouped likewise, e.g. [B, Iterable<2,5>],
     *   and the two groups are then combined into one tuple: [B, (Iterable<200,500>, Iterable<2,5>)].
     * So each key maps to a tuple holding two Iterables, one per input RDD.
     * @param ctx
     */
    public void testCogroup(JavaSparkContext ctx){
        JavaPairRDD<String,Integer> r1 = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("A",100),
                new Tuple2<>("B",200),
                new Tuple2<>("C",300),
                new Tuple2<>("A",400),
                new Tuple2<>("B",500),
                new Tuple2<>("C",600)
        ));

        JavaPairRDD<String,Integer> r2 = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("B",2),
                new Tuple2<>("C",3),
                new Tuple2<>("D",4),
                new Tuple2<>("B",5),
                new Tuple2<>("C",6),
                new Tuple2<>("D",7)
        ));

        r1.cogroup(r2)
          .foreach(x -> {
              Tuple2<Iterable<Integer>,Iterable<Integer>> t2 = x._2;

              // values from r1 for this key
              Iterable<Integer> itr1 = t2._1;
              for (Integer item : itr1){
                  System.out.print(":" + item + " ");
              }

              // values from r2 for this key
              Iterable<Integer> itr2 = t2._2;
              for (Integer item : itr2){
                  System.out.println(":::" + item + " ");
              }
          });
    }
    /**
     * coalesce: repartitioning operator.
     * When the second argument is true, a shuffle is performed.
     * @param ctx
     */
    public void testCoalesce(JavaSparkContext ctx){
        JavaPairRDD<String,Integer> rdd = ctx.parallelize(Arrays.asList("A","B","D","Z","X","Y","A","B","D","Z","X","Y"), 3)
                                             .mapToPair(x -> new Tuple2<>(x, 1));

        rdd.coalesce(2)
           .mapPartitionsWithIndex((index, itr) -> {
               // collect the elements so they can be printed here and still passed on to foreach
               List<Tuple2<String,Integer>> result = new ArrayList<>();
               System.out.println(">>::" + index);
               while (itr.hasNext()){
                   Tuple2<String,Integer> t = itr.next();
                   System.out.print(t + " ");
                   result.add(t);
               }
               return result.iterator();
           }, false)
           .foreach(x -> System.out.println(x));
    }
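    /**
     * A minimal sketch (not in the original file): with shuffle=true, coalesce can also
     * increase the number of partitions, and repartition(n) is shorthand for coalesce(n, true).
     * The partition counts used here are arbitrary example values.
     */
    public void testCoalesceWithShuffle(JavaSparkContext ctx){
        JavaRDD<Integer> rdd = ctx.parallelize(Arrays.asList(1,2,3,4,5,6,7,8), 2);
        System.out.println("coalesce(4, true): " + rdd.coalesce(4, true).partitions().size() + " partitions");
        System.out.println("repartition(4): " + rdd.repartition(4).partitions().size() + " partitions");
    }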
    /**
     * reduce: aggregates all elements with a function of the shape (T t1, T t2) => T.
     * @param ctx
     */
    public void testReduce(JavaSparkContext ctx){
        JavaRDD<Integer> rdd = ctx.parallelize(Arrays.asList(1,2,3,4,5,6,7,8,9));
        JavaRDD<Integer> cacheRdd = rdd.cache();

        long count = cacheRdd.count();

        Integer i = cacheRdd.reduce((x, y) -> x + y);
        System.out.println(count + "====" + i);
    }
    /**
     * takeSample: randomly takes N elements.
     * takeOrdered: takes N elements in their natural order.
     * @param ctx
     */
    public void testTakeSample(JavaSparkContext ctx){
        JavaRDD<String> rdd = ctx.parallelize(Arrays.asList("F","G","H","A","B","C","D","E","I","L","M","N"));
        JavaRDD<String> cacheRdd = rdd.cache();
        List<String> elements = cacheRdd.takeSample(false, 5);
        for (String item : elements){
            System.out.println(item);
        }

        List<String> orderedList = cacheRdd.takeOrdered(2);
        for (String item : orderedList){
            System.out.println("::" + item);
        }
    }
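    /**
     * A minimal sketch (not part of the original file) answering the Q1 note in main:
     * inside foreach on a JavaPairRDD, each element is a Tuple2, and its two items are
     * read directly via the public _1 and _2 fields (or the _1()/_2() accessors).
     */
    public void testExtractTupleItems(JavaSparkContext ctx){
        JavaPairRDD<String,Integer> pairRdd = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("A",1),
                new Tuple2<>("B",2)));
        pairRdd.foreach(t -> System.out.println("key=" + t._1 + ", value=" + t._2));
    }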
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Day01").setMaster("local");
        JavaSparkContext ctx = new JavaSparkContext(conf);

        // Q1: how do you extract each item of the tuple inside foreach?
        //     (see the illustrative testExtractTupleItems sketch above)
        Day01 t = new Day01();
        // t.testMap(ctx);
        // t.testFilter(ctx);
        // t.testFlatMap(ctx);
        // t.testSample(ctx);
        // t.testSetOperate(ctx);
        // t.testGroupByKey(ctx);
        // t.testReduceByKey(ctx);
        // t.testJoin(ctx);
        t.testCogroup(ctx);
        // t.testCoalesce(ctx);
        // t.testReduce(ctx);
        // t.testTakeSample(ctx);
    }
}