package com.uplooking.bigdata.core.p2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* java版本的关于Spark中transformation算子的操作
* 1、map:将集合中每个元素乘以7
2、filter:过滤出集合中的奇数
3、flatMap:将行拆分为单词
4、sample:根据给定的随机种子seed,随机抽样出数量为frac的数据
5、union:返回一个新的数据集,由原数据集和参数联合而成
6、groupByKey:对数组进行 group by key操作
7、reduceByKey:统计每个班级的人数
8、join:打印关联的组合信息
9、sortByKey:将学生身高进行排序
10、cogroup:打印每个学生的身高
*/
public class JavaSparkTransformationOps {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName(JavaSparkTransformationOps.class.getSimpleName())
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Uncomment exactly one demo at a time.
        // transformation_1_map_Ops(sc);
        // transformation_2_filter_Ops(sc);
        // transformation_3_flatMap_Ops(sc);
        // transformation_4_sample_Ops(sc);
        // transformation_5_union_Ops(sc);
        // transformation_6_gbk_Ops(sc);
        // transformation_7_rbk_Ops(sc);
        // transformation_8_join_Ops(sc);
        transformation_9_sbk_Ops(sc);
        // Close the SparkContext so local resources are released.
        sc.close();
    }

    /**
     * 9. sortByKey: sort students by height, similar to a TreeMap's key ordering.
     * <p>
     * NOTE(review): the original comment asked for "name ascending, then height
     * descending on ties", but the implementation only sorts by height
     * descending — kept as-is; confirm which behavior is actually intended.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_9_sbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
                "zhangsan 176",
                "xiaodingding 175",
                "xiaobao 173",
                "heyajie 174.5",
                "liujun 173",
                "wangxiaoxiong 150"
        );
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Parse "name height" lines into (name, height) pairs.
        JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String name = splits[0].trim();
            double height = Double.valueOf(splits[1].trim());
            return new Tuple2<String, Double>(name, height);
        });
        // sortByKey only orders by the key, so swap each pair to (height, name)
        // to be able to sort on height.
        JavaPairRDD<Double, String> reversePairRDD =
                pairRDD.mapToPair(t -> new Tuple2<Double, String>(t._2(), t._1()));
        // false => descending; print back in "name height" order.
        reversePairRDD.sortByKey(false).foreach(t -> System.out.println(t._2 + " " + t._1));
    }

    /**
     * 8. join: print the combined information of two pair RDDs.
     * <p>
     * join(otherDataset, [numTasks]): called on datasets of type (K, V) and
     * (K, W), returns a dataset of (K, (V, W)) pairs for each key present in
     * both sides.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_8_join_Ops(JavaSparkContext sc) {
        // Input lines have the form "className gender personNum".
        List<String> maleList = Arrays.asList(
                "bd_1 male 20",
                "bd_2 male 25",
                "bd_3 male 15");
        List<String> femaleList = Arrays.asList(
                "bd_1 female 2",
                "bd_2 female 10",
                "bd_3 female 5"
        );
        JavaRDD<String> maleListRDD = sc.parallelize(maleList);
        JavaRDD<String> femaleListRDD = sc.parallelize(femaleList);
        JavaPairRDD<String, Integer> malePairRDD = maleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
        JavaPairRDD<String, Integer> femalePairRDD = femaleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
        // (className, (maleNum, femaleNum)) for every class present in both RDDs.
        JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
        System.out.println("-------malePairRDD----------");
        malePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------femalePairRDD----------");
        femalePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------joinRDD----------");
        joinRDD.foreach(t -> System.out.println(
                "t._1: " + t._1() + ", t._2._1: " + t._2._1() + ", t._2._2: " + t._2._2));
    }

    /**
     * 7. reduceByKey: count the number of students in each class.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_7_rbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
                // className gender num
                "bd_1 male 20",
                "bd_1 female 2",
                "bd_2 male 25",
                "bd_2 female 10",
                "bd_3 male 15");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Map each "className gender num" line to <className, num>.
        JavaPairRDD<String, Integer> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            if (splits.length < 3) {
                // FIX: the original returned null here, which would surface
                // later inside reduceByKey as an opaque NullPointerException.
                // Fail fast with a message that names the bad record instead.
                throw new IllegalArgumentException("malformed input line: " + line);
            }
            String className = splits[0].trim();
            int num = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, num);
        });
        // Sum the per-gender counts of each class.
        JavaPairRDD<String, Integer> retRDD = pairRDD.reduceByKey(Integer::sum);
        retRDD.foreach(t -> System.out.println("className: " + t._1() + ", personNum:" + t._2));
    }

    /**
     * 6. groupByKey: group a (K, V) dataset by key, yielding (K, Iterable&lt;V&gt;).
     * <p>
     * groupByKey([numTasks]): the number of grouping tasks can be tuned with
     * the optional numTasks argument.
     * <p>
     * NOTE: avoid groupByKey unless you really need the full value list per
     * key. Unlike reduceByKey it performs no map-side pre-aggregation, so all
     * values are shuffled across partitions, which hurts performance on large
     * data.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_6_gbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Split each line into words.
        JavaRDD<String> wordsRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        // Pair every word with the count 1.
        JavaPairRDD<String, Integer> pairRDD =
                wordsRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        pairRDD.foreach(t -> System.out.println(t._1 + " " + t._2()));
        // Collect all the 1s of each word into a single Iterable.
        JavaPairRDD<String, Iterable<Integer>> gbkRDD = pairRDD.groupByKey();
        gbkRDD.foreach(t -> System.out.println(t._1 + " " + t._2));
    }

    /**
     * 5. union(otherDataset): return a new dataset consisting of the elements
     * of the source dataset together with the elements of the argument.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_5_union_Ops(JavaSparkContext sc) {
        JavaRDD<Integer> oddRDD = sc.parallelize(Arrays.asList(1, 3, 5, 7, 9));
        JavaRDD<Integer> evenRDD = sc.parallelize(Arrays.asList(2, 4, 6, 8, 10));
        JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
        unionRDD.foreach(t -> System.out.println(t));
    }

    /**
     * 4. sample: draw a random sample of roughly {@code fraction} of the data.
     * <p>
     * Sampling a small fraction is a common way to estimate the overall data
     * distribution, e.g. when diagnosing data skew in a Spark job.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_4_sample_Ops(JavaSparkContext sc) {
        List<Integer> list = new ArrayList<Integer>(10001);
        for (int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // The sample size is only approximately fraction * count, not exact.
        // true => sample with replacement.
        JavaRDD<Integer> sampleRDD = listRDD.sample(true, 0.01);
        System.out.println("sampleRDD的数据规模:" + sampleRDD.count());
        sampleRDD.foreach(i -> System.out.println(i));
    }

    /**
     * 3. flatMap: split each line into words.
     * <p>
     * For example the input string "hello you me" is expanded by the operator
     * into the three elements:
     * hello
     * you
     * me
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_3_flatMap_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Anonymous-class form kept here to show the explicit FlatMapFunction API.
        JavaRDD<String> wordsRDD = listRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        wordsRDD.foreach(word -> System.out.println(word));
    }

    /*
     * 2. filter(func): keep only the odd numbers in the collection.
     * filter keeps the elements for which func returns true and drops those
     * for which it returns false, so the result is a subset of the source RDD.
     */
    public static void transformation_2_filter_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // BUG FIX: the predicate was (v1 % 2 == 0), which kept the EVEN numbers
        // even though the documented intent is to keep the odd ones.
        JavaRDD<Integer> filteredRDD = listRDD.filter(v1 -> v1 % 2 != 0);
        filteredRDD.foreach(v -> System.out.println(v));
    }

    /*
     * 1. map(func): return a new distributed dataset formed by passing every
     * source element through func. The element count is unchanged: the new RDD
     * has exactly as many elements as the source RDD.
     * Example: multiply every element of the collection by 7.
     */
    public static void transformation_1_map_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        JavaRDD<Integer> retRDD = listRDD.map(num -> num * 7);
        retRDD.foreach(i -> System.out.println(i));
    }
}
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* java版本的关于Spark中transformation算子的操作
* 1、map:将集合中每个元素乘以7
2、filter:过滤出集合中的奇数
3、flatMap:将行拆分为单词
4、sample:根据给定的随机种子seed,随机抽样出数量为frac的数据
5、union:返回一个新的数据集,由原数据集和参数联合而成
6、groupByKey:对数组进行 group by key操作
7、reduceByKey:统计每个班级的人数
8、join:打印关联的组合信息
9、sortByKey:将学生身高进行排序
10、cogroup:打印每个学生的身高
*/
// NOTE(review): this entire class is an accidental duplicate paste of the
// JavaSparkTransformationOps class defined earlier in this file (the imports
// above are duplicated too). Two identical top-level classes in one compilation
// unit cannot compile — delete this whole second copy.
public class JavaSparkTransformationOps {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName(JavaSparkTransformationOps.class.getSimpleName())
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Uncomment exactly one demo at a time.
        // transformation_1_map_Ops(sc);
        // transformation_2_filter_Ops(sc);
        // transformation_3_flatMap_Ops(sc);
        // transformation_4_sample_Ops(sc);
        // transformation_5_union_Ops(sc);
        // transformation_6_gbk_Ops(sc);
        // transformation_7_rbk_Ops(sc);
        // transformation_8_join_Ops(sc);
        transformation_9_sbk_Ops(sc);
        // Close the SparkContext so local resources are released.
        sc.close();
    }

    /**
     * 9. sortByKey: sort students by height, similar to a TreeMap's key ordering.
     * <p>
     * NOTE(review): the original comment asked for "name ascending, then height
     * descending on ties", but the implementation only sorts by height
     * descending — kept as-is; confirm which behavior is actually intended.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_9_sbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
                "zhangsan 176",
                "xiaodingding 175",
                "xiaobao 173",
                "heyajie 174.5",
                "liujun 173",
                "wangxiaoxiong 150"
        );
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Parse "name height" lines into (name, height) pairs.
        JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String name = splits[0].trim();
            double height = Double.valueOf(splits[1].trim());
            return new Tuple2<String, Double>(name, height);
        });
        // sortByKey only orders by the key, so swap each pair to (height, name)
        // to be able to sort on height.
        JavaPairRDD<Double, String> reversePairRDD =
                pairRDD.mapToPair(t -> new Tuple2<Double, String>(t._2(), t._1()));
        // false => descending; print back in "name height" order.
        reversePairRDD.sortByKey(false).foreach(t -> System.out.println(t._2 + " " + t._1));
    }

    /**
     * 8. join: print the combined information of two pair RDDs.
     * <p>
     * join(otherDataset, [numTasks]): called on datasets of type (K, V) and
     * (K, W), returns a dataset of (K, (V, W)) pairs for each key present in
     * both sides.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_8_join_Ops(JavaSparkContext sc) {
        // Input lines have the form "className gender personNum".
        List<String> maleList = Arrays.asList(
                "bd_1 male 20",
                "bd_2 male 25",
                "bd_3 male 15");
        List<String> femaleList = Arrays.asList(
                "bd_1 female 2",
                "bd_2 female 10",
                "bd_3 female 5"
        );
        JavaRDD<String> maleListRDD = sc.parallelize(maleList);
        JavaRDD<String> femaleListRDD = sc.parallelize(femaleList);
        JavaPairRDD<String, Integer> malePairRDD = maleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
        JavaPairRDD<String, Integer> femalePairRDD = femaleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
        // (className, (maleNum, femaleNum)) for every class present in both RDDs.
        JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
        System.out.println("-------malePairRDD----------");
        malePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------femalePairRDD----------");
        femalePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------joinRDD----------");
        joinRDD.foreach(t -> System.out.println(
                "t._1: " + t._1() + ", t._2._1: " + t._2._1() + ", t._2._2: " + t._2._2));
    }

    /**
     * 7. reduceByKey: count the number of students in each class.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_7_rbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
                // className gender num
                "bd_1 male 20",
                "bd_1 female 2",
                "bd_2 male 25",
                "bd_2 female 10",
                "bd_3 male 15");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Map each "className gender num" line to <className, num>.
        JavaPairRDD<String, Integer> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            if (splits.length < 3) {
                // FIX: the original returned null here, which would surface
                // later inside reduceByKey as an opaque NullPointerException.
                // Fail fast with a message that names the bad record instead.
                throw new IllegalArgumentException("malformed input line: " + line);
            }
            String className = splits[0].trim();
            int num = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, num);
        });
        // Sum the per-gender counts of each class.
        JavaPairRDD<String, Integer> retRDD = pairRDD.reduceByKey(Integer::sum);
        retRDD.foreach(t -> System.out.println("className: " + t._1() + ", personNum:" + t._2));
    }

    /**
     * 6. groupByKey: group a (K, V) dataset by key, yielding (K, Iterable&lt;V&gt;).
     * <p>
     * groupByKey([numTasks]): the number of grouping tasks can be tuned with
     * the optional numTasks argument.
     * <p>
     * NOTE: avoid groupByKey unless you really need the full value list per
     * key. Unlike reduceByKey it performs no map-side pre-aggregation, so all
     * values are shuffled across partitions, which hurts performance on large
     * data.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_6_gbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Split each line into words.
        JavaRDD<String> wordsRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        // Pair every word with the count 1.
        JavaPairRDD<String, Integer> pairRDD =
                wordsRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        pairRDD.foreach(t -> System.out.println(t._1 + " " + t._2()));
        // Collect all the 1s of each word into a single Iterable.
        JavaPairRDD<String, Iterable<Integer>> gbkRDD = pairRDD.groupByKey();
        gbkRDD.foreach(t -> System.out.println(t._1 + " " + t._2));
    }

    /**
     * 5. union(otherDataset): return a new dataset consisting of the elements
     * of the source dataset together with the elements of the argument.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_5_union_Ops(JavaSparkContext sc) {
        JavaRDD<Integer> oddRDD = sc.parallelize(Arrays.asList(1, 3, 5, 7, 9));
        JavaRDD<Integer> evenRDD = sc.parallelize(Arrays.asList(2, 4, 6, 8, 10));
        JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
        unionRDD.foreach(t -> System.out.println(t));
    }

    /**
     * 4. sample: draw a random sample of roughly {@code fraction} of the data.
     * <p>
     * Sampling a small fraction is a common way to estimate the overall data
     * distribution, e.g. when diagnosing data skew in a Spark job.
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_4_sample_Ops(JavaSparkContext sc) {
        List<Integer> list = new ArrayList<Integer>(10001);
        for (int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // The sample size is only approximately fraction * count, not exact.
        // true => sample with replacement.
        JavaRDD<Integer> sampleRDD = listRDD.sample(true, 0.01);
        System.out.println("sampleRDD的数据规模:" + sampleRDD.count());
        sampleRDD.foreach(i -> System.out.println(i));
    }

    /**
     * 3. flatMap: split each line into words.
     * <p>
     * For example the input string "hello you me" is expanded by the operator
     * into the three elements:
     * hello
     * you
     * me
     *
     * @param sc the shared JavaSparkContext
     */
    public static void transformation_3_flatMap_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        // Anonymous-class form kept here to show the explicit FlatMapFunction API.
        JavaRDD<String> wordsRDD = listRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        wordsRDD.foreach(word -> System.out.println(word));
    }

    /*
     * 2. filter(func): keep only the odd numbers in the collection.
     * filter keeps the elements for which func returns true and drops those
     * for which it returns false, so the result is a subset of the source RDD.
     */
    public static void transformation_2_filter_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // BUG FIX: the predicate was (v1 % 2 == 0), which kept the EVEN numbers
        // even though the documented intent is to keep the odd ones.
        JavaRDD<Integer> filteredRDD = listRDD.filter(v1 -> v1 % 2 != 0);
        filteredRDD.foreach(v -> System.out.println(v));
    }

    /*
     * 1. map(func): return a new distributed dataset formed by passing every
     * source element through func. The element count is unchanged: the new RDD
     * has exactly as many elements as the source RDD.
     * Example: multiply every element of the collection by 7.
     */
    public static void transformation_1_map_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        JavaRDD<Integer> retRDD = listRDD.map(num -> num * 7);
        retRDD.foreach(i -> System.out.println(i));
    }
}