Spark: join

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by hadoop on 17-10-18.
 */
public class JoinSparkJava {

    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("join").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Build a (userId, userName) pair RDD with 2 partitions.
        List<Tuple2<String, String>> users = new ArrayList<Tuple2<String, String>>();
        users.add(new Tuple2<String, String>("1212", "zhouqinru"));
        users.add(new Tuple2<String, String>("1213", "lixiaofang"));
        users.add(new Tuple2<String, String>("1214", "zhaosi"));
        users.add(new Tuple2<String, String>("1215", "ligang"));
        users.add(new Tuple2<String, String>("1216", "wangwu"));
        JavaPairRDD<String, String> userRdd = sc.parallelizePairs(users, 2);

        // Hash-partition by key into 3 partitions and cache the result, so the
        // join below can reuse this layout instead of re-shuffling the user side.
        JavaPairRDD<String, String> partitionedUsers =
                userRdd.partitionBy(new HashPartitioner(3)).persist(StorageLevel.MEMORY_ONLY());

        // Check the underlying partitioning: coalesce with shuffle=true can
        // increase the partition count (without a shuffle it can only shrink it).
        JavaPairRDD<String, String> newUsers = userRdd.coalesce(30, true);
        System.out.println("Number of partitions: " + newUsers.getNumPartitions());

        // Build a (userId, url) pair RDD keyed by the same user ids.
        List<Tuple2<String, String>> infos = new ArrayList<Tuple2<String, String>>();
        infos.add(new Tuple2<String, String>("1212", "http:www.baidu.com"));
        infos.add(new Tuple2<String, String>("1213", "http:www.taobao.com"));
        infos.add(new Tuple2<String, String>("1214", "http:www.Ali.comi"));
        infos.add(new Tuple2<String, String>("1215", "http:www.Tengxun.com"));
        infos.add(new Tuple2<String, String>("1216", "http:www.Jjngdong.comu"));
        JavaPairRDD<String, String> infoRdd = sc.parallelizePairs(infos);

        // Inner join by key: yields (userId, (userName, url)) for keys present in both RDDs.
        JavaPairRDD<String, Tuple2<String, String>> joined = partitionedUsers.join(infoRdd);
        joined.collect().forEach(System.out::println);

        sc.stop();
    }
}
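Because partitionedUsers carries a HashPartitioner and is persisted, Spark only needs to shuffle infoRdd into the matching layout when the join runs; the already-partitioned user side stays in place. With the sample data above, every key appears in both RDDs, so the inner join emits five pairs. A sketch of the expected console output (the exact line order depends on partitioning and may vary between runs):

(1212,(zhouqinru,http:www.baidu.com))
(1213,(lixiaofang,http:www.taobao.com))
(1214,(zhaosi,http:www.Ali.comi))
(1215,(ligang,http:www.Tengxun.com))
(1216,(wangwu,http:www.Jjngdong.comu))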
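For contrast with the inner join, here is a minimal left outer join sketch on the same two RDDs (an addition, not part of the original post; it assumes Spark 2.x, where the Java API represents the possibly-missing side as org.apache.spark.api.java.Optional). It keeps every user even when no info record shares the key:

import org.apache.spark.api.java.Optional;

// Assumes partitionedUsers and infoRdd from the program above are in scope.
// Every user key is kept; a user id with no matching info record would
// appear with Optional.empty() instead of being dropped.
JavaPairRDD<String, Tuple2<String, Optional<String>>> withUrls =
        partitionedUsers.leftOuterJoin(infoRdd);
withUrls.collect().forEach(System.out::println);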