spark之join操作

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
 
import scala.Tuple2;
 
public class join {
 
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("join").setMaster("local[*]");
		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> persons = jsc.textFile("spark/input3/person.txt");
		JavaRDD<String> addresses = jsc.textFile("spark/input3/address.txt");
		

		//得到 key: 邮编号,value:学号和名字
		JavaPairRDD<String, String> personkv = persons.mapToPair(new PairFunction<String, String,String>() {
			private static final long serialVersionUID = 1L;
			public Tuple2<String,String> call(String str) throws Exception {
				
					String[] personsplit = str.split(" |\t");     // notice maybe \t
					//System.out.println("length"+personsplit.length);
					//处理缺省数据
					if (personsplit.length == 3)
					{
						//System.out.println("i am not null");
						String code=personsplit[2];
						String value = personsplit[0] +" "+ personsplit[1];		//number + name
						return new Tuple2<String, String>(code, value);
					}
					else {
						
						//System.out.println("null null null ++++++++="+personsplit.length);	
						return new Tuple2<String,String>(null,null);
						}
			}
		});  // to split the person and get the form <210000,1 Aaron>
		
		//得到key: 邮编号 value:城市
		JavaPairRDD<String, String> addresskv = addresses.mapToPair(new PairFunction<String, String,String>() {
			private static final long serialVersionUID = 1L;
			public Tuple2<String,String> call(String str) throws Exception {
				String[] addresssplit = str.split(" |\t");
				String code=addresssplit[0];
				String value = addresssplit[1];		//city
				return new Tuple2<String, String>(code, value);
			}
		});				//to split the address and get the  form <210000,Nanjing>
		
		//进行join操作
		JavaPairRDD<String, Tuple2<String, String>> joinres=personkv.join(addresskv);

		//遍历输出
		joinres.foreach(new VoidFunction<Tuple2<String, Tuple2<String, String>>>() {
			private static final long serialVersionUID = 1L;
			public void call(Tuple2<String, Tuple2<String, String>> t) throws Exception {
				System.out.println(t._2()._1+" "+t._1+ " "+t._2()._2);
			}

		});

		joinres.saveAsTextFile("./spark/output3/");
	}
 
}

输入:

person.txt:
	1 Aaron 210000
	.....
address.txt:
	210000 Nanjing
	.........
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值