Background
MapReduce supports table join operations, including Map join and Reduce join.
group by
The first example wraps each CSV record in a custom WritableComparable key (Userinfos), so the shuffle phase sorts the records by score before they are written out.
Userinfos entity class
package com.njbdqn.cust;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Userinfos implements WritableComparable<Userinfos> {
    private int userid;
    private String username;
    private String className;
    private int score;

    public int getUserid() {
        return userid;
    }

    public void setUserid(int userid) {
        this.userid = userid;
    }

    public String getUsername() {
        return username;
    }

    public void setUsername(String username) {
        this.username = username;
    }

    public String getClassName() {
        return className;
    }

    public void setClassName(String className) {
        this.className = className;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    @Override
    public String toString() {
        return "Userinfos{" +
                "userid=" + userid +
                ", username='" + username + '\'' +
                ", className='" + className + '\'' +
                ", score=" + score +
                '}';
    }
    // The shuffle sorts map output keys with this method (ascending by score).
    // Integer.compare also returns 0 for equal scores, which keeps the compareTo contract consistent.
    @Override
    public int compareTo(Userinfos o) {
        return Integer.compare(score, o.getScore());
    }

    // serialization: write the fields in a fixed order...
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(userid);
        dataOutput.writeUTF(username);
        dataOutput.writeUTF(className);
        dataOutput.writeInt(score);
    }

    // ...and read them back in exactly the same order
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.userid = dataInput.readInt();
        this.username = dataInput.readUTF();
        this.className = dataInput.readUTF();
        this.score = dataInput.readInt();
    }
}
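Because no raw comparator is registered for Userinfos, the framework deserializes the keys and orders them with compareTo. Below is a quick standalone check of that ordering; the two records are made-up sample values, not data from this post.

package com.njbdqn.cust;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CompareToDemo {
    public static void main(String[] args) {
        // two made-up records just to show the ordering compareTo produces
        Userinfos a = new Userinfos();
        a.setUserid(1); a.setUsername("tom"); a.setClassName("c1"); a.setScore(90);
        Userinfos b = new Userinfos();
        b.setUserid(2); b.setUsername("jack"); b.setClassName("c1"); b.setScore(75);

        List<Userinfos> list = new ArrayList<>();
        list.add(a);
        list.add(b);
        Collections.sort(list);   // uses Userinfos.compareTo, i.e. ascending score
        System.out.println(list); // jack (75) before tom (90)
    }
}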
UserMapper
package com.njbdqn.cust;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class UserMapper extends Mapper<LongWritable, Text, Userinfos, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // each input line is assumed to be a CSV record: userid,username,className,score
        String[] sps = value.toString().split(",");
        Userinfos us = new Userinfos();
        us.setUserid(Integer.parseInt(sps[0]));
        us.setUsername(sps[1]);
        us.setClassName(sps[2]);
        us.setScore(Integer.parseInt(sps[3]));
        // the whole record is the key; sorting by score happens during the shuffle
        context.write(us, NullWritable.get());
    }
}
Driver
package com.njbdqn.cust;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws Exception {
        // project utility, not shown in this post (see the note after this class)
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Driver.class);
        FileInputFormat.addInputPath(job, new Path("f://source1"));
        job.setMapperClass(UserMapper.class);
        job.setMapOutputKeyClass(Userinfos.class);
        job.setMapOutputValueClass(NullWritable.class);
        // no reducer class is set, so the default identity reducer writes out the sorted keys
        job.setOutputKeyClass(Userinfos.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path("f://ff"));
        job.waitForCompletion(true);
    }
}
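Tools.getInstance().checkpoint() comes from a project utility (com.njbdqn.util.Tools) that is not shown in this post. Judging by the call before every job, it presumably clears the previous output directory so FileOutputFormat does not refuse to run. Below is a minimal sketch of such a helper under that assumption only; the real class may do something different.

package com.njbdqn.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// hypothetical stand-in for the Tools utility used by the drivers in this post
public class Tools {
    private static final Tools INSTANCE = new Tools();

    private Tools() {
    }

    public static Tools getInstance() {
        return INSTANCE;
    }

    // assumed behavior: delete the old output directory before the job starts
    public void checkpoint() throws Exception {
        Path out = new Path("f://ff");
        FileSystem fs = out.getFileSystem(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
    }
}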
Map join
A map-side join fits the case of one large table and one small table; the small table should be no larger than about 10 MB, because every map task loads it into memory from the distributed cache.
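The exact file layouts are not given in the post; the following is an assumed, illustrative layout inferred from the indexes used in the code. The small category file 1.csv would contain lines such as

101,Books
102,Music

and the large goods file under f://source2 lines such as

1001,101,HadoopGuide

with the category id in the second column. The reduce-join example later assumes the same two layouts.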
package com.njbdqn.mapper;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class MyMapInner {
    private static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        // holds the small (category) table: category id -> category name
        private Map<String, String> myType = new HashMap<>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // read the small table from the distributed cache once per map task
            String fileName = context.getCacheFiles()[0].getPath();
            final BufferedReader reader = new BufferedReader(new FileReader(fileName));
            String str = null;
            while ((str = reader.readLine()) != null) {
                String[] sps = str.split(",");
                myType.put(sps[0], sps[1]);
            }
            reader.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] goodInfos = value.toString().split(",");
            // look up the category name by the foreign key (assumes every id exists in the small table)
            String type = myType.get(goodInfos[1]);
            // put the category name back into the record
            goodInfos[1] = type;
            // emit the joined record as a single string
            Text text = new Text(Arrays.toString(goodInfos));
            context.write(text, NullWritable.get());
        }
    }

    // declared but never registered with the job; the default identity reducer is used instead
    public static class MyReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
    }

    public static void main(String[] args) throws Exception {
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyMapInner.class);
        FileInputFormat.addInputPath(job, new Path("f://source2"));
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // on a cluster the small table would normally come from HDFS via the distributed cache:
        // job.addCacheFile(new URI("hdfs://192.168.56.120:9000/1.csv"));
        // addFileToClassPath also registers the local file in the distributed cache, so setup() can find it
        job.addFileToClassPath(new Path("f://1.csv"));
        FileOutputFormat.setOutputPath(job, new Path("f://ff"));
        job.waitForCompletion(true);
    }
}
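On a real cluster, the commented-out addCacheFile call is the more usual way to ship the small table. A minimal sketch of that variant, assuming the category file has been uploaded to HDFS at the address used in the commented line (java.net.URI must be imported):

// in main(), instead of addFileToClassPath:
job.addCacheFile(new URI("hdfs://192.168.56.120:9000/1.csv"));

// in setup(): YARN localizes the cached file next to the task, so it can be opened by its base name
String localName = new Path(context.getCacheFiles()[0].getPath()).getName(); // "1.csv"
BufferedReader reader = new BufferedReader(new FileReader(localName));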
Reduce join
package com.njbdqn.reducejoin;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Reduce-side join: both inputs are large files.
 */
public class MyReduceInner {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // find out which input file the current line comes from
            String path = ((FileSplit) context.getInputSplit()).getPath().toString();
            String[] words = value.toString().split(",");
            // the join key sits in a different column in each file, so tag the values accordingly
            if (path.contains("1")) {
                // category file: key = category id, value = "type:" + category name
                context.write(new Text(words[0]), new Text("type:" + words[1]));
            } else {
                // goods file: key = category id (foreign key), value = "context:" + whole record
                context.write(new Text(words[1]), new Text("context:" + words[0] + ":" + words[1] + ":" + words[2]));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // records arrive already grouped by category id; handle the two kinds of tagged values
            // first copy the iterator's contents into a list (Hadoop reuses the value object, so make copies)
            List<Text> lst = new ArrayList<>();
            for (Text tx : values) {
                String s = tx.toString();
                lst.add(new Text(s));
            }
            // find the value tagged with "type:" to obtain the category name
            String typeInfo = "";
            for (Text tx : lst) {
                String val = tx.toString();
                if (val.contains("type")) {
                    typeInfo = val.substring(val.indexOf(":") + 1);
                    // remove this record from the list; break right away so the iterator is not used afterwards
                    lst.remove(tx);
                    break;
                }
            }
            // replace the foreign key in every remaining record with the category name
            for (Text tx : lst) {
                String[] infos = tx.toString().split(":");
                infos[2] = typeInfo;
                context.write(new Text(Arrays.toString(infos)), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyReduceInner.class);
        FileInputFormat.addInputPath(job, new Path("f://source3"));
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path("f://ff"));
        job.waitForCompletion(true);
    }
}
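To make the tag handling concrete, here is a small standalone simulation of what MyReduce.reduce() does for a single key, using made-up values; it only illustrates the merging logic and is not part of the job.

package com.njbdqn.reducejoin;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// standalone illustration of the merge performed in MyReduce.reduce() for one key
public class ReduceJoinDemo {
    public static void main(String[] args) {
        // made-up values the reducer might receive for category key "101"
        List<String> values = new ArrayList<>(Arrays.asList(
                "type:Books",
                "context:1001:101:HadoopGuide",
                "context:1002:101:HiveGuide"));

        // pull out the category name carried by the "type:" record
        String typeInfo = "";
        for (String val : values) {
            if (val.contains("type")) {
                typeInfo = val.substring(val.indexOf(":") + 1);
                values.remove(val);
                break;
            }
        }

        // replace the foreign key in the remaining records with the category name
        for (String val : values) {
            String[] infos = val.split(":");
            infos[2] = typeInfo;
            System.out.println(Arrays.toString(infos));
            // prints: [context, 1001, Books, HadoopGuide] then [context, 1002, Books, HiveGuide]
        }
    }
}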