两表关联后,并对某字段进行排序
测试数据:
address.txt
#地址ID 地址名称
1 北京
2 上海
3 广州
employee.txt
#人员ID 人员名称 地址ID
1 张三 1
2 李四 2
3 王五 1
4 赵六 3
5 马七 3
首先,bean实体类:User
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Value record shuffled from the mapper to the reducer in the join job.
 *
 * <p>One instance holds either a city row (only {@code cityNo} and {@code cityName}
 * populated) or an employee row ({@code userNo}, {@code userName}, {@code cityNo}
 * populated). All fields start as the empty string rather than {@code null}, so a
 * partially populated instance can still be serialized by {@link #write(DataOutput)}.
 */
public class User implements WritableComparable<User> {
    private String userNo = "";
    private String userName = "";
    private String cityNo = "";
    private String cityName = "";

    /** Creates an instance with every field set to the empty string. */
    public User() {
    }

    /**
     * Creates an instance from explicit field values.
     *
     * @param userNo   employee id
     * @param userName employee name
     * @param cityNo   city (address) id
     * @param cityName city (address) name
     */
    public User(String userNo, String userName, String cityNo, String cityName) {
        this.userNo = userNo;
        this.userName = userName;
        this.cityNo = cityNo;
        this.cityName = cityName;
    }

    /**
     * Copy constructor: takes a snapshot of another instance's four fields.
     *
     * @param user the instance to copy
     */
    public User(User user) {
        this(user.getUserNo(), user.getUserName(), user.getCityNo(), user.getCityName());
    }

    public String getUserNo() {
        return userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getCityNo() {
        return cityNo;
    }

    public void setCityNo(String cityNo) {
        this.cityNo = cityNo;
    }

    public String getCityName() {
        return cityName;
    }

    public void setCityName(String cityName) {
        this.cityName = cityName;
    }

    /**
     * Serializes the four fields in the order userNo, userName, cityNo, cityName.
     * {@link #readFields(DataInput)} reads them back in the same order.
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(userNo);
        dataOutput.writeUTF(userName);
        dataOutput.writeUTF(cityNo);
        dataOutput.writeUTF(cityName);
    }

    /** Restores the four fields in the order written by {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        userNo = dataInput.readUTF();
        userName = dataInput.readUTF();
        cityNo = dataInput.readUTF();
        cityName = dataInput.readUTF();
    }

    /**
     * Always reports equality: this class defines no ordering of its own.
     *
     * @return always 0
     */
    @Override
    public int compareTo(User o) {
        return 0;
    }

    /** Renders the output line: userNo, userName and cityName separated by single spaces. */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(userNo).append(" ").append(userName).append(" ").append(cityName);
        return line.toString();
    }
}
UserKey:
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite map-output key for the join: a city id plus a flag telling whether the
 * accompanying value is the city record or an employee record.
 *
 * <p>Keys sort by {@code cityNo} ascending; within one city the primary (city) key
 * sorts before the non-primary (employee) keys, so the city record is the first one
 * in its group.
 */
public class UserKey implements WritableComparable<UserKey> {
    private int cityNo;
    private boolean isPrimary; // true: city record / false: employee record

    /** No-argument constructor; fields are filled in later by setters or {@link #readFields}. */
    public UserKey() {
    }

    /**
     * @param cityNo    city (address) id
     * @param isPrimary {@code true} for a city record, {@code false} for an employee record
     */
    public UserKey(int cityNo, boolean isPrimary) {
        this.cityNo = cityNo;
        this.isPrimary = isPrimary;
    }

    /**
     * Hashes on {@code cityNo} only. Partitioning uses the key's hashCode to decide
     * which reducer receives a record, so the city key and the employee keys of the
     * same city must produce the same value.
     */
    @Override
    public int hashCode() {
        return this.cityNo;
    }

    /**
     * Two keys are equal when both {@code cityNo} and the primary flag match, which is
     * exactly when {@link #compareTo(UserKey)} returns 0. Equal keys share a
     * {@code cityNo}, so this stays consistent with {@link #hashCode()}.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof UserKey)) {
            return false;
        }
        UserKey other = (UserKey) obj;
        return this.cityNo == other.cityNo && this.isPrimary == other.isPrimary;
    }

    /**
     * Orders by {@code cityNo}, then puts the primary key first.
     *
     * @return -1, 0 or 1
     */
    @Override
    public int compareTo(UserKey o) {
        // Integer.compare instead of subtraction: a difference of two ints can overflow
        // and flip the sign when the operands have opposite signs.
        int byCity = Integer.compare(this.cityNo, o.getCityNo());
        if (byCity != 0) {
            return byCity;
        }
        if (this.isPrimary == o.isPrimary()) {
            return 0;
        }
        return this.isPrimary ? -1 : 1;
    }

    /** Writes {@code cityNo} (int) followed by the primary flag (boolean). */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.cityNo);
        dataOutput.writeBoolean(this.isPrimary);
    }

    /** Reads the fields in the order written by {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.cityNo = dataInput.readInt();
        this.isPrimary = dataInput.readBoolean();
    }

    public int getCityNo() {
        return cityNo;
    }

    public void setCityNo(int cityNo) {
        this.cityNo = cityNo;
    }

    public boolean isPrimary() {
        return isPrimary;
    }

    public void setPrimary(boolean primary) {
        isPrimary = primary;
    }
}
GroupComparator:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator for the reduce phase: two {@link UserKey}s belong to the same
 * group when their {@code cityNo} is equal; the primary flag is ignored.
 */
public class GroupComparator extends WritableComparator {

    /** Registers {@link UserKey} as the key class and passes {@code true} as the second argument. */
    public GroupComparator() {
        super(UserKey.class, true);
    }

    /**
     * Compares the two keys by {@code cityNo} only.
     *
     * @return -1, 0 or 1 according to the numeric order of the two city ids
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        UserKey left = (UserKey) a;
        UserKey right = (UserKey) b;
        return Integer.compare(left.getCityNo(), right.getCityNo());
    }
}
Mapper:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Tags every input line as a city record or an employee record and emits it keyed by
 * city id.
 *
 * <p>Lines are split on the tab character. A line with exactly two fields is treated
 * as a city row ({@code cityNo, cityName}) and emitted with a primary key. A line with
 * three or more fields is treated as an employee row ({@code userNo, userName, cityNo})
 * and emitted with a non-primary key. Lines starting with {@code #} and blank lines
 * are skipped.
 */
public class JoinThreeMapper extends Mapper<LongWritable, Text, UserKey, User> {
    private static final String FIELD_SEPARATOR = "\t";
    private static final String COUNTER_GROUP = "JoinThreeMapper";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context sink for the tagged record
     * @throws NumberFormatException if the city id field is not a valid integer
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Blank lines carry no record; previously they fell into the employee branch
        // and failed with ArrayIndexOutOfBoundsException.
        if (line.trim().isEmpty() || line.startsWith("#")) {
            return;
        }

        String[] arr = line.split(FIELD_SEPARATOR);
        if (arr.length == 2) { // city record
            User city = new User();
            city.setCityNo(arr[0]);
            city.setCityName(arr[1]);
            context.write(new UserKey(Integer.parseInt(arr[0]), true), city);
        } else if (arr.length >= 3) { // employee record
            User user = new User();
            user.setUserNo(arr[0]);
            user.setUserName(arr[1]);
            user.setCityNo(arr[2]);
            context.write(new UserKey(Integer.parseInt(arr[2]), false), user);
        } else {
            // A single-field line matches neither layout; count it instead of crashing the task.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
        }
    }
}
Reducer:
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Completes each employee record with the name of its city and writes it out.
 *
 * <p>One {@code reduce} call receives all records of one city id (grouping ignores the
 * primary flag). Because {@link UserKey#compareTo(UserKey)} sorts the primary key
 * first, the city record, when present, arrives before the employee records.
 */
public class JoinThreeReducer extends Reducer<UserKey, User, NullWritable, Text> {
    private final Text outValue = new Text();

    /**
     * @param key     key of the record currently being iterated; the framework updates
     *                this object in place as the value iterator advances, so
     *                {@code key.isPrimary()} describes the current value
     * @param userues all city/employee records sharing one city id
     * @param context sink for the joined employee lines
     */
    @Override
    protected void reduce(UserKey key, Iterable<User> userues, Context context) throws IOException, InterruptedException {
        // Keep only the name: the framework reuses the value object between iterations.
        String cityName = null;
        for (User user : userues) {
            if (key.isPrimary()) {
                // Decide by the key flag rather than by position, so an employee is never
                // mistaken for the city record when the city row is missing.
                cityName = user.getCityName();
            } else {
                if (cityName != null) {
                    user.setCityName(cityName);
                }
                // With no city row for this id the employee is still emitted, with the
                // city name it already carries (empty as produced by the mapper).
                outValue.set(user.toString());
                context.write(NullWritable.get(), outValue);
            }
        }
    }
}
job:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver for the join job: replaces each employee's address id with the address name.
 * Output format: employee id, name, address. Optimization: records are sorted by city
 * so that the city record reaches the reducer ahead of its employees.
 */
public class JobMain {
    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
     *             an existing output path is deleted recursively before the job starts
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: JobMain <input path> <output path>");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Join job3");
        job.setJarByClass(JobMain.class);

        job.setMapperClass(JoinThreeMapper.class);
        job.setMapOutputKeyClass(UserKey.class);
        job.setMapOutputValueClass(User.class);

        job.setReducerClass(JoinThreeReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Group reduce input by city id only, ignoring the primary flag.
        job.setGroupingComparatorClass(GroupComparator.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        Path outDirPath = new Path(args[1]);
        // Resolve the filesystem from the path itself: FileSystem.get(conf) returns the
        // default filesystem, which rejects a path that lives on a different one.
        FileSystem fs = outDirPath.getFileSystem(conf);
        if (fs.exists(outDirPath)) {
            fs.delete(outDirPath, true);
        }
        FileOutputFormat.setOutputPath(job, outDirPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
结果:
3 王五 北京
1 张三 北京
2 李四 上海
5 马七 广州
4 赵六 广州