I. Input: Datasets
1. user.txt (user data)
u001,senge,18,male,angelababy
u002,xiaoli,58,male,ruhua
u003,shuaishuai,16,female,chunge
u004,laoyang,28,female,zengge
u005,nana,24,female,huangbo
u006,dingding,19,male,taojiji
2. order1.txt (order data 1)
order011,u001
order012,u001
order033,u005
order034,u002
order055,u003
order066,u004
3. order2.txt (order data 2)
order001,u006
order002,u006
order003,u005
order004,u006
order005,u003
order006,u002
II. Expected output, in the form of a MySQL two-table "join" (only the order1.txt rows are shown here; the order2.txt rows join the same way)
order011,u001,senge,18,male,angelababy
order012,u001,senge,18,male,angelababy
order033,u005,nana,24,female,huangbo
order034,u002,xiaoli,58,male,ruhua
order055,u003,shuaishuai,16,female,chunge
order066,u004,laoyang,28,female,zengge
III. Approach
1. In the mapper, tell the two record types apart by input file name, parse each accordingly, tag the record, and emit it wrapped in a Bean;
2. Use uid (column 1 of the user file, column 2 of the order files) as the map output key, so that matching records meet in the same reduce call (see the example below);
3. In the reducer, use the tag to separate the user record from the order records, fill the user fields into each order record, then sort and write the joined rows.
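For example, for key u001 the reducer receives a single group that holds the tagged user record (the literal "userInfo" in the orderId field marks it) plus both of that user's orders:

u001 -> (userInfo, u001, senge, 18, male, angelababy)
        (order011, u001, "", 0, "", "")
        (order012, u001, "", 0, "", "")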
IV. Implementation
On Windows you first need an unpacked Hadoop distribution and the matching environment variables: point HADOOP_HOME at it, add %HADOOP_HOME%\bin to PATH, and make sure that bin directory contains winutils.exe, or the local job will fail to start.
Maven dependencies
<properties>
<hadoop.version>3.3.0</hadoop.version>
</properties>
<dependencies>
<!-- cglib provides the BeanCopier used in the reducer -->
<dependency>
<groupId>cglib</groupId>
<artifactId>cglib</artifactId>
<version>3.3.0</version>
</dependency>
<!-- Hadoop HDFS and MapReduce client dependencies -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
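As an aside, the single org.apache.hadoop:hadoop-client artifact bundles the common, HDFS, and MapReduce client jars, so it can replace the five Hadoop entries above if you prefer a shorter dependency list.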
Bean class: UserOrderBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// Joined record; implements Hadoop's Writable so it can travel from mapper to reducer
public class UserOrderBean implements Writable {
private String orderId;
private String uid;
private String userName;
private Integer age;
private String sex;
private String dishName;
public UserOrderBean() {
}
public UserOrderBean(String orderId, String uid, String userName, Integer age, String sex, String dishName) {
this.orderId = orderId;
this.uid = uid;
this.userName = userName;
this.age = age;
this.sex = sex;
this.dishName = dishName;
}
@Override
public void write(DataOutput out) throws IOException {
// field order must match readFields(), and every field must be non-null, since writeUTF/writeInt cannot encode null
out.writeUTF(orderId);
out.writeUTF(uid);
out.writeUTF(userName);
out.writeInt(age);
out.writeUTF(sex);
out.writeUTF(dishName);
}
@Override
public void readFields(DataInput in) throws IOException {
orderId = in.readUTF();
uid = in.readUTF();
userName = in.readUTF();
age = in.readInt();
sex = in.readUTF();
dishName = in.readUTF();
}
@Override
public String toString() {
return orderId + "," + uid + "," + userName + "," + age + "," + sex + "," + dishName;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public Integer getAge() {
return age;
}
public void setAge(Integer age) {
this.age = age;
}
public String getSex() {
return sex;
}
public void setSex(String sex) {
this.sex = sex;
}
public String getDishName() {
return dishName;
}
public void setDishName(String dishName) {
this.dishName = dishName;
}
}
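As a quick sanity check of the serialization (a standalone sketch; the class name UserOrderBeanTest is made up here and is not part of the job), the bean should survive a write/readFields round trip unchanged:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class UserOrderBeanTest {
    public static void main(String[] args) throws IOException {
        UserOrderBean source = new UserOrderBean("order011", "u001", "senge", 18, "male", "angelababy");
        // serialize into an in-memory buffer, using the same calls Hadoop makes against the wire
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        source.write(new DataOutputStream(buffer));
        // deserialize into a fresh instance and print it
        UserOrderBean copy = new UserOrderBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(copy); // expected: order011,u001,senge,18,male,angelababy
    }
}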
Driver class: UserOrderDriver
import com.hermesfuxi.hdfs.application.userorder.domain.UserOrderBean;
import com.hermesfuxi.hdfs.utils.LoggerUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import net.sf.cglib.beans.BeanCopier;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class UserOrderDriver {
public static void main(String[] args) throws Exception {
Job job = Job.getInstance();
job.setJarByClass(UserOrderDriver.class);
job.setNumReduceTasks(1);
job.setMapperClass(UserOrderMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(UserOrderBean.class);
job.setReducerClass(UserOrderReducer.class);
job.setOutputKeyClass(UserOrderBean.class); // must match the reducer's output key type
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path("E:\\join\\input"));
FileOutputFormat.setOutputPath(job, new Path("E:\\join\\output"));
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
private static class UserOrderMapper extends Mapper<LongWritable, Text, Text, UserOrderBean> {
private String fileName;
private final Text text = new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) context.getInputSplit();
fileName = fileSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] strings = value.toString().split(",");
String uidKey;
UserOrderBean userOrderBean;
// Tell user records from order records by file name; a user bean carries the sentinel "userInfo" in its orderId field
if (fileName.startsWith("user") && strings.length >= 5) {
uidKey = strings[0];
userOrderBean = new UserOrderBean("userInfo", strings[0], strings[1], Integer.parseInt(strings[2]), strings[3], strings[4]);
} else if (strings.length >= 2) {
uidKey = strings[1];
// Order records lack the user fields, so set non-null defaults (Writable serialization cannot handle null)
userOrderBean = new UserOrderBean(strings[0], strings[1], "", 0, "", "");
} else {
LoggerUtils.error("数据或系统解析错误, data: " + value.toString());
return;
}
// Key by uid so that the user record and the order records of the same user land in one reduce group
text.set(uidKey);
context.write(text, userOrderBean);
}
}
private static class UserOrderReducer extends Reducer<Text, UserOrderBean, UserOrderBean, NullWritable> {
private static final BeanCopier BEAN_COPIER = BeanCopier.create(UserOrderBean.class, UserOrderBean.class, false);
@Override
protected void reduce(Text key, Iterable<UserOrderBean> values, Context context) throws IOException, InterruptedException {
List<UserOrderBean> userOrderBeanList = new ArrayList<>();
String userName = null;
Integer age = null;
String sex = null;
String dishName = null;
for (UserOrderBean value : values) {
// Within one uid group, lift the user fields out of the tagged user record
if ("userInfo".equals(value.getOrderId())) {
userName = value.getUserName();
age = value.getAge();
sex = value.getSex();
dishName = value.getDishName();
} else {
// Hadoop reuses the same value instance across the iteration, so cache a copy rather than the reference
UserOrderBean orderInfoBean = new UserOrderBean();
BEAN_COPIER.copy(value, orderInfoBean, null);
userOrderBeanList.add(orderInfoBean);
}
}
if (StringUtils.isBlank(userName)) {
LoggerUtils.error("Bad data: no user record found for uid " + key.toString());
return; // drop orders with no matching user (inner-join semantics)
}
// All uids in a group are identical, so order the cached rows by orderId before writing
userOrderBeanList.sort(Comparator.comparing(UserOrderBean::getOrderId));
for (UserOrderBean userOrderBean : userOrderBeanList) {
userOrderBean.setUserName(userName);
userOrderBean.setAge(age);
userOrderBean.setSex(sex);
userOrderBean.setDishName(dishName);
context.write(userOrderBean, NullWritable.get());
}
}
}
}
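Running main() from the IDE uses Hadoop's local job runner. One practical note: the output directory must not exist before the run, otherwise FileOutputFormat fails the job with a FileAlreadyExistsException, so delete E:\join\output between runs.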
V. Result
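With a single reducer, the whole result lands in one file, typically E:\join\output\part-r-00000: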
order011,u001,senge,18,male,angelababy
order012,u001,senge,18,male,angelababy
order006,u002,xiaoli,58,male,ruhua
order034,u002,xiaoli,58,male,ruhua
order005,u003,shuaishuai,16,female,chunge
order055,u003,shuaishuai,16,female,chunge
order066,u004,laoyang,28,female,zengge
order003,u005,nana,24,female,huangbo
order033,u005,nana,24,female,huangbo
order001,u006,dingding,19,male,taojiji
order002,u006,dingding,19,male,taojiji
order004,u006,dingding,19,male,taojiji