MRUnit: a plain mrunit jar does not turn up on mvnrepository, but version 1.1.0 is available; the artifact is published only with a hadoop1/hadoop2 classifier, so the classifier must be declared in the dependency (see the pom below).
1. pom.xml
<properties>
<mrunit.version>1.1.0</mrunit.version>
<mockito.version>1.10.19</mockito.version>
<hadoop.version>2.6.4</hadoop.version>
</properties>
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>${mrunit.version}</version>
<!-- MRUnit ships only classified jars; use hadoop2 for Hadoop 2.x -->
<classifier>hadoop2</classifier>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>${mockito.version}</version>
</dependency>
<!-- Hadoop-related jars -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>${java.version}</version>
<scope>system</scope>
<!-- Maven resolves environment variables through the env. prefix -->
<systemPath>${env.JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
2. Custom Writable
For examples that use Hadoop's built-in Writable classes, see the separate article on using the MapReduce unit-testing tool MRUnit; the code below uses the BinaryComparable comparator.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Date;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import com.dzmsoft.framework.base.util.DateUtil;
/**
 * A custom Writable class; only a few key fields are annotated here.
 * @author dzm
 *
 */
public class InviteInfoWritable extends BinaryComparable
implements WritableComparable<BinaryComparable> {
private String id;
/**
 * Statistics date
 */
private Date countDate;
/**
 * User account
 */
private String account;
private String username;
/**
 * Province
 */
private String provinceId;
/**
 * Count
 */
private Integer count;
private String departmentName;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Date getCountDate() {
return countDate;
}
public void setCountDate(Date countDate) {
this.countDate = countDate;
}
public String getAccount() {
return account;
}
public void setAccount(String account) {
this.account = account;
}
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public String getProvinceId() {
return provinceId;
}
public void setProvinceId(String provinceId) {
this.provinceId = provinceId;
}
public Integer getCount() {
return count;
}
public void setCount(Integer count) {
this.count = count;
}
public String getDepartmentName() {
return departmentName;
}
public void setDepartmentName(String departmentName) {
this.departmentName = departmentName;
}
/**
 * Use this constructor when the object is used as a mapper key
 */
public InviteInfoWritable() {
super();
bytes = EMPTY_BYTES;
}
private static final byte[] EMPTY_BYTES = new byte[0];
private byte[] bytes;
private int length;
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(DateUtil.formatDate(this.getCountDate()));
out.writeUTF(this.getAccount());
out.writeUTF(this.getProvinceId());
}
@Override
public void readFields(DataInput in) throws IOException {
this.setCountDate(DateUtil.parseDate(in.readUTF()));
this.setAccount(in.readUTF());
this.setProvinceId(in.readUTF());
// When Hadoop compares keys it first creates two empty key objects, fills them via readFields, and then compares them.
// The bytes must therefore be generated right here; otherwise they would be empty at comparison time.
ByteBuffer bb = Text.encode(countDate + account, true);
bytes = bb.array();
length = bb.limit();
}
@Override
public int getLength() {
return length;
}
@Override
public byte[] getBytes() {
return bytes;
}
}
As you can see, org.apache.hadoop.io.BinaryComparable exposes byte-level comparison through abstract methods, which is why readFields() above assigns the bytes manually.
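For reference, here is the relevant part of org.apache.hadoop.io.BinaryComparable, slightly trimmed from the Hadoop source. compareTo() is what the framework uses to sort and group map output keys, so whatever getBytes() returns effectively defines the grouping key:

import org.apache.hadoop.io.WritableComparator;

public abstract class BinaryComparable implements Comparable<BinaryComparable> {
    /** Length of the byte representation used for comparison */
    public abstract int getLength();
    /** The bytes that comparisons run over */
    public abstract byte[] getBytes();
    /** Lexicographic byte-by-byte comparison */
    @Override
    public int compareTo(BinaryComparable other) {
        if (this == other) {
            return 0;
        }
        return WritableComparator.compareBytes(getBytes(), 0, getLength(),
                other.getBytes(), 0, other.getLength());
    }
}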
3. Defining the MapReduce job
import java.io.Serializable;
public class Business<T> implements Serializable {
private static final long serialVersionUID = 7056019679965982739L;
/**
 * Business type.
 * Currently there is only one: userDataAcquisitionInfoUpload
 */
private String business;
private T params;
public String getBusiness() {
return business;
}
public void setBusiness(String business) {
this.business = business;
}
public T getParams() {
return params;
}
public void setParams(T params) {
this.params = params;
}
}
public class InviteInfoParam implements Serializable {
private static final long serialVersionUID = 6791671668489980464L;
/**
 * Username
 */
private String username;
/**
 * Province ID
 */
private String provinceId;
/**
 * Subscribe time
 */
private String subscribe_time;
/**
 * OpenID
 */
private String openId;
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public String getProvinceId() {
return provinceId;
}
public void setProvinceId(String provinceId) {
this.provinceId = provinceId;
}
public String getSubscribe_time() {
return subscribe_time;
}
public void setSubscribe_time(String subscribe_time) {
this.subscribe_time = subscribe_time;
}
public String getOpenId() {
return openId;
}
public void setOpenId(String openId) {
this.openId = openId;
}
}
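Put together, each input line is one JSON object of the following shape; this example is the same record used in the MRUnit test in section 4:

{
  "business": "wcnInviteInfoUpload",
  "params": {
    "username": "wanghui",
    "provinceId": "789",
    "subscribe_time": "2017-02-10 00:04:28",
    "openId": "ou7nSs1fX2IWh3iXBGYbTMVxDQy2"
  }
}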
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.dzmsoft.dsj.hadoop.dto.Business;
import com.dzmsoft.dsj.hadoop.dto.InviteInfoParam;
import com.dzmsoft.dsj.hadoop.util.IntegerDefault0Adapter;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;
import com.dzmsoft.framework.base.util.StringUtil;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
public class InviteInfoCountJob extends Configured implements Tool {
private static Logger logger = LoggerFactory.getLogger(InviteInfoCountJob.class);
private static Gson gson;
static{
gson = new GsonBuilder().setDateFormat("yyyy-MM-dd HH:mm:ss")
.registerTypeAdapter(Integer.class, new IntegerDefault0Adapter())
.registerTypeAdapter(int.class, new IntegerDefault0Adapter())
.create();
}
public static class InviteInfoCountMapper extends Mapper<LongWritable, Text, InviteInfoWritable, Text> {
@Override
public void map(LongWritable key, Text line, Context context) throws InterruptedException, IOException {
logger.debug("inpit line:{}", line);
Business<InviteInfoParam> item = gson.fromJson(line.toString(),
new TypeToken<Business<InviteInfoParam>>() {
}.getType());
if(!StringUtil.isBlank(item.getParams().getUsername())) {
InviteInfoWritable outkey = new InviteInfoWritable();
outkey.setAccount(item.getParams().getUsername());
outkey.setProvinceId(item.getParams().getProvinceId());
// Keep only the date part (yyyy-MM-dd)
outkey.setCountDate(DateUtil.parseDate(item.getParams().getSubscribe_time().substring(0, 10)));
context.write(outkey, new Text(item.getParams().getOpenId()));
}
}
}
public static class InviteInfoCountReducer
extends Reducer<InviteInfoWritable, Text, InviteInfoWritable, Text> {
// Data produced during execution cannot be passed around via static variables; it must be put into the config
/**
 * Count the number of records per key
 */
@Override
public void reduce(InviteInfoWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Each job runs as a new process, so crmSysUserList cannot be shared; only the config is shared between jobs
// Deduplicate the openIds
Set<String> set = new HashSet<String>();
for (Text value : values) {
set.add(value.toString());
}
InviteInfoWritable bean = new InviteInfoWritable();
bean.setCountDate(key.getCountDate());
bean.setAccount(key.getAccount());
bean.setProvinceId(key.getProvinceId());
bean.setId(StringUtil.getUuidString());
bean.setCount(set.size());
context.write(bean, null);
}
}
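// As the comments above note, per-run state cannot be passed between tasks
// through static fields (each task runs in its own JVM); only the Configuration
// travels with the job. A hypothetical sketch, with the property name
// "dms.crm.users" made up for illustration:
//
//   // in run(), before submitting the job:
//   conf.set("dms.crm.users", gson.toJson(crmSysUserList));
//
//   // in the reducer, read it back in setup():
//   @Override
//   protected void setup(Context context) {
//       String json = context.getConfiguration().get("dms.crm.users");
//   }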
public static void main(String[] args) throws Exception {
// The recreate parameter creates the table schema (optional)
// Configuration conf = MyConfiguration.createWithDB();
// int res = ToolRunner.run(conf, new InviteInfoCountJob(), args);
// System.exit(res);
}
public final int run(final String[] args)
throws IOException, InterruptedException, ClassNotFoundException, SQLException {
// Configuration conf = MyConfiguration.createWithDB();
Configuration conf = super.getConf();
String inputPath = args[0]; // input path, e.g. taken from the command-line arguments
Job job = Job.getInstance(conf, "dms_invite_info_count");
job.setJarByClass(InviteInfoCountJob.class);
job.setMapperClass(InviteInfoCountMapper.class);
// Set the map output key/value types
job.setMapOutputKeyClass(InviteInfoWritable.class);
job.setMapOutputValueClass(Text.class);
// Set the input path; addInputPath may be called multiple times
FileInputFormat.addInputPath(job, new Path(inputPath));
// Recurse into subdirectories
FileInputFormat.setInputDirRecursive(job, true);
// Write the output to MySQL
job.setReducerClass(InviteInfoCountReducer.class);
job.setOutputFormatClass(DBOutputFormat.class);
DBOutputFormat.setOutput(job, "dms_invite_info_count", "id", "account", "username", "department_name", "province_id", "count", "count_date"); // the arguments after the table name are its column names
job.setNumReduceTasks(7);// number of parallel reduce tasks (independent of the column count)
boolean result = job.waitForCompletion(true);
logger.info("job {} is {}!", job.getJobName(), result ? "success" : "failed");
return result ? 0 : 1;
}
}
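One caveat about the DBOutputFormat setup above: the reduce output key is what gets written to the database, and DBOutputFormat requires that class to also implement org.apache.hadoop.mapreduce.lib.db.DBWritable. The InviteInfoWritable listing in section 2 omits that part (it only shows the key fields), so the sketch below is an assumption about what the full class would need; the statement indexes must line up with the column list passed to DBOutputFormat.setOutput():

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;

// Added to InviteInfoWritable, which would also declare "implements DBWritable"
public void write(PreparedStatement statement) throws SQLException {
    // Column order: id, account, username, department_name, province_id, count, count_date
    statement.setString(1, getId());
    statement.setString(2, getAccount());
    statement.setString(3, getUsername());
    statement.setString(4, getDepartmentName());
    statement.setString(5, getProvinceId());
    statement.setInt(6, getCount());
    statement.setTimestamp(7, new Timestamp(getCountDate().getTime()));
}

public void readFields(ResultSet resultSet) throws SQLException {
    // Only needed when reading with DBInputFormat; unused for output
}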
4. MRUnit unit test case
import java.io.IOException;
import java.text.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;
import com.dzmsoft.dsj.hadoop.job.InviteInfoCountJob;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;
public class JobMrUnitTest {
Configuration conf;
MapDriver<LongWritable, Text, InviteInfoWritable, Text> mapDriver;
ReduceDriver<InviteInfoWritable, Text, InviteInfoWritable, Text> reduceDriver;
MapReduceDriver<LongWritable, Text, InviteInfoWritable, Text, InviteInfoWritable, Text> mapReduceDriver;
@Before
public void init(){
conf = new Configuration();
InviteInfoCountJob.InviteInfoCountMapper mapper = new InviteInfoCountJob.InviteInfoCountMapper();
InviteInfoCountJob.InviteInfoCountReducer reducer = new InviteInfoCountJob.InviteInfoCountReducer();
mapDriver = MapDriver.newMapDriver(mapper);
reduceDriver = ReduceDriver.newReduceDriver(reducer);
mapReduceDriver = MapReduceDriver.newMapReduceDriver(mapper, reducer);
}
@Test
public void test_mapper() throws IOException{
String text = "{\"business\":\"wcnInviteInfoUpload\",\"params\":{\"username\":\"wanghui\",\"provinceId\":\"789\",\"subscribe_time\":\"2017-02-10 00:04:28\",\"openId\":\"ou7nSs1fX2IWh3iXBGYbTMVxDQy2\"}}";
mapDriver.withInput(new LongWritable(), new Text(text));
// The grouping key is account + countDate; provinceId is carried along for later use
InviteInfoWritable outkey = new InviteInfoWritable();
outkey.setAccount("wanghui");
outkey.setProvinceId("789");
try {
outkey.setCountDate(DateUtil.parseDate("2017-02-10", "yyyy-MM-dd"));
} catch (ParseException e) {
e.printStackTrace();
}
// Assert the expected output
mapDriver.withOutput(outkey, new Text("ou7nSs1fX2IWh3iXBGYbTMVxDQy2"));
// Run the mapper
mapDriver.runTest();
}
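// A second, negative test is cheap to add: when username is blank the mapper
// should emit nothing, and runTest() with no expected output registered
// verifies exactly that (assuming StringUtil.isBlank treats "" as blank).
@Test
public void test_mapper_blank_username() throws IOException {
String text = "{\"business\":\"wcnInviteInfoUpload\",\"params\":{\"username\":\"\",\"provinceId\":\"789\",\"subscribe_time\":\"2017-02-10 00:04:28\",\"openId\":\"ou7nSs1fX2IWh3iXBGYbTMVxDQy2\"}}";
mapDriver.withInput(new LongWritable(), new Text(text));
// No withOutput() calls: the test fails if the mapper emits anything
mapDriver.runTest();
}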
}