MRUnit: a plain mrunit jar does not turn up on mvnrepository, but version 1.1.0 is available; the artifact is published only with a hadoop1/hadoop2 classifier, so the classifier must be declared in the dependency (see the pom below).
1. pom.xml
<properties>
<mrunit.version>1.1.0</mrunit.version>
<mockito.version>1.10.19</mockito.version>
<hadoop.version>2.6.4</hadoop.version>
</properties>
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>${mrunit.version}</version>
<!-- MRUnit ships only classified jars; use hadoop2 for Hadoop 2.x -->
<classifier>hadoop2</classifier>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>${mockito.version}</version>
</dependency>
<!-- Hadoop-related jars -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>${java.version}</version>
<scope>system</scope>
<!-- Maven resolves environment variables through the env. prefix -->
<systemPath>${env.JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
2. Custom Writable
For examples that use Hadoop's built-in Writable classes, see the separate article on using the MapReduce unit-testing tool MRUnit; the code below uses the BinaryComparable comparator.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Date;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import com.dzmsoft.framework.base.util.DateUtil;
/**
 * A custom Writable class; only a few key fields are annotated here.
 * @author dzm
 *
 */
public class InviteInfoWritable extends BinaryComparable
implements WritableComparable<BinaryComparable> {
private String id;
/**
 * Statistics date
 */
private Date countDate;
/**
 * User account
 */
private String account;
private String username;
/**
 * Province
 */
private String provinceId;
/**
 * Count
 */
private Integer count;
private String departmentName;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Date getCountDate() {
return countDate;
}
public void setCountDate(Date countDate) {
this.countDate = countDate;
}
public String getAccount() {
return account;
}
public void setAccount(String account) {
this.account = account;
}
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public String getProvinceId() {
return provinceId;
}
public void setProvinceId(String provinceId) {
this.provinceId = provinceId;
}
public Integer getCount() {
return count;
}
public void setCount(Integer count) {
this.count = count;
}
public String getDepartmentName() {
return departmentName;
}
public void setDepartmentName(String departmentName) {
this.departmentName = departmentName;
}
/**
 * Use this constructor when the object is used as a mapper key
 */
public InviteInfoWritable() {
super();
bytes = EMPTY_BYTES;
}
private static final byte[] EMPTY_BYTES = new byte[0];
private byte[] bytes;
private int length;
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(DateUtil.formatDate(this.getCountDate()));
out.writeUTF(this.getAccount());
out.writeUTF(this.getProvinceId());
}
@Override
public void readFields(DataInput in) throws IOException {
this.setCountDate(DateUtil.parseDate(in.readUTF()));
this.setAccount(in.readUTF());
this.setProvinceId(in.readUTF());
// When Hadoop compares keys it first creates two empty key objects, fills them via readFields, and then compares them.
// The bytes must therefore be generated right here; otherwise they would be empty at comparison time.
ByteBuffer bb = Text.encode(countDate + account, true);
bytes = bb.array();
length = bb.limit();
}
@Override
public int getLength() {
return length;
}
@Override
public byte[] getBytes() {
return bytes;
}
}
As you can see, org.apache.hadoop.io.BinaryComparable exposes byte-level comparison through abstract methods, which is why readFields() above assigns the bytes manually.
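For reference, here is the relevant part of org.apache.hadoop.io.BinaryComparable, slightly trimmed from the Hadoop source. compareTo() is what the framework uses to sort and group map output keys, so whatever getBytes() returns effectively defines the grouping key:

import org.apache.hadoop.io.WritableComparator;

public abstract class BinaryComparable implements Comparable<BinaryComparable> {
    /** Length of the byte representation used for comparison */
    public abstract int getLength();
    /** The bytes that comparisons run over */
    public abstract byte[] getBytes();
    /** Lexicographic byte-by-byte comparison */
    @Override
    public int compareTo(BinaryComparable other) {
        if (this == other) {
            return 0;
        }
        return WritableComparator.compareBytes(getBytes(), 0, getLength(),
                other.getBytes(), 0, other.getLength());
    }
}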
3. Defining the MapReduce job
import java.io.Serializable;
public class Business<T> implements Serializable {
private static final long serialVersionUID = 7056019679965982739L;
/**
 * Business type.
 * Currently there is only one: userDataAcquisitionInfoUpload
 */
private String business;
private T params;
public String getBusiness() {
return business;
}
public void setBusiness(String business) {
this.business = business;
}
public T getParams() {
return params;
}
public void setParams(T params) {
this.params = params;
}
}
public class InviteInfoParam implements Serializable {
private static final long serialVersionUID = 6791671668489980464L;
/**
 * Username
 */
private String username;
/**
 * Province ID
 */
private String provinceId;
/**
 * Subscribe time
 */
private String subscribe_time;
/**
 * OpenID
 */
private String openId;
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public String getProvinceId() {
return provinceId;
}
public void setProvinceId(String provinceId) {
this.provinceId = provinceId;
}
public String getSubscribe_time() {
return subscribe_time;
}
public void setSubscribe_time(String subscribe_time) {
this.subscribe_time = subscribe_time;
}
public String getOpenId() {
return openId;
}
public void setOpenId(String openId) {
this.openId = openId;
}
}
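Put together, each input line is one JSON object of the following shape; this example is the same record used in the MRUnit test in section 4:

{
  "business": "wcnInviteInfoUpload",
  "params": {
    "username": "wanghui",
    "provinceId": "789",
    "subscribe_time": "2017-02-10 00:04:28",
    "openId": "ou7nSs1fX2IWh3iXBGYbTMVxDQy2"
  }
}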
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.dzmsoft.dsj.hadoop.dto.Business;
import com.dzmsoft.dsj.hadoop.dto.InviteInfoParam;
import com.dzmsoft.dsj.hadoop.util.IntegerDefault0Adapter;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;
import com.dzmsoft.framework.base.util.StringUtil;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
public class InviteInfoCountJob extends Configured implements Tool {
private static Logger logger = LoggerFactory.getLogger(InviteInfoCountJob.class);
private static Gson gson;
static{
gson = new GsonBuilder().setDateFormat("yyyy-MM-dd HH:mm:ss")
.registerTypeAdapter(Integer.class, new IntegerDefault0Adapter())
.registerTypeAdapter(int.class, new IntegerDefault0Adapter())
.create();
}
public static class InviteInfoCountMapper extends Mapper<LongWritable, Text, InviteInfoWritable, Text> {
@Override
public void map(LongWritable key, Text line, Context context) throws InterruptedException, IOException {
logger.debug("inpit line:{}", line);
Business<InviteInfoParam> item = gson.fromJson(line.toString(),
new TypeToken<Business<InviteInfoParam>>() {
}.getType());
if(!StringUtil.isBlank(item.getParams().getUsername())) {
InviteInfoWritable outkey = new InviteInfoWritable();
outkey.setAccount(item.getParams().getUsername());
outkey.setProvinceId(item.getParams().getProvinceId());
// Keep only the date part (yyyy-MM-dd)
outkey.setCountDate(DateUtil.parseDate(item.getParams().getSubscribe_time().substring(0, 10)));
context.write(outkey, new Text(item.getParams().getOpenId()));
}
}
}
public static class InviteInfoCountReducer
extends Reducer<InviteInfoWritable, Text, InviteInfoWritable, Text> {
// Data produced during execution cannot be passed around via static variables; it must be put into the config
/**
 * Count the number of records per key
 */
@Override
public void reduce(InviteInfoWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Each job runs as a new process, so crmSysUserList cannot be shared; only the config is shared between jobs
// Deduplicate the openIds
Set<String> set = new HashSet<String>();
for (Text value : values) {
set.add(value.toString());
}
InviteInfoWritable bean = new InviteInfoWritable();
bean.setCountDate(key.getCountDate());
bean.setAccount(key.getAccount());
bean.setProvinceId(key.getProvinceId());
bean.setId(StringUtil.getUuidString());
bean.setCount(set.size());
context.write(bean, null);
}
}
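// As the comments above note, per-run state cannot be passed between tasks
// through static fields (each task runs in its own JVM); only the Configuration
// travels with the job. A hypothetical sketch, with the property name
// "dms.crm.users" made up for illustration:
//
//   // in run(), before submitting the job:
//   conf.set("dms.crm.users", gson.toJson(crmSysUserList));
//
//   // in the reducer, read it back in setup():
//   @Override
//   protected void setup(Context context) {
//       String json = context.getConfiguration().get("dms.crm.users");
//   }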
public static void main(String[] args) throws Exception {
// The recreate parameter creates the table schema (optional)
// Configuration conf = MyConfiguration.createWithDB();
// int res = ToolRunner.run(conf, new InviteInfoCountJob(), args);
// System.exit(res);
}
public final int run(final String[] args)
throws IOException, InterruptedException, ClassNotFoundException, SQLException {
// Configuration conf = MyConfiguration.createWithDB();
Configuration conf = super.getConf();
String inputPath = args[0]; // input path, e.g. taken from the command-line arguments
Job job = Job.getInstance(conf, "dms_invite_info_count");
job.setJarByClass(InviteInfoCountJob.class);
job.setMapperClass(InviteInfoCountMapper.class);
// Set the map output key/value types
job.setMapOutputKeyClass(InviteInfoWritable.class);
job.setMapOutputValueClass(Text.class);
// Set the input path; addInputPath may be called multiple times
FileInputFormat.addInputPath(job, new Path(inputPath));
// Recurse into subdirectories
FileInputFormat.setInputDirRecursive(job, true);
// Write the output to MySQL
job.setReducerClass(InviteInfoCountReducer.class);
job.setOutputFormatClass(DBOutputFormat.class);
DBOutputFormat.setOutput(job, "dms_invite_info_count", "id", "account", "username", "department_name", "province_id", "count", "count_date"); // the arguments after the table name are its column names
job.setNumReduceTasks(7);// number of parallel reduce tasks (independent of the column count)
boolean result = job.waitForCompletion(true);
logger.info("job {} is {}!", job.getJobName(), result ? "success" : "failed");
return result ? 0 : 1;
}
}
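One caveat about the DBOutputFormat setup above: the reduce output key is what gets written to the database, and DBOutputFormat requires that class to also implement org.apache.hadoop.mapreduce.lib.db.DBWritable. The InviteInfoWritable listing in section 2 omits that part (it only shows the key fields), so the sketch below is an assumption about what the full class would need; the statement indexes must line up with the column list passed to DBOutputFormat.setOutput():

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;

// Added to InviteInfoWritable, which would also declare "implements DBWritable"
public void write(PreparedStatement statement) throws SQLException {
    // Column order: id, account, username, department_name, province_id, count, count_date
    statement.setString(1, getId());
    statement.setString(2, getAccount());
    statement.setString(3, getUsername());
    statement.setString(4, getDepartmentName());
    statement.setString(5, getProvinceId());
    statement.setInt(6, getCount());
    statement.setTimestamp(7, new Timestamp(getCountDate().getTime()));
}

public void readFields(ResultSet resultSet) throws SQLException {
    // Only needed when reading with DBInputFormat; unused for output
}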
4. MRUnit unit test case
import java.io.IOException;
import java.text.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;
import com.dzmsoft.dsj.hadoop.job.InviteInfoCountJob;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;
public class JobMrUnitTest {
Configuration conf;
MapDriver<LongWritable, Text, InviteInfoWritable, Text> mapDriver;
ReduceDriver<InviteInfoWritable, Text, InviteInfoWritable, Text> reduceDriver;
MapReduceDriver<LongWritable, Text, InviteInfoWritable, Text, InviteInfoWritable, Text> mapReduceDriver;
@Before
public void init(){
conf = new Configuration();
InviteInfoCountJob.InviteInfoCountMapper mapper = new InviteInfoCountJob.InviteInfoCountMapper();
InviteInfoCountJob.InviteInfoCountReducer reducer = new InviteInfoCountJob.InviteInfoCountReducer();
mapDriver = MapDriver.newMapDriver(mapper);
reduceDriver = ReduceDriver.newReduceDriver(reducer);
mapReduceDriver = MapReduceDriver.newMapReduceDriver(mapper, reducer);
}
@Test
public void test_mapper() throws IOException{
String text = "{\"business\":\"wcnInviteInfoUpload\",\"params\":{\"username\":\"wanghui\",\"provinceId\":\"789\",\"subscribe_time\":\"2017-02-10 00:04:28\",\"openId\":\"ou7nSs1fX2IWh3iXBGYbTMVxDQy2\"}}";
mapDriver.withInput(new LongWritable(), new Text(text));
// The grouping key is account + countDate; provinceId is carried along for later use
InviteInfoWritable outkey = new InviteInfoWritable();
outkey.setAccount("wanghui");
outkey.setProvinceId("789");
try {
outkey.setCountDate(DateUtil.parseDate("2017-02-10", "yyyy-MM-dd"));
} catch (ParseException e) {
e.printStackTrace();
}
// Assert the expected output
mapDriver.withOutput(outkey, new Text("ou7nSs1fX2IWh3iXBGYbTMVxDQy2"));
// Run the mapper
mapDriver.runTest();
}
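// A second, negative test is cheap to add: when username is blank the mapper
// should emit nothing, and runTest() with no expected output registered
// verifies exactly that (assuming StringUtil.isBlank treats "" as blank).
@Test
public void test_mapper_blank_username() throws IOException {
String text = "{\"business\":\"wcnInviteInfoUpload\",\"params\":{\"username\":\"\",\"provinceId\":\"789\",\"subscribe_time\":\"2017-02-10 00:04:28\",\"openId\":\"ou7nSs1fX2IWh3iXBGYbTMVxDQy2\"}}";
mapDriver.withInput(new LongWritable(), new Text(text));
// No withOutput() calls: the test fails if the mapper emits anything
mapDriver.runTest();
}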
}