一道hadoop面试题

使用Hive或者自定义MR实现如下逻辑
product_no      lac_id  moment  start_time      user_id county_id       staytime        city_id
13429100031     22554   8       2013-03-11 08:55:19.151754088   571     571     282     571
13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571
13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     103     571
13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571
13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571
13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571
13429100140     26642   9       2013-03-11 09:02:19.151754088   571     571     18      571
13429100082     22691   8       2013-03-11 08:57:32.151754088   571     571     287     571
13429100189     22558   8       2013-03-11 08:56:24.139539816   571     571     48      571
13429100349     22503   8       2013-03-11 08:54:30.152622440   571     571     211     571
字段解释:
product_no:用户手机号;
lac_id:用户所在基站;
start_time:用户在此基站的开始时间;
staytime:用户在此基站的逗留时间。

需求描述:
根据lac_id和start_time知道用户当时的位置,根据staytime知道用户各个基站的逗留时长。根据轨迹合并连续基站的staytime。
最终得到每一个用户的记录按时间排序后,在每一个连续基站上的驻留时长。

期望输出举例:
13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571
13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     390     571
13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571
13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571

13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571


思路:自定义输出key类型为product_no+start_time,先按product_no升序排序,product_no相同的按start_time降序排序(与期望输出一致,记录从新到旧排列)

 自定义分组,把相同product_no的归为一组

 组内lac_id相邻的staytime会相加

 

package hivemr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HiveMR {

	/**
	 * Driver: configures a job that sorts each user's records by start_time
	 * (descending) and merges the staytime of consecutive records on the same
	 * base station (lac_id).
	 */
	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration(),
				HiveMR.class.getSimpleName());
		job.setJarByClass(HiveMR.class);

		// NOTE(review): input/output paths come from args[1]/args[2], not
		// args[0]/args[1] — kept as-is for compatibility; confirm the launch
		// script really passes the paths at these positions.
		FileInputFormat.setInputPaths(job, new Path(args[1]));
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(MyWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Group reducer input by product_no only, ignoring start_time, so one
		// reduce() call sees all records of one phone number.
		job.setGroupingComparatorClass(MyGroupingComparator.class);

		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileOutputFormat.setOutputPath(job, new Path(args[2]));

		// Propagate job success/failure through the process exit code
		// (the original discarded the boolean result).
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/**
	 * Groups map output by product_no alone, ignoring start_time.
	 *
	 * <p>{@link MyWritable#write} serializes product_no first as a long
	 * (8 bytes), so comparing only the leading 8 serialized bytes compares
	 * product_no and nothing else.
	 */
	private static class MyGroupingComparator implements RawComparator<MyWritable> {

		public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
			return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
		}

		// Object-level comparison kept consistent with the byte-level one
		// (the original returned a constant 0 here).
		public int compare(MyWritable o1, MyWritable o2) {
			return Long.compare(o1.product_no, o2.product_no);
		}
	}

	/**
	 * Emits each input line keyed by (product_no, start_time) so the shuffle
	 * delivers records to the reducer grouped per user and sorted by time,
	 * with the full original record as the value.
	 */
	public static class MyMapper extends
			Mapper<LongWritable, Text, MyWritable, Text> {

		// Reused across calls; context.write() serializes immediately, so
		// mutating them on the next record is safe.
		private final MyWritable k2 = new MyWritable();
		private final Text v2 = new Text();

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, MyWritable, Text>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			// Expected layout:
			// product_no lac_id moment start_time user_id county_id staytime city_id
			String[] split = line.split("\t");
			if (split.length < 8) {
				return; // drop malformed lines instead of throwing AIOOBE
			}

			k2.set(split[0], split[3]); // product_no, start_time

			// The original rebuilt the line from its own fields, producing an
			// identical string; forward the line directly instead.
			v2.set(line);
			context.write(k2, v2);
		}
	}

	/**
	 * Merges consecutive records on the same base station.
	 *
	 * <p>Records arrive sorted by start_time descending (see
	 * {@link MyWritable#compareTo}), all for one product_no (see
	 * {@link MyGroupingComparator}). Adjacent records sharing a lac_id are one
	 * continuous stay: their staytime values are summed and the earliest
	 * start_time (the last one seen under descending order) is kept.
	 */
	public static class MyReducer extends Reducer<MyWritable, Text, Text, Text> {

		@Override
		protected void reduce(MyWritable k2, Iterable<Text> v2s,
				Reducer<MyWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {

			String productNo = null;
			// Fields of the group currently being merged ("pending" group).
			String prevLac = null;
			String prevMoment = null;
			String prevStart = null;
			String prevUser = null;
			String prevCounty = null;
			String prevCity = null;
			long mergedTime = 0;

			for (Text text : v2s) {
				String[] f = text.toString().split("\t");
				if (f.length < 8) {
					continue; // defensive; mapper already filters these
				}
				productNo = f[0];

				if (f[1].equals(prevLac)) {
					// Same base station as the previous record: extend the stay.
					mergedTime += Long.parseLong(f[6]);
					// Under descending time order the current record is older,
					// so its start_time becomes the group's start.
					prevStart = f[3];
				} else {
					if (prevLac != null) {
						// Flush the finished group using ITS OWN fields.
						// (The original emitted the current record's
						// moment/user_id/county_id/city_id here, and never
						// reset `start` in this branch, so a later flush could
						// carry a stale start_time from an earlier group.)
						emit(context, productNo, prevLac, prevMoment, prevStart,
								prevUser, prevCounty, mergedTime, prevCity);
					}
					prevLac = f[1];
					mergedTime = Long.parseLong(f[6]);
					prevStart = f[3];
				}
				prevMoment = f[2];
				prevUser = f[4];
				prevCounty = f[5];
				prevCity = f[7];
			}

			// Flush the last pending group.
			if (prevLac != null) {
				emit(context, productNo, prevLac, prevMoment, prevStart,
						prevUser, prevCounty, mergedTime, prevCity);
			}
		}

		// Writes one merged record in the input column order (minus product_no,
		// which becomes the output key).
		private void emit(Context context, String productNo, String lac,
				String moment, String start, String user, String county,
				long time, String city) throws IOException, InterruptedException {
			context.write(new Text(productNo), new Text(lac + "\t"
					+ moment + "\t" + start + "\t" + user
					+ "\t" + county + "\t" + time + "\t" + city));
		}
	}

	/**
	 * Composite map-output key: (product_no, start_time).
	 *
	 * <p>Sorts ascending by product_no, then DESCENDING by start_time within
	 * one product_no, so the reducer sees each user's records from newest to
	 * oldest. Because the sort involves start_time, records of one product_no
	 * would land in different reduce groups without the custom
	 * {@link MyGroupingComparator}.
	 */
	public static class MyWritable implements WritableComparable<MyWritable> {
		public long product_no;
		public long start_time;

		// SimpleDateFormat is not thread-safe; a per-instance copy is safe
		// because each map task is single-threaded.
		private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

		public void write(DataOutput out) throws IOException {
			// product_no MUST be serialized first: MyGroupingComparator
			// compares the leading 8 bytes of this encoding.
			out.writeLong(product_no);
			out.writeLong(start_time);
		}

		public void set(String product_no, String start_time) {
			this.product_no = Long.parseLong(product_no);
			try {
				// Strip the fractional-seconds suffix (".151754088") before parsing.
				this.start_time = sdf.parse(start_time.split("\\.")[0]).getTime();
			} catch (ParseException e) {
				// Fail loudly; the original swallowed the exception and
				// silently kept the previous record's timestamp in this
				// reused key object.
				throw new IllegalArgumentException(
						"unparseable start_time: " + start_time, e);
			}
		}

		public void readFields(DataInput in) throws IOException {
			this.product_no = in.readLong();
			this.start_time = in.readLong();
		}

		public int compareTo(MyWritable o) {
			if (this.product_no != o.product_no) {
				// Long.compare avoids the truncation/overflow of casting a
				// long difference to int (11-digit phone numbers exceed int).
				return Long.compare(this.product_no, o.product_no);
			}
			// Descending by start_time: newest record first.
			return Long.compare(o.start_time, this.start_time);
		}

		@Override
		public boolean equals(Object obj) {
			if (!(obj instanceof MyWritable)) {
				return false;
			}
			MyWritable other = (MyWritable) obj;
			return product_no == other.product_no && start_time == other.start_time;
		}

		@Override
		public int hashCode() {
			return Long.hashCode(product_no) * 31 + Long.hashCode(start_time);
		}

		@Override
		public String toString() {
			return this.product_no + "\t" + this.start_time;
		}
	}
}

结果



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值