使用Hive或者自定义MR实现如下逻辑
product_no lac_id moment start_time user_id county_id staytime city_id
13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 571
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 571
13429100082 22691 8 2013-03-11 08:57:32.151754088 571 571 287 571
13429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 571
13429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571
字段解释:
product_no:用户手机号;
lac_id:用户所在基站;
start_time:用户在此基站的开始时间;
staytime:用户在此基站的逗留时间。
需求描述:
根据lac_id和start_time可确定用户当时所在的位置,根据staytime可知用户在各基站的逗留时长。按时间轨迹,将同一用户连续出现在同一基站的多条记录的staytime合并累加。
最终得到每一个用户按时间排序在每一个基站驻留时长
期望输出举例:
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
结果
product_no lac_id moment start_time user_id county_id staytime city_id
13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 571
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 571
13429100082 22691 8 2013-03-11 08:57:32.151754088 571 571 287 571
13429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 571
13429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571
字段解释:
product_no:用户手机号;
lac_id:用户所在基站;
start_time:用户在此基站的开始时间;
staytime:用户在此基站的逗留时间。
需求描述:
根据lac_id和start_time可确定用户当时所在的位置,根据staytime可知用户在各基站的逗留时长。按时间轨迹,将同一用户连续出现在同一基站的多条记录的staytime合并累加。
最终得到每一个用户按时间排序在每一个基站驻留时长
期望输出举例:
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
思路:自定义输出key类型为product_no+start_time,先按product_no排序,product_no相同的按start_time排序
自定义分组,把相同product_no的归为一组
组内lac_id相邻的staytime会相加
package hivemr;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job that, per phone number (product_no), sorts base-station
 * records by start_time (descending) and merges consecutive records that
 * share the same lac_id, summing their staytime.
 *
 * Pipeline: MyMapper emits a composite key (product_no + start_time) so the
 * shuffle sorts each user's records by time; MyGroupingComparator groups on
 * product_no only, so one reduce() call sees a user's whole, time-ordered
 * trajectory; MyReducer merges adjacent runs of the same base station.
 */
public class HiveMR {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), HiveMR.class.getSimpleName());
        job.setJarByClass(HiveMR.class);
        // NOTE(review): the original reads input/output from args[1]/args[2],
        // not the more common args[0]/args[1]. Kept as-is so existing launch
        // scripts keep working — confirm this indexing is intentional.
        FileInputFormat.setInputPaths(job, new Path(args[1]));
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(MyWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setGroupingComparatorClass(MyGroupingComparator.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        // Bug fix: the original discarded the completion status; propagate it
        // so callers/schedulers can detect a failed job.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Groups map-output keys by product_no only (the first serialized long,
     * i.e. the first 8 bytes), ignoring start_time, so all records of one
     * phone number reach a single reduce() call while still arriving sorted
     * by start_time.
     *
     * Bug fix: the original implemented raw RawComparator with a
     * compare(Object, Object) that unconditionally returned 0; extending
     * WritableComparator gives a correct object-level comparison as well.
     */
    public static class MyGroupingComparator extends WritableComparator {
        protected MyGroupingComparator() {
            super(MyWritable.class, true); // true: instantiate keys for object compare
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            // Compare only the leading long (product_no); skip start_time.
            return compareBytes(b1, s1, 8, b2, s2, 8);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return Long.compare(((MyWritable) a).product_no, ((MyWritable) b).product_no);
        }
    }

    /**
     * Parses one tab-separated input line and emits
     * (product_no + start_time composite key, original record) so the shuffle
     * sorts each user's records by time.
     *
     * Input columns:
     * product_no lac_id moment start_time user_id county_id staytime city_id
     */
    public static class MyMapper extends Mapper<LongWritable, Text, MyWritable, Text> {
        private final MyWritable k2 = new MyWritable();
        private final Text v2 = new Text(); // reused to avoid a per-record allocation

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, MyWritable, Text>.Context context)
                throws IOException, InterruptedException {
            String[] f = value.toString().split("\t");
            k2.set(f[0], f[3]); // product_no, start_time
            v2.set(f[0] + "\t" + f[1] + "\t" + f[2] + "\t" + f[3] + "\t"
                    + f[4] + "\t" + f[5] + "\t" + f[6] + "\t" + f[7]);
            context.write(k2, v2);
        }
    }

    /**
     * Receives one user's records sorted by start_time descending (newest
     * first) and merges consecutive records with the same lac_id into a
     * single output line whose staytime is the sum of the run and whose
     * start_time is the earliest record of the run.
     */
    public static class MyReducer extends Reducer<MyWritable, Text, Text, Text> {
        @Override
        protected void reduce(MyWritable k2, Iterable<Text> v2s,
                Reducer<MyWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] prev = null; // last record of the run currently being accumulated
            String start = null;  // start_time to report for the current run
            long time = 0;        // accumulated staytime of the current run (long: no int overflow)

            for (Text text : v2s) {
                // cur: product_no, lac_id, moment, start_time, user_id, county_id, staytime, city_id
                String[] cur = text.toString().split("\t");
                if (prev == null) {
                    // First record of the group: start the first run.
                    prev = cur;
                    start = cur[3];
                    time = Long.parseLong(cur[6]);
                } else if (cur[1].equals(prev[1])) {
                    // Same base station as the previous record: extend the run.
                    // Records arrive newest-first, so the run's reported
                    // start_time is the latest-seen (earliest) start_time.
                    time += Long.parseLong(cur[6]);
                    start = cur[3];
                    prev = cur;
                } else {
                    // Base station changed: flush the finished run.
                    // Bug fix: the original emitted the NEW record's
                    // moment/user_id/county_id/city_id for the OLD run;
                    // use the flushed run's own fields instead.
                    emit(context, prev, start, time);
                    prev = cur;
                    start = cur[3];
                    time = Long.parseLong(cur[6]);
                }
            }
            if (prev != null) { // flush the last run; guards against an empty group
                emit(context, prev, start, time);
            }
        }

        /** Writes one merged run: key=product_no, value=lac_id\tmoment\tstart\tuser_id\tcounty_id\tstaytime\tcity_id. */
        private static void emit(Context context, String[] rec, String start, long time)
                throws IOException, InterruptedException {
            context.write(new Text(rec[0]), new Text(rec[1] + "\t" + rec[2] + "\t" + start
                    + "\t" + rec[4] + "\t" + rec[5] + "\t" + time + "\t" + rec[7]));
        }
    }

    /**
     * Composite map-output key: product_no + start_time (epoch millis,
     * fractional seconds discarded). Sorts by product_no ascending, then
     * start_time DESCENDING so each reduce group sees a user's records
     * newest-first. MyGroupingComparator groups on product_no only.
     */
    public static class MyWritable implements WritableComparable<MyWritable> {
        public long product_no;
        public long start_time;
        // SimpleDateFormat is not thread-safe, but each task owns its own
        // MyWritable instance, so per-instance use is safe here.
        private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(product_no);
            out.writeLong(start_time);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.product_no = in.readLong();
            this.start_time = in.readLong();
        }

        /**
         * Sets both fields from their textual form.
         *
         * @param product_no 11-digit phone number, e.g. "13429100031"
         * @param start_time "yyyy-MM-dd HH:mm:ss.nnnnnnnnn"; the fractional
         *        part after '.' is discarded before parsing
         * @throws IllegalArgumentException if start_time cannot be parsed.
         *         Bug fix: the original swallowed ParseException with
         *         printStackTrace, silently keeping the previous record's
         *         start_time in this reused key object.
         */
        public void set(String product_no, String start_time) {
            this.product_no = Long.parseLong(product_no);
            try {
                this.start_time = sdf.parse(start_time.split("\\.")[0]).getTime();
            } catch (ParseException e) {
                throw new IllegalArgumentException("unparseable start_time: " + start_time, e);
            }
        }

        @Override
        public int compareTo(MyWritable o) {
            if (this.product_no != o.product_no) {
                // Bug fix: the original cast a long difference to int, which
                // truncates/overflows for 11-digit phone numbers and can
                // report the wrong order. Long.compare is overflow-safe.
                return Long.compare(this.product_no, o.product_no);
            }
            // Descending start_time: newest record first within a user.
            return Long.compare(o.start_time, this.start_time);
        }

        @Override
        public String toString() {
            return this.product_no + "\t" + this.start_time;
        }
    }
}
结果