MapReduce案例8——求最频繁访问数据表以及最频繁访问的用户和时长

最新推荐文章于 2021-06-07 18:03:27 发布

jin6872115

最新推荐文章于 2021-06-07 18:03:27 发布

阅读量1k

点赞数

分类专栏： MapReduce 文章标签： MapReduce 最频繁访问

本文链接：https://blog.csdn.net/jin6872115/article/details/79589746

版权

MapReduce 专栏收录该内容

17 篇文章 6 订阅

订阅专栏

题目：

user1	1:00	u1	1.5
user2	2:00	u2	0.5
user3	3:00	u3	0.1
user4	4:00	u1	1.4
user5	5:00	u4	1.3
user6	6:00	u4	1.9
user7	7:00	u5	2.4
user8	8:00	u1	0.1
user9	9:00	u6	0.6
user10	10:00	u1	0.5
user11	1:00	u2	0.2
user12	3:00	u4	0.9
user13	4:00	u2	9.1
user14	6:00	u1	6.1
user15	5:00	u5	5.1
user10	10:00	u2	0.4
user10	10:00	u3	0.4
user3	10:00	u2	0.4
user4	10:00	u2	0.4


用Hadoop分析海量日志文件，每行日志记录了如下数据：
TableName(表名)，Time(时间)，User(用户)，TimeSpan(时间开销)

要求编写MapReduce程序算出高峰时间段（如上午10点）哪张表被访问的最频繁，以及这段时间访问这张表最多的用户，以及这个用户的总时间开销。

思路：本题分为两部分：

第一：先求特定时间段哪张表被访问的最频繁？

处理方式：以表名和时间段作为key值，统计分组后的记录条数，即为访问次数，求取最大值即可。输出结果按照时间段分为不同文件输出。

第二：以表名、时间段和用户作为key值，统计分组后的记录条数，累加时间开销，求取记录最大值即可获得结果

由于本题给出的数据限制，本题以解题为主，结果仅做参考：

第一步代码：

分区代码（自定义 Partitioner，按时间段把结果分到不同的输出文件）：

/**
 * @author: lpj   
 * @date: 2018年3月16日 下午10:13:24
 * @Description:
 */
package lpj.reduceWorkbean;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that sends every (table, time slot) key to the reducer dedicated to
 * its time slot, so that each slot ends up in its own output file.
 *
 * <p>Keys are expected to look like {@code "<table>\t<time>[\t...]"}. The time slots
 * {@code "1:00"} to {@code "19:00"} map to partitions 0 to 18; every other key
 * (unrecognised slot, or no time field at all) goes to the catch-all partition 19.
 * The result is folded into the number of reducers actually configured, so the
 * class is also safe to use with fewer than 20 reduce tasks.
 */
public class MyPatitionerAccess extends Partitioner<Text, NullWritable>{

	/** Partition that collects every key whose time slot is not recognised. */
	private static final int CATCH_ALL_PARTITION = 19;

	/** Recognised slot labels; the label at index {@code i} belongs to partition {@code i}. */
	private static final String[] SLOT_LABELS = buildSlotLabels();

	/** Builds the labels "1:00", "2:00", ..., "19:00" once, instead of per record. */
	private static String[] buildSlotLabels() {
		String[] labels = new String[CATCH_ALL_PARTITION];
		for (int i = 0; i < labels.length; i++) {
			labels[i] = (i + 1) + ":00";
		}
		return labels;
	}

	/**
	 * Chooses the partition for one map output record.
	 *
	 * @param key map output key, e.g. {@code "user1\t1:00"}
	 * @param value unused
	 * @param numPartitions number of reduce tasks configured for the job
	 * @return the slot index (0 to 19) folded into {@code [0, numPartitions)}
	 */
	@Override
	public int getPartition(Text key, NullWritable value, int numPartitions) {
		String[] reads = key.toString().split("\t");
		// A key without a time field cannot be assigned to a slot; do not index past the array.
		int slot = reads.length > 1 ? slotOf(reads[1]) : CATCH_ALL_PARTITION;
		// Returning a partition >= numPartitions makes the framework reject the record,
		// so wrap around when the job runs with fewer reducers than there are slots.
		return numPartitions > 0 ? slot % numPartitions : slot;
	}

	/** Returns the partition of an exactly matching slot label, or the catch-all partition. */
	private static int slotOf(String time) {
		for (int i = 0; i < SLOT_LABELS.length; i++) {
			if (SLOT_LABELS[i].equals(time)) {
				return i;
			}
		}
		return CATCH_ALL_PARTITION;
	}

}

主体代码：

/**
 * @author: lpj   
 * @date: 2018年3月16日 下午7:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyPatitionerAccess;
/**
 * Step 1 of the exercise: counts the log records of every (table name, time slot)
 * pair, i.e. how often each table was accessed in each time slot.
 *
 * <p>Input lines have four tab-separated fields: table name, time, user, time cost.
 * Each output line is {@code "<table>\t<time>\t<count>"}. The job uses
 * {@link MyPatitionerAccess} with 20 reduce tasks, so it writes 20 output files.
 */
public class FrequentAccessMR {
	
	/**
	 * Configures and runs the job, then exits with status 0 on success and 1 on failure.
	 *
	 * @param args unused; input and output paths are hard-coded
	 * @throws Exception if the file system or the job submission fails
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");// use a configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");// run against the cluster
		FileSystem fs = FileSystem.get(conf);// local file system by default
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(FrequentAccessMR.class);
		job.setMapperClass(FrequentAccessMR_Mapper.class);
		job.setReducerClass(FrequentAccessMR_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		job.setPartitionerClass(MyPatitionerAccess.class);// one partition per time slot
		job.setNumReduceTasks(20);// one output file per partition
		
		Path inputPath = new Path("d:/a/homework8.txt");
		Path outputPath = new Path("d:/a/homework8");
		// The job refuses to start when the output directory already exists, so remove a
		// stale one. The check must look at the output path, not the input path.
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	
	/** Emits {@code "<table>\t<time>"} as the key for every well-formed log line. */
	public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
		/** Reused output key, to avoid allocating one object per record. */
		private final Text kout = new Text();

		/**
		 * @param key byte offset of the line (unused)
		 * @param value one log line, e.g. {@code "user4\t10:00\tu2\t0.4"}
		 * @param context receives the (table, time) key with a null value
		 */
		@Override
		protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
			String [] reads = value.toString().trim().split("\t");
			// Blank or truncated lines have no time field; skip them instead of failing the job.
			if (reads.length < 2) {
				return;
			}
			kout.set(reads[0] + "\t" + reads[1]);// table name + time slot
			context.write(kout, NullWritable.get());
		}
	}

	/** Counts the records of each (table, time) key and appends the count to the key. */
	public static class FrequentAccessMR_Reducer extends Reducer<Text, NullWritable, Text, NullWritable>{
		/** Reused output key, to avoid allocating one object per group. */
		private final Text kout = new Text();

		/**
		 * @param key {@code "<table>\t<time>"}
		 * @param values one entry per access; only their number matters
		 * @param context receives {@code "<table>\t<time>\t<count>"} with a null value
		 */
		@Override
		protected void reduce(Text key, Iterable<NullWritable> values, Context context)throws IOException, InterruptedException {
			int accessNum = 0;
			// Each value stands for one access of this table in this time slot.
			for (NullWritable ignored : values) {
				accessNum++;
			}
			kout.set(key.toString() + "\t" + accessNum);
			context.write(kout, NullWritable.get());
		}
		
	}

}

结果输出20个不同时间段的文件结果：

其中10:00的结果文件内容为

user10	10:00	5
user3	10:00	1
user4	10:00	1

即在10：00时间段内，user10的访问量最高

进行第二步：选取表名为user10、时间段为10:00的记录，以表名、时间段和用户的组合作为key值，统计分组后不同用户的访问次数以及开销时间

/**
 * @author: lpj   
 * @date: 2018年3月16日 下午7:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyPatitionerAccess;
/**
 * Step 2 of the exercise: for the busiest table and time slot found in step 1
 * (table {@code user10} at {@code 10:00}), counts the accesses of every user and
 * sums up that user's time cost.
 *
 * <p>Input lines have four tab-separated fields: table name, time, user, time cost.
 * Each output line is {@code "<table>\t<time>\t<user>\t<count>\t<total cost>"}.
 */
public class FrequentAccessMR2 {

	/** Table selected in step 1 as the most frequently accessed one. */
	private static final String TARGET_TABLE = "user10";

	/** Time slot analysed by this job. */
	private static final String TARGET_TIME = "10:00";
	
	/**
	 * Configures and runs the job, then exits with status 0 on success and 1 on failure.
	 *
	 * @param args unused; input and output paths are hard-coded
	 * @throws Exception if the file system or the job submission fails
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");// use a configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");// run against the cluster
		FileSystem fs = FileSystem.get(conf);// local file system by default
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(FrequentAccessMR2.class);
		job.setMapperClass(FrequentAccessMR_Mapper.class);
		job.setReducerClass(FrequentAccessMR_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// No custom partitioner here: MyPatitionerAccess is declared for NullWritable
		// values, while this job emits Text values, so it would fail with a
		// ClassCastException as soon as more than one reduce task is used.
		
		Path inputPath = new Path("d:/a/homework8.txt");
		Path outputPath = new Path("d:/a/homework8_2");
		// The job refuses to start when the output directory already exists, so remove a
		// stale one. The check must look at the output path, not the input path.
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	
	/** Keeps only the records of the target table and time slot, keyed by user. */
	public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, Text>{
		/** Reused output key and value, to avoid allocating objects per record. */
		private final Text kout = new Text();
		private final Text valueout = new Text();

		/**
		 * @param key byte offset of the line (unused)
		 * @param value one log line, e.g. {@code "user4\t10:00\tu2\t0.4"}
		 * @param context receives {@code "<table>\t<time>\t<user>"} with the time cost as value
		 */
		@Override
		protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
			String [] reads = value.toString().trim().split("\t");
			// Blank or truncated lines lack the user or cost field; skip them instead of
			// failing the job.
			if (reads.length < 4) {
				return;
			}
			if (reads[0].equals(TARGET_TABLE) && reads[1].equals(TARGET_TIME)) {
				kout.set(reads[0] + "\t" + reads[1] + "\t" + reads[2]);// group by user
				valueout.set(reads[3]);// time cost of this access
				context.write(kout, valueout);
			}
		}
	}

	/** Counts the accesses of each user and sums up the time cost. */
	public static class FrequentAccessMR_Reducer extends Reducer<Text, Text, Text, Text>{
		/** Reused output key and value, to avoid allocating objects per group. */
		private final Text kout = new Text();
		private final Text valueout = new Text();

		/**
		 * @param key {@code "<table>\t<time>\t<user>"}
		 * @param values the time cost of every access of that user, as decimal text
		 * @param context receives the key extended by the access count, with the summed cost as value
		 * @throws NumberFormatException if a time cost is not a valid decimal number
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
			int accessNum = 0;
			double sumtime = 0;
			// Every value is one access; accumulate both the count and the total cost.
			for (Text vin : values) {
				accessNum++;
				sumtime += Double.parseDouble(vin.toString());
			}
			kout.set(key.toString() + "\t" + accessNum);
			valueout.set(String.valueOf(sumtime));
			context.write(kout, valueout);
		}
		
	}

}

结果为：

user10	10:00	u1	3	0.8
user10	10:00	u2	1	0.4
user10	10:00	u3	1	0.4

即用户u1访问最频繁，时间总长为0.8

jin6872115

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
MapReduce案例8——求最频繁访问数据表以及最频繁访问的用户和时长

题目：user1 1:00 u1 1.5user2 2:00 u2 0.5user3 3:00 u3 0.1user4 4:00 u1 1.4user5 5:00 u4 1.3user6 6:00 u4 1.9user7 7:00 u5 2.4user8 8:00 u1 0.1user9 9:00 u6 0.6user10 10:00 u1 0.5user11 1:00 u2 ...
复制链接

扫一扫