MapReduce Programming Mini-Case 3: Full Sort of Total Website Visit Counts

Let's work through a small case with MapReduce. Below is a batch of website access records, one visit per line:

2018/05/11 qq.com/a
2018/05/11 qq.com/bx
2018/05/11 qq.com/by
2018/05/11 qq.com/by3
2018/05/11 qq.com/news
2018/05/11 sina.com/news/socail
2018/05/11 163.com/ac
2018/05/11 sina.com/news/socail
2018/05/11 163.com/sport
2018/05/11 163.com/ac
2018/05/11 sina.com/play
2018/05/11 163.com/sport
2018/05/11 163.com/ac
2018/05/11 sina.com/movie
2018/05/11 sina.com/play
2018/05/11 sina.com/movie
2018/05/11 163.com/sport
2018/05/11 sina.com/movie
2018/05/11 163.com/ac
2018/05/11 163.com/ac
2018/05/11 163.com/acc
2018/05/11 qq.com/by
2018/05/11 qq.com/by3
2018/05/11 qq.com/news
2018/05/11 163.com/sport
2018/05/11 sina.com/news/socail
2018/05/11 163.com/sport
2018/05/11 sina.com/movie
2018/05/11 sina.com/news/socail
2018/05/11 sina.com/movie
2018/05/11 qq.com/news
2018/05/11 163.com/bb
2018/05/11 163.com/cc
2018/05/11 sina.com/lady/
2018/05/11 163.com/cc
2018/05/11 qq.com/news
2018/05/11 qq.com/by
2018/05/11 qq.com/by3
2018/05/11 sina.com/lady/
2018/05/11 qq.com/by3
2018/05/11 sina.com/lady/
2018/05/11 qq.com/by3
2018/05/11 qq.com/news
2018/05/11 qq.com/by3
2018/05/11 163.com/sport
2018/05/11 163.com/sport
2018/05/11 sina.com/news/socail
2018/05/11 sina.com/lady/
2018/05/11 sina.com/play
2018/05/11 sina.com/movie
2018/05/11 sina.com/music
2018/05/11 sina.com/sport
2018/05/11 sina.com/sport
2018/05/11 163.com/sport
2018/05/11 sina.com/news/socail
2018/05/11 sohu.com/lady/
2018/05/11 sohu.com/play
2018/05/11 sohu.com/movie
2018/05/11 sohu.com/music
2018/05/11 sohu.com/sport
2018/05/11 sohu.com/sport
2018/05/11 sina.com/news/socail
2018/05/11 baidu.com/lady/
2018/05/11 baidu.com/play
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/music
2018/05/11 baidu.com/movie
2018/05/11 baidu.com/music
2018/05/11 baidu.com/sport
2018/05/11 baidu.com/sport

The design of this case is slightly clever: it needs to be split into two MapReduce jobs. Step 1 aggregates the total visit count per page; step 2 then wraps each (page, count) pair in a custom sortable key and runs it through a single reducer, which is what makes the final ordering global. Start by creating a PageCountStep1 class:

package cn.edu360.mr.page.count.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageCountStep1 {
	
	public static class PageCountStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			
			// each input line looks like "2018/05/11 qq.com/a": emit (url, 1)
			String line = value.toString();
			String[] split = line.split(" ");
			context.write(new Text(split[1]), new IntWritable(1));

		}
	}
	
	public static class PageCountStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
                
			// add up the 1s emitted for this page
			int count = 0;
			
			for (IntWritable value : values) {
				count += value.get();
				
			}
			context.write(key, new IntWritable(count));
		}
	}
	
	
	public static void main(String[] args) throws Exception {
		
		/*
		 * Configuration parameters are resolved from the *-site.xml files on the classpath
		 */
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(PageCountStep1.class);
		
		job.setMapperClass(PageCountStep1Mapper.class);
		job.setReducerClass(PageCountStep1Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\input"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\countout"));
		
		// step 1 can safely use multiple reducers: each page key goes to exactly one of them
		job.setNumReduceTasks(3);
		job.waitForCompletion(true);
		
		
	}

}
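
Running step 1 against the input above produces the countout directory with three part files (one per reduce task). Each output line is a page followed by its total count, separated by a tab, because TextOutputFormat's default key/value separator is a tab; that is exactly what step 2's mapper splits on. As a rough illustration, a manual tally of the sample data gives lines such as the following (worth verifying against your own run):

163.com/ac	5
qq.com/news	5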

Next, create a PageCount class to encapsulate a page and its visit count. Here things are slightly different: because the object is transferred between the map and reduce phases, it must be serializable, implementing the corresponding read/write methods, and it also needs a Comparable implementation so the framework can sort by it. The framework designers anticipated this, so the class only needs to implement the WritableComparable interface:

package cn.edu360.mr.page.count.sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class PageCount implements WritableComparable<PageCount>{
	private String page;
	private int count;
	
	public void set(String page, int count) {
		this.page = page;
		this.count = count;
	}

	public String getPage() {
		return page;
	}

	public void setPage(String page) {
		this.page = page;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}

	public void readFields(DataInput in) throws IOException {
		this.page = in.readUTF();
		this.count = in.readInt();
		
	}

	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.page);
		out.writeInt(this.count);
		
	}

	// descending order by count; ties are broken by page name so distinct pages never compare as equal
	public int compareTo(PageCount o) {
		return o.getCount() - this.count == 0 ? this.page.compareTo(o.getPage()) : o.getCount() - this.count;
	}
	
	@Override
	public String toString() {
		return this.page + "," + this.count;
	}

}
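
The compareTo method above orders PageCount objects by count in descending order, falling back to the page name when counts tie; that comparison is what step 2's shuffle sorts by. As a quick sanity check, here is a minimal sketch (PageCountSortDemo is a hypothetical helper class and the numbers are made up purely for illustration):

package cn.edu360.mr.page.count.sort;

import java.util.Arrays;

// hypothetical helper, not part of the MapReduce jobs themselves
public class PageCountSortDemo {

	public static void main(String[] args) {
		PageCount a = new PageCount();
		a.set("sina.com/movie", 4);
		PageCount b = new PageCount();
		b.set("163.com/sport", 9);
		PageCount c = new PageCount();
		c.set("qq.com/news", 4);

		PageCount[] pages = { a, b, c };
		// Arrays.sort relies on PageCount.compareTo: higher counts come first,
		// and equal counts fall back to alphabetical order of the page name
		Arrays.sort(pages);

		// prints: 163.com/sport,9  then  qq.com/news,4  then  sina.com/movie,4
		for (PageCount p : pages) {
			System.out.println(p);
		}
	}
}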

Finally, create the class for the second step, PageCountStep2:

package cn.edu360.mr.page.count.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageCountStep2 {
	
	public static class PageCountStep2Mapper extends Mapper<LongWritable, Text, PageCount, NullWritable>{
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, PageCount, NullWritable>.Context context)
				throws IOException, InterruptedException {
              // step 1's output lines are "page<TAB>count" (TextOutputFormat's default separator)
              String[] split = value.toString().split("\t");
              
              PageCount pageCount = new PageCount();
              pageCount.set(split[0], Integer.parseInt(split[1]));
              
              context.write(pageCount, NullWritable.get());
		}
		
	}

	public static class PageCountStep2Reducer extends Reducer<PageCount, NullWritable, PageCount, NullWritable>{
		@Override
		protected void reduce(PageCount key, Iterable<NullWritable> values,
				Reducer<PageCount, NullWritable, PageCount, NullWritable>.Context context)
				throws IOException, InterruptedException {
             // keys arrive already sorted by PageCount.compareTo, so simply write them out
             context.write(key, NullWritable.get());
		}
	}
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(PageCountStep2.class);
		
		job.setMapperClass(PageCountStep2Mapper.class);
		job.setReducerClass(PageCountStep2Reducer.class);
		
		job.setMapOutputKeyClass(PageCount.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(PageCount.class);
		job.setOutputValueClass(NullWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\countout"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\sortcout"));
		
		// a single reduce task yields one globally sorted output file
		job.setNumReduceTasks(1);
		job.waitForCompletion(true);
	}
}
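
Because step 2 runs with a single reduce task, its output is one file listing every page from most visited to least visited. If you would rather launch both steps with one command, a driver along the following lines can chain them; PageCountDriver is a hypothetical name and it simply reuses the settings and hard-coded paths from the two classes above:

package cn.edu360.mr.page.count.sort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageCountDriver {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// step 1: aggregate the total visits per page
		Job step1 = Job.getInstance(conf, "page count step 1");
		step1.setJarByClass(PageCountStep1.class);
		step1.setMapperClass(PageCountStep1.PageCountStep1Mapper.class);
		step1.setReducerClass(PageCountStep1.PageCountStep1Reducer.class);
		step1.setMapOutputKeyClass(Text.class);
		step1.setMapOutputValueClass(IntWritable.class);
		step1.setOutputKeyClass(Text.class);
		step1.setOutputValueClass(IntWritable.class);
		FileInputFormat.setInputPaths(step1, new Path("F:\\mrdata\\url\\input"));
		FileOutputFormat.setOutputPath(step1, new Path("F:\\mrdata\\url\\countout"));
		step1.setNumReduceTasks(3);

		// only run step 2 if step 1 finished successfully
		if (!step1.waitForCompletion(true)) {
			System.exit(1);
		}

		// step 2: sort the aggregated counts globally
		Job step2 = Job.getInstance(conf, "page count step 2");
		step2.setJarByClass(PageCountStep2.class);
		step2.setMapperClass(PageCountStep2.PageCountStep2Mapper.class);
		step2.setReducerClass(PageCountStep2.PageCountStep2Reducer.class);
		step2.setMapOutputKeyClass(PageCount.class);
		step2.setMapOutputValueClass(NullWritable.class);
		step2.setOutputKeyClass(PageCount.class);
		step2.setOutputValueClass(NullWritable.class);
		FileInputFormat.setInputPaths(step2, new Path("F:\\mrdata\\url\\countout"));
		FileOutputFormat.setOutputPath(step2, new Path("F:\\mrdata\\url\\sortcout"));
		step2.setNumReduceTasks(1);

		System.exit(step2.waitForCompletion(true) ? 0 : 1);
	}
}

Step 2 is only submitted after step 1 succeeds, since it reads step 1's countout directory as its input.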

