练习题:检索出每个二级域名的总流量(总上行流量+总下行流量)
数据:
18878724052 http://www.edu360.cn 10047 11344
15522957721 http://weibo.com/?category=1760 18751 5605
18133396282 https://image.baidu.com 12276 9411
15565139582 http://v.baidu.com/tv 16365 18364
13298997507 http://v.baidu.com/tv 3990 12628
17782703107 https://zhidao.baidu.com/question/1430480451137504979.html 19315 6208
13962416105 http://v.baidu.com/tv 10018 11690
17322020441 http://v.baidu.com/tv 2491 1642
15838846957 http://v.baidu.com/tv 12560 12462
15176696561 http://music.baidu.com 15791 11250
18999082003 http://www.edu360.cn 6301 3592
18550717678 https://www.jianshu.com/p/bb88f7111b9e 4282 16177
15606174825 http://movie.youku.com 7553 1058
18234569039 http://weibo.com/?category=1760 9414 15360
17562510178 http://blog.csdn.net/article/details/18565522 9368 7875
14925717279 https://image.baidu.com 18462 1302
17124924089 http://www.edu360.cn 8540 18133
13304504348 https://image.baidu.com 397 16781
实现代码:
package nuc.edu.ls;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Writable value type carrying a record's upstream/downstream traffic in
 * bytes plus their precomputed total. Custom types emitted between Hadoop
 * map and reduce stages must implement Hadoop's {@link Writable}
 * serialization contract.
 */
public class FlowBean implements Writable {
	private long up;   // upstream (sent) bytes
	private long down; // downstream (received) bytes
	private long sum;  // up + down, cached so reducers/output need not recompute

	/**
	 * Deserializes the fields. The read order MUST mirror the write order
	 * in {@link #write(DataOutput)} exactly.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		up = in.readLong();
		down = in.readLong();
		sum = in.readLong();
	}

	/**
	 * Serializes the fields. The write order MUST mirror the read order
	 * in {@link #readFields(DataInput)} exactly.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(up);
		out.writeLong(down);
		out.writeLong(sum);
	}

	public long getUp() {
		return up;
	}

	public void setUp(long up) {
		this.up = up;
	}

	public long getDown() {
		return down;
	}

	public void setDown(long down) {
		this.down = down;
	}

	public long getSum() {
		return sum;
	}

	/**
	 * Sets the total. FIX: the original ignored the {@code sum} argument and
	 * assigned {@code up + down} instead, silently discarding the caller's
	 * value; a setter must honor its parameter.
	 */
	public void setSum(long sum) {
		this.sum = sum;
	}

	/** Convenience constructor; derives {@code sum} from the two parts. */
	public FlowBean(long up, long down) {
		super();
		this.up = up;
		this.down = down;
		this.sum = up + down;
	}

	/** No-arg constructor required by Hadoop's reflective instantiation. */
	public FlowBean() {
		super();
	}

	/** Tab-separated output form used by the job's text output format. */
	@Override
	public String toString() {
		return up + "\t" + down + "\t" + sum;
	}
}
package nuc.edu.ls;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job: totals upstream + downstream traffic per host
 * (second-level domain) extracted from each log line's URL.
 */
public class FlowMR {

	public static class MapTask extends Mapper<LongWritable, Text, Text, FlowBean> {

		// Compiled once per JVM (the original recompiled it on every map call).
		// Captures the host: everything after "//" up to the next '/', '?' or
		// '#'. FIX: the original ".*?//(.*?)/.*?" required a slash AFTER the
		// host, so path-less URLs such as http://www.edu360.cn never matched
		// and fell into a fallback that split the byte-offset key on "//" —
		// an ArrayIndexOutOfBoundsException silently swallowed by the empty
		// catch, dropping those records entirely.
		private static final Pattern HOST = Pattern.compile("//([^/?#]+)");

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
				throws IOException, InterruptedException {
			try {
				// Fields: phone, url, up, down — split on any run of whitespace
				// so both tab- and space-delimited input parse identically.
				String[] fields = value.toString().trim().split("\\s+");
				String url = fields[1];
				long up = Long.parseLong(fields[2]);
				long down = Long.parseLong(fields[3]);

				Matcher matcher = HOST.matcher(url);
				if (matcher.find()) {
					context.write(new Text(matcher.group(1)), new FlowBean(up, down));
				}
				// No "//" in the URL: no host to attribute the traffic to;
				// count it below instead of guessing.
				else {
					context.getCounter("FlowMR", "NO_HOST_IN_URL").increment(1);
				}
			} catch (Exception e) {
				// Malformed line (too few fields / non-numeric traffic).
				// Skip it, but keep it visible in the job counters instead
				// of swallowing it without a trace as the original did.
				context.getCounter("FlowMR", "BAD_RECORDS").increment(1);
			}
		}
	}

	public static class ReduceTask extends Reducer<Text, FlowBean, Text, FlowBean> {

		/** Sums the per-record traffic for one host. */
		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,
				Reducer<Text, FlowBean, Text, FlowBean>.Context context) throws IOException, InterruptedException {
			long up = 0;
			long down = 0;
			for (FlowBean flowBean : values) {
				up += flowBean.getUp();
				down += flowBean.getDown();
			}
			// The incoming key can be written directly; wrapping it in a new
			// Text (as the original did) only allocated garbage.
			context.write(key, new FlowBean(up, down));
		}
	}

	/** Configures and submits the job; exits non-zero output on failure. */
	public static void main(String[] args) throws Exception {
		System.setProperty("HADOOP_USER_NAME", "root");
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "eclipseToCluster");
		job.setMapperClass(MapTask.class);
		job.setReducerClass(ReduceTask.class);
		job.setJarByClass(FlowMR.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		FileInputFormat.addInputPath(job, new Path("d:/http.log"));
		FileOutputFormat.setOutputPath(job, new Path("d:/http/"));
		boolean completion = job.waitForCompletion(true);
		System.out.println(completion ? 0 : 1);
	}
}
测试结果: