Today I suddenly noticed that I'd never written up a proper MapReduce job on this blog, so here is a TopK-URL flow example.
Mapper
package club.drguo.hadoop.mapreduce.topkurl;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKeyURLMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    // Reuse the same output objects across map() calls instead of
    // allocating a new pair for every input record.
    private FlowBean bean = new FlowBean();
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] strings = StringUtils.split(line, "\t");
        try {
            // Field 26 holds the url, fields 30/31 the up/down flow.
            if (strings.length > 32 && StringUtils.isNotEmpty(strings[26]) && strings[26].startsWith("http")) {
                String url = strings[26];
                long up_flow = Long.parseLong(strings[30]);
                long down_flow = Long.parseLong(strings[31]);
                k.set(url);
                bean.set(up_flow, down_flow);
                context.write(k, bean);
            }
        } catch (Exception e) {
            // Dirty records that fail to parse are simply skipped.
            System.out.println("TopKeyURLMapper: bad record skipped");
        }
    }
}
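A note on the input format: the guard in map() expects log lines with at least 33 tab-separated fields, where field 26 holds the visited url and fields 30/31 hold the upstream/downstream byte counts. Since org.apache.commons.lang.StringUtils.split skips empty tokens, records with empty columns come out with fewer, shifted fields and fail the length check. Here is a tiny standalone check against a made-up line; the field layout is taken from the code above, the values are invented:

public class InputLineCheck {
    public static void main(String[] args) {
        // Build a made-up 33-field record matching the layout the Mapper expects.
        String[] fields = new String[33];
        java.util.Arrays.fill(fields, "1");
        fields[26] = "http://example.com"; // url
        fields[30] = "1024";               // up_flow
        fields[31] = "4096";               // down_flow
        String line = String.join("\t", fields);

        // Same guard as in TopKeyURLMapper.map().
        String[] strings = org.apache.commons.lang.StringUtils.split(line, "\t");
        boolean accepted = strings.length > 32
                && org.apache.commons.lang.StringUtils.isNotEmpty(strings[26])
                && strings[26].startsWith("http");
        System.out.println(accepted); // prints true
    }
}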
Reducer
package club.drguo.hadoop.mapreduce.topkurl;

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKeyURLReducer extends Reducer<Text, FlowBean, Text, LongWritable> {
    // Declared as a field: if it were created inside reduce(), a new
    // TreeMap would be built for every url.
    private TreeMap<FlowBean, Text> treeMap = new TreeMap<>();
    // Total flow across all urls.
    private double globalCount = 0;

    // Input: <url, {bean, bean...}>
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        // Hadoop reuses the key object between calls, so copy it
        // before storing it in the TreeMap.
        Text url = new Text(key.toString());
        long up_sum = 0;
        long down_sum = 0;
        for (FlowBean bean : values) {
            up_sum += bean.getUp_flow();
            down_sum += bean.getDown_flow();
        }
        FlowBean bean = new FlowBean(up_sum, down_sum);
        globalCount += bean.getSum_flow();
        // The TreeMap keeps its entries sorted by FlowBean, i.e. by flow.
        treeMap.put(bean, url);
    }

    // cleanup() is called once just before the reduce task exits;
    // setup() is its counterpart, called once at the start.
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        Set<Entry<FlowBean, Text>> entrySet = treeMap.entrySet();
        double tempCount = 0;
        for (Entry<FlowBean, Text> ent : entrySet) {
            // Only write out the urls that together account for 80% of the total flow.
            if (tempCount / globalCount < 0.8) {
                context.write(ent.getValue(), new LongWritable(ent.getKey().getSum_flow()));
                tempCount += ent.getKey().getSum_flow();
            } else {
                return;
            }
        }
    }
}
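FlowBean
The Mapper and Reducer both depend on a FlowBean class that this post doesn't show. Below is a minimal sketch reconstructed from the calls above (set(), getUp_flow(), getDown_flow(), getSum_flow() and the (up, down) constructor); the original may differ in details. It has to implement WritableComparable rather than plain Writable, because the Reducer's TreeMap sorts on it; this compareTo() orders by total flow descending and never returns 0, so two urls with equal flow don't overwrite each other in the map.

package club.drguo.hadoop.mapreduce.topkurl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {
    private long up_flow;
    private long down_flow;
    private long sum_flow;

    // Hadoop needs a no-arg constructor for deserialization.
    public FlowBean() {
    }

    public FlowBean(long up_flow, long down_flow) {
        set(up_flow, down_flow);
    }

    public void set(long up_flow, long down_flow) {
        this.up_flow = up_flow;
        this.down_flow = down_flow;
        this.sum_flow = up_flow + down_flow;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public long getDown_flow() {
        return down_flow;
    }

    public long getSum_flow() {
        return sum_flow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(up_flow);
        out.writeLong(down_flow);
        out.writeLong(sum_flow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        up_flow = in.readLong();
        down_flow = in.readLong();
        sum_flow = in.readLong();
    }

    // Descending by total flow, and never 0, so the TreeMap iterates
    // from the largest consumer down without dropping ties.
    @Override
    public int compareTo(FlowBean o) {
        return sum_flow > o.sum_flow ? -1 : 1;
    }
}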
Runner
package club.drguo.hadoop.mapreduce.topkurl;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// club.drguo.hadoop.mapreduce.topkurl.TopKeyURLRunner
public class TopKeyURLRunner extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner instead of creating a new one.
        Configuration configuration = getConf();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(TopKeyURLRunner.class);

        job.setMapperClass(TopKeyURLMapper.class);
        job.setReducerClass(TopKeyURLReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, "hdfs://ns1/flow/srclog");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://ns1/flow/topkurl"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new TopKeyURLRunner(), args);
        System.exit(res);
    }
}
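To launch the job, package these classes together with FlowBean into a jar and submit it through the hadoop client; the jar name below is only an assumption, and the main class is the one noted in the Runner's comment:

hadoop jar flow.jar club.drguo.hadoop.mapreduce.topkurl.TopKeyURLRunner

The result can then be inspected with hadoop fs -cat /flow/topkurl/part-r-00000 (part-r-00000 being the default name of the first reducer's output file).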