访问次数最多的前十个搜索词
ThirdMapper.java
package com.hniu.bigdata.hadoop.Third;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Emits one click per log record, keyed by {@code keyword_domain}.
 *
 * <p>Each input line is split on single whitespace characters. Field 1 is the
 * search keyword, field 4 is the visited URL, and the last field is expected
 * to be a numeric timestamp (it was parsed with {@code Long.parseLong} in the
 * previous version of this class). The domain is the part of the URL before
 * its first {@code '/'}.
 *
 * <p>Records that are too short, or whose last field is not a number, used to
 * abort the whole task with an unchecked exception; they are now skipped and
 * counted in the {@code ThirdMapper / MALFORMED_RECORDS} counter.
 */
public class ThirdMapper extends Mapper<LongWritable, Text,Text, ThirdSortBean> {

    /** The URL is read from index 4, so at least five fields are required. */
    private static final int MIN_FIELDS = 5;

    private static final String COUNTER_GROUP = "ThirdMapper";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    // Reused across map() calls; context.write serializes them immediately.
    private final Text outKey = new Text();
    private final ThirdSortBean outValue = new ThirdSortBean();

    /**
     * Parses one log line and writes {@code (keyword_domain, bean with count 1)}.
     *
     * @param key     offset of the line in the input split (unused)
     * @param value   one raw log line
     * @param context task context used for output and counters
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\s");

        // Blank/truncated lines and lines without a numeric trailing timestamp
        // are treated as malformed instead of crashing the task.
        if (fields.length < MIN_FIELDS || !isLong(fields[fields.length - 1])) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        String keyword = fields[1];
        String domain = extractDomain(fields[4]);

        outKey.set(keyword + "_" + domain);
        outValue.setKeyWords(keyword);
        outValue.setDomain(domain);
        outValue.setTotal_click(1);
        context.write(outKey, outValue);
    }

    /** Returns the part of {@code url} before the first '/', or the whole string if there is none. */
    private static String extractDomain(String url) {
        int slash = url.indexOf('/');
        return slash < 0 ? url : url.substring(0, slash);
    }

    /** Returns true when {@code text} parses as a {@code long}. */
    private static boolean isLong(String text) {
        try {
            Long.parseLong(text);
            return true;
        } catch (NumberFormatException ignored) {
            // Not a number: the caller treats the record as malformed.
            return false;
        }
    }
}
ThirdReduce.java
package com.hniu.bigdata.hadoop.Third;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;
/**
 * Sums the clicks of every {@code keyword_domain} group and keeps the ten
 * keywords with the highest per-group click total.
 *
 * <p>A keyword appears at most once in the result: when several domains share
 * a keyword, only the group with the largest total is kept. The ranking is
 * held in memory per reducer instance and written out in {@link #cleanup},
 * so it is a global top ten only when the job runs with a single reduce task.
 *
 * <p>The previous implementation stored the ranking in a {@code TreeMap} keyed
 * by a bean that compares on click count alone, so two different keywords with
 * the same count overwrote each other. It also rebuilt the keyword from the
 * text key with {@code split("_")[0]}, which truncated keywords containing an
 * underscore. Both problems are avoided here by keeping an ordered list and
 * reading the keyword from the bean itself.
 */
public class ThirdReduce extends Reducer<Text,ThirdSortBean, Text, IntWritable> {

    /** Maximum number of keywords kept in the ranking. */
    private static final int TOP_N = 10;

    /** Current ranking, ordered by click total descending; never longer than TOP_N. */
    private final ArrayList<ThirdSortBean> topTen = new ArrayList<ThirdSortBean>();

    /**
     * Aggregates one {@code keyword_domain} group and offers it to the ranking.
     *
     * @param key     the {@code keyword_domain} group key
     * @param values  the beans emitted by the mapper for this group
     * @param context task context (output is deferred to {@link #cleanup})
     */
    @Override
    protected void reduce(Text key, Iterable<ThirdSortBean> values, Context context) throws IOException, InterruptedException {
        ThirdSortBean result = new ThirdSortBean();
        int totalClick = 0;
        boolean sawValue = false;
        // Hadoop reuses the value object while iterating, so copy the fields out.
        for (ThirdSortBean data : values) {
            totalClick += data.getTotal_click();
            result.setKeyWords(data.getKeyWords());
            result.setDomain(data.getDomain());
            sawValue = true;
        }
        if (!sawValue) {
            return;
        }
        result.setTotal_click(totalClick);
        offer(result);
    }

    /**
     * Writes the ranking as {@code (keyword, clicks)} pairs, highest first.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (ThirdSortBean entry : topTen) {
            context.write(new Text(entry.getKeyWords()), new IntWritable(entry.getTotal_click()));
        }
    }

    /** Inserts {@code candidate} into the ranking, keeping one entry per keyword and at most TOP_N entries. */
    private void offer(ThirdSortBean candidate) {
        String keyword = candidate.getKeyWords();
        int clicks = candidate.getTotal_click();

        // A keyword already ranked is only replaced by a strictly larger total.
        for (int i = 0; i < topTen.size(); i++) {
            ThirdSortBean ranked = topTen.get(i);
            if (sameKeyword(ranked.getKeyWords(), keyword)) {
                if (clicks <= ranked.getTotal_click()) {
                    return;
                }
                topTen.remove(i);
                break;
            }
        }

        // Insert after every entry with an equal or larger total so that ties
        // keep their arrival order and never overwrite one another.
        int position = 0;
        while (position < topTen.size() && topTen.get(position).getTotal_click() >= clicks) {
            position++;
        }
        if (position < TOP_N) {
            topTen.add(position, candidate);
        }
        if (topTen.size() > TOP_N) {
            topTen.remove(topTen.size() - 1);
        }
    }

    /** Null-safe string equality for keywords. */
    private static boolean sameKeyword(String a, String b) {
        return a == null ? b == null : a.equals(b);
    }
}
ThirdSortBean.java
package com.hniu.bigdata.hadoop.Third;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class ThirdSortBean implements WritableComparable<ThirdSortBean> {
private String keyWords;
private String domain;
private int total_click;
public ThirdSortBean(){}
public ThirdSortBean(String keyWords, String domain, int total_click) {
this.keyWords = keyWords;
this.domain = domain;
this.total_click = total_click;
}
public String getKeyWords() {
return keyWords;
}
public void setKeyWords(String keyWords) {
this.keyWords = keyWords;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
public int getTotal_click() {
return total_click;
}
public void setTotal_click(int total_click) {
this.total_click = total_click;
}
public int compareTo(ThirdSortBean o) {
return total_click > o.getTotal_click() ? -1 :(total_click == o.getTotal_click() ? 0 :1);
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(keyWords);
dataOutput.writeUTF(domain);
dataOutput.writeInt(total_click);
}
public void readFields(DataInput dataInput) throws IOException {
keyWords = dataInput.readUTF();
domain = dataInput.readUTF();
total_click = dataInput.readInt();
}
@Override
public String toString() {
return "{" +
"keyWords='" + keyWords + '\'' +
", domain='" + domain + '\'' +
", total_click=" + total_click +
'}';
}
}
ThirdDriver.java
package com.hniu.bigdata.hadoop.Third;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Configures and submits the top-ten search keyword job
 * ({@link ThirdMapper} + {@link ThirdReduce}).
 */
public class ThirdDriver {

    /** Input path used when no argument is given. */
    private static final String DEFAULT_INPUT = "/xyz";

    /** Output path used when fewer than two arguments are given. */
    private static final String DEFAULT_OUTPUT = "/Third_Data";

    /**
     * Runs the job and terminates the JVM with status 0 on success, 1 on failure.
     *
     * @param args optional {@code [inputPath [outputPath]]}; missing values fall
     *             back to {@code /xyz} and {@code /Third_Data}
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception{
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS","hdfs://192.168.179.46:8020");

        Job job = Job.getInstance(configuration, "word count");
        job.setJarByClass(ThirdDriver.class);
        job.setMapperClass(ThirdMapper.class);
        job.setReducerClass(ThirdReduce.class);

        // The reducer builds its ranking in memory and emits it in cleanup(),
        // so the result is a global top ten only with exactly one reduce task.
        job.setNumReduceTasks(1);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ThirdSortBean.class);
        job.setOutputKeyClass(Text.class);
        // Must match what ThirdReduce writes: (Text, IntWritable).
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Propagate job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}