// Data source: SogouQ query logs.
// Statistics: counts, for each query, the number of query words it contains.

package Sogou;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

/**
 * Mapper: for every input line emits ("ca::", 1) — the grand total of
 * queries — plus one ("cN::", 1) pair classifying the line by its number
 * of '+'-separated query words (c1..c3 exact, c4 = four or more).
 */
public class SogouQueryWordCountClassifyMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused across map() calls to avoid one allocation per record.
    private final IntWritable one = new IntWritable(1);
    private final Text outKey = new Text();

    public void map(LongWritable key, Text values,
                    OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        // "ca::" counts every query line (used for the overall total).
        outKey.set("ca::");
        output.collect(outKey, one);

        // Classify by the number of query words, which are '+'-separated.
        // NOTE(review): String.split never returns a zero-length array for a
        // non-null input, so the "c0::" bucket is effectively unreachable;
        // it is kept to preserve the original output categories.
        int length = values.toString().split("\\+").length;
        String outline;
        if (0 == length) {
            outline = "c0::";      /* 0 query words */
        } else if (1 == length) {
            outline = "c1::";      /* 1 query word */
        } else if (2 == length) {
            outline = "c2::";      /* 2 query words */
        } else if (3 == length) {
            outline = "c3::";      /* 3 query words */
        } else {
            outline = "c4::";      /* 4 or more query words */
        }
        outKey.set(outline);
        output.collect(outKey, one); // map output, summed by the reducer
    }
}

package Sogou;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

/** Reducer: sums the 1s emitted by the mapper for each category key. */
public class SogouQueryWordCountClassifyReducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values,
                       OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

package Sogou;

import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

/**
 * Routes each statistics category to its own reduce partition so every
 * counter lands in a separate output file.
 *
 * BUG FIX: the original guarded on {@code numPartitions >= 6} yet returned
 * partition numbers 1..7. Valid partition indices are 0..numPartitions-1,
 * so with exactly 6 reducers the returned 6 and 7 are out of range and the
 * job fails at runtime. The six known categories now map to indices 0..5,
 * consistent with the guard.
 */
public class PartitionerClass implements Partitioner<Text, IntWritable> {

    public int getPartition(Text key, IntWritable values, int numPartitions) {
        /* Distinguish the statistics categories produced by the mapper. */
        if (numPartitions >= 6) {
            String k = key.toString();
            if (k.startsWith("ca::")) {
                return 0;
            } else if (k.startsWith("c0::")) {
                return 1;
            } else if (k.startsWith("c1::")) {
                return 2;
            } else if (k.startsWith("c2::")) {
                return 3;
            } else if (k.startsWith("c3::")) {
                return 4;
            } else {
                return 5; // "c4::" and any unexpected key
            }
        }
        // Too few reducers to spread the categories: send everything to 0.
        return 0;
    }

    public void configure(JobConf job) {}
}

package Sogou;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

/**
 * Job driver: wires the mapper, reducer, and category partitioner together.
 * args[0] = input directory, args[1] = output directory (must not exist).
 */
public class SogouQueryWordCountClassify {

    public static void main(String[] args) {
        JobConf conf = new JobConf(Sogou.SogouQueryWordCountClassify.class);
        conf.setJobName("SogouQueryWordCountClassify");

        // Output types (shared by map and reduce).
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Input and output DIRECTORIES (not files).
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setMapperClass(Sogou.SogouQueryWordCountClassifyMapper.class);
        conf.setReducerClass(Sogou.SogouQueryWordCountClassifyReducer.class);
        conf.setPartitionerClass(Sogou.PartitionerClass.class);

        // Six reduce tasks so the partitioner can separate all six
        // categories; the original left the default of 1, which made the
        // custom partitioner's classifying branch unreachable.
        conf.setNumReduceTasks(6);

        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

// Results:
// c1:: 19266013  // queries with 1 query word
// c2:: 1621804   // queries with 2 query words
// c3:: 364414
// c4:: 174710
// ca:: 21426941  // total number of queries