- 数据1:
- huangbo love xuzheng
- huangxiaoming love baby huangxiaoming love mimi
- liangchaowei love liujialing
- 数据2:
- hello huangbo
- hello xuzheng
- hello huangxiaoming
题目一:编写 MapReduce 求出以下格式的结果数据:统计每个关键词在每个文档当中的第几行出现了多少次。
例如,huangxiaoming 关键词的格式:
huangxiaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
首先是进行文件的切分,拼接添加行号,以单词为 key,文件名和行号进行拼接作为 value,然后通过第二个 MapReduce 程序将数据组合成我们需要的样式。
- 第一个MapReduce程序
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.FileSplit;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_1_1 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
- Text k = new Text();
- IntWritable v = new IntWritable(1);
- int num = 0;
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String line = value.toString();
- //行号
- num++;
- String[] words = line.split(" ");
- //huangixaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
- FileSplit inputSplit = (FileSplit) context.getInputSplit();
- //通过切片获取文件的名称
- String fileName = inputSplit.getPath().getName();
- for (String word : words) {
- //单词+文件名+行号 作为key输出
- k.set(word + ":" + fileName+ ":" + (num));
- System.out.println(word + "--" + fileName+ "--" + (num));
- context.write(k, v);
- }
- }
- }
- public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
- Text t = new Text();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values, Context context)
- throws IOException, InterruptedException {
- //获取到key 单词+文件名+行号;
- //根据key相同,进行累加相同的word出现了几次
- int count = 0;
- for (IntWritable value : values) {
- count += value.get();
- }
- //转化输出
- t.set(key.toString()+","+count);
- context.write(t,NullWritable.get());
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_1_1.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(NullWritable.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
- if(fs.exists(new Path("G:/test/q3/output_3_1"))){
- fs.delete(new Path("G:/test/q3/output_3_1"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_1"));
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
第二个MapReduce程序
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_1_2 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, Text>{
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String line = value.toString();
- String[] files = line.split(":");
- //k.set(word + ":" + fileName+ ":" + (num));
- //baby:mapreduce-4-1.txt:2,1
- String str = files[1]+":"+files[2];
- context.write(new Text(files[0]), new Text(str));
- }
- }
- public static class MRReducer extends Reducer<Text, Text, Text, Text>{
- @Override
- protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- StringBuffer sb = new StringBuffer();
- for (Text text : values) {
- sb.append(text.toString()+";");
- }
- context.write(key, new Text(sb.toString()));
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_1_2.class);
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_1"));
- if(fs.exists(new Path("G:/test/q3/output_3_2"))){
- fs.delete(new Path("G:/test/q3/output_3_2"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_2"));
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
例如:
huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
以上答案的含义:
关键词 huangxiaoming 在第一份文档 mapreduce-4-1.txt 中出现了 3 次,在第二份文档mapreduce-4-2.txt 中出现了 1 次。
方案:先统计出每个关键词在某个文件中的出现次数,然后再进行排序。
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.FileSplit;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_2_1 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
- Text k = new Text();
- IntWritable v = new IntWritable(1);
- @Override
- protected void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
- String line = value.toString();
- String[] words = line.split(" ");
- //huangixaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
- FileSplit inputSplit = (FileSplit) context.getInputSplit();
- String fileName = inputSplit.getPath().getName();
- for (String word : words) {
- k.set(word + ":" + fileName);
- context.write(k, v);
- }
- }
- }
- public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
- Text t = new Text();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values, Context context)
- throws IOException, InterruptedException {
- int count = 0;
- for (IntWritable value : values) {
- count += value.get();
- }
- t.set(key.toString()+","+count);
- context.write(t,NullWritable.get());
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_2_1.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(NullWritable.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
- if(fs.exists(new Path("G:/test/q3/output_3_3"))){
- fs.delete(new Path("G:/test/q3/output_3_3"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_3"));
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_2_2 {
- //huangixaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
- //yangmi:mapreduce-4-1.txt,1
- public static class MRMapper extends Mapper<LongWritable, Text, TestBean, NullWritable>{
- @Override
- protected void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
- String[] line = value.toString().split(":");
- TestBean tb = new TestBean(line[0],line[1].split(",")[0],Integer.parseInt(line[1].split(",")[1]));
- context.write(tb,NullWritable.get());
- }
- }
- public static class MRReducer extends Reducer<TestBean, NullWritable, Text, Text>{
- Text k = new Text();
- Text v = new Text();
- @Override
- protected void reduce(TestBean key, Iterable<NullWritable> values, Context context)
- throws IOException, InterruptedException {
- StringBuffer sb = new StringBuffer();
- for (NullWritable nv : values) {
- sb.append(key.getFileName()+","+key.getNum()+";");
- }
- k.set(key.getName());
- v.set(sb.toString());
- context.write(k, v);
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_2_2.class);
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- job.setMapOutputKeyClass(TestBean.class);
- job.setMapOutputValueClass(NullWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- job.setGroupingComparatorClass(UserGC.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_3"));
- if(fs.exists(new Path("G:/test/q3/output_3_4"))){
- fs.delete(new Path("G:/test/q3/output_3_4"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_4"));
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import org.apache.hadoop.io.WritableComparable;
- public class TestBean implements WritableComparable<TestBean>{
- private String name;
- private String fileName;
- private int num;
- public String getName() {
- return name;
- }
- public void setName(String name) {
- this.name = name;
- }
- public String getFileName() {
- return fileName;
- }
- public void setFileName(String fileName) {
- this.fileName = fileName;
- }
- public int getNum() {
- return num;
- }
- public void setNum(int num) {
- this.num = num;
- }
- public TestBean() {
- super();
- // TODO Auto-generated constructor stub
- }
- public TestBean(String name, String fileName, int num) {
- super();
- this.name = name;
- this.fileName = fileName;
- this.num = num;
- }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(name);
- out.writeUTF(fileName);
- out.writeInt(num);
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- name = in.readUTF();
- fileName = in.readUTF();
- num = in.readInt();
- }
- @Override
- public int compareTo(TestBean o) {
- if(o.getName().compareTo(this.getName()) == 0){
- int flag = o.getNum()-this.getNum();
- if(flag == 0){
- return 0;
- }else if(flag > 0){
- return 1;
- }else{
- return -1;
- }
- }else{
- return o.getName().compareTo(this.getName());
- }
- }
- }
自定义分组组件:UserGC
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.io.WritableComparator;
- public class UserGC extends WritableComparator{
- public UserGC() {
- super(TestBean.class,true);
- }
- @Override
- public int compare(WritableComparable a, WritableComparable b) {
- TestBean pa = (TestBean) a;
- TestBean pb = (TestBean) b;
- return pa.getName().compareTo(pb.getName());
- }
- }