- 数据1:
- huangbo love xuzheng
- huangxiaoming love baby huangxiaoming love mimi
- liangchaowei love liujialing
- 数据2:
- hello huangbo
- hello xuzheng
- hello huangxiaoming
题目一:编写 MapReduce 求出以下格式的结果数据:统计每个关键词在每个文档当中的第几行出现了多少次。
例如,huangxiaoming 关键词的格式:
huangxiaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
首先是进行文件的切分,拼接添加行号,以单词为 key,文件名和行号进行拼接作为 value,然后通过第二个 MapReduce 程序将数据组合成我们需要的样式。
- 第一个MapReduce程序
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.FileSplit;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_1_1 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
- Text k = new Text();
- IntWritable v = new IntWritable(1);
- int num = 0;
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String line = value.toString();
- //行号
- num++;
- String[] words = line.split(" ");
- //huangixaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
- FileSplit inputSplit = (FileSplit) context.getInputSplit();
- //通过切片获取文件的名称
- String fileName = inputSplit.getPath().getName();
- for (String word : words) {
- //单词+文件名+行号 作为key输出
- k.set(word + ":" + fileName+ ":" + (num));
- System.out.println(word + "--" + fileName+ "--" + (num));
- context.write(k, v);
- }
- }
- }
- public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
- Text t = new Text();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values, Context context)
- throws IOException, InterruptedException {
- //获取到key 单词+文件名+行号;
- //根据key相同,进行累加相同的word出现了几次
- int count = 0;
- for (IntWritable value : values) {
- count += value.get();
- }
- //转化输出
- t.set(key.toString()+","+count);
- context.write(t,NullWritable.get());
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_1_1.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(NullWritable.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
- if(fs.exists(new Path("G:/test/q3/output_3_1"))){
- fs.delete(new Path("G:/test/q3/output_3_1"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_1"));
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
第二个MapReduce程序
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_1_2 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, Text>{
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String line = value.toString();
- String[] files = line.split(":");
- //k.set(word + ":" + fileName+ ":" + (num));
- //baby:mapreduce-4-1.txt:2,1
- String str = files[1]+":"+files[2];
- context.write(new Text(files[0]), new Text(str));
- }
- }
- public static class MRReducer extends Reducer<Text, Text, Text, Text>{
- @Override
- protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- StringBuffer sb = new StringBuffer();
- for (Text text : values) {
- sb.append(text.toString()+";");
- }
- context.write(key, new Text(sb.toString()));
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_1_2.class);
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_1"));
- if(fs.exists(new Path("G:/test/q3/output_3_2"))){
- fs.delete(new Path("G:/test/q3/output_3_2"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_2"));
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
例如:
huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
以上答案的含义:
关键词 huangxiaoming 在第一份文档 mapreduce-4-1.txt 中出现了 3 次,在第二份文档mapreduce-4-2.txt 中出现了 1 次。
方案:先统计出每个关键词在某个文件中的出现次数,然后再进行排序。
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.FileSplit;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_2_1 {
- public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
- Text k = new Text();
- IntWritable v = new IntWritable(1);
- @Override
- protected void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
- String line = value.toString();
- String[] words = line.split(" ");
- //huangixaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
- FileSplit inputSplit = (FileSplit) context.getInputSplit();
- String fileName = inputSplit.getPath().getName();
- for (String word : words) {
- k.set(word + ":" + fileName);
- context.write(k, v);
- }
- }
- }
- public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
- Text t = new Text();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values, Context context)
- throws IOException, InterruptedException {
- int count = 0;
- for (IntWritable value : values) {
- count += value.get();
- }
- t.set(key.toString()+","+count);
- context.write(t,NullWritable.get());
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_2_1.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(NullWritable.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
- if(fs.exists(new Path("G:/test/q3/output_3_3"))){
- fs.delete(new Path("G:/test/q3/output_3_3"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_3"));
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class Question3_2_2 {
- //huangixaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
- //yangmi:mapreduce-4-1.txt,1
- public static class MRMapper extends Mapper<LongWritable, Text, TestBean, NullWritable>{
- @Override
- protected void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
- String[] line = value.toString().split(":");
- TestBean tb = new TestBean(line[0],line[1].split(",")[0],Integer.parseInt(line[1].split(",")[1]));
- context.write(tb,NullWritable.get());
- }
- }
- public static class MRReducer extends Reducer<TestBean, NullWritable, Text, Text>{
- Text k = new Text();
- Text v = new Text();
- @Override
- protected void reduce(TestBean key, Iterable<NullWritable> values, Context context)
- throws IOException, InterruptedException {
- StringBuffer sb = new StringBuffer();
- for (NullWritable nv : values) {
- sb.append(key.getFileName()+","+key.getNum()+";");
- }
- k.set(key.getName());
- v.set(sb.toString());
- context.write(k, v);
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Job job = Job.getInstance(conf);
- job.setJarByClass(Question3_2_2.class);
- job.setMapperClass(MRMapper.class);
- job.setReducerClass(MRReducer.class);
- job.setMapOutputKeyClass(TestBean.class);
- job.setMapOutputValueClass(NullWritable.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- job.setGroupingComparatorClass(UserGC.class);
- FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_3"));
- if(fs.exists(new Path("G:/test/q3/output_3_4"))){
- fs.delete(new Path("G:/test/q3/output_3_4"), true);
- }
- FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_4"));
- System.exit(job.waitForCompletion(true) ? 1:0);
- }
- }
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import org.apache.hadoop.io.WritableComparable;
- public class TestBean implements WritableComparable<TestBean>{
- private String name;
- private String fileName;
- private int num;
- public String getName() {
- return name;
- }
- public void setName(String name) {
- this.name = name;
- }
- public String getFileName() {
- return fileName;
- }
- public void setFileName(String fileName) {
- this.fileName = fileName;
- }
- public int getNum() {
- return num;
- }
- public void setNum(int num) {
- this.num = num;
- }
- public TestBean() {
- super();
- // TODO Auto-generated constructor stub
- }
- public TestBean(String name, String fileName, int num) {
- super();
- this.name = name;
- this.fileName = fileName;
- this.num = num;
- }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(name);
- out.writeUTF(fileName);
- out.writeInt(num);
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- name = in.readUTF();
- fileName = in.readUTF();
- num = in.readInt();
- }
- @Override
- public int compareTo(TestBean o) {
- if(o.getName().compareTo(this.getName()) == 0){
- int flag = o.getNum()-this.getNum();
- if(flag == 0){
- return 0;
- }else if(flag > 0){
- return 1;
- }else{
- return -1;
- }
- }else{
- return o.getName().compareTo(this.getName());
- }
- }
- }
自定义分组组件:UserGC
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.io.WritableComparator;
- public class UserGC extends WritableComparator{
- public UserGC() {
- super(TestBean.class,true);
- }
- @Override
- public int compare(WritableComparable a, WritableComparable b) {
- TestBean pa = (TestBean) a;
- TestBean pb = (TestBean) b;
- return pa.getName().compareTo(pb.getName());
- }
- }