Computing the weight of each word in each Weibo post
Approach:
Formula: weight = TF * ln(N / DF)
TF: the number of times the current word appears in this post
N: the total number of posts
DF: the number of posts in which the current word appears
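For instance, with made-up numbers TF = 2, N = 9 and DF = 3, the weight is 2 * ln(9/3) ≈ 2.197. Below is a minimal sketch of the formula in Java; WeightFormulaDemo and all of the values are hypothetical and only meant to illustrate the calculation:

public class WeightFormulaDemo {
    public static void main(String[] args) {
        int tf = 2;      //occurrences of the word in this post (made up)
        int n = 9;       //total number of posts (made up)
        int dfCount = 3; //posts containing the word (made up)
        //multiply by 1.0 so the division is done in floating point, not truncated to an int
        double weight = tf * Math.log(n * 1.0 / dfCount);
        System.out.println(weight); //prints roughly 2.1972
    }
}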
When coding, pay special attention to importing the correct packages; otherwise many strange errors can appear:
1. Test data
3823890335901756 今天是今年最暖和的一天,果断出来逛街!
3823890364788305 春天来了,约好友一起出去去踏青,去赏花!
3823890369489295 我在平湖,让你开挂练九阳真经,走火入魔毁了三叉神经了吧,改练九阴真经吧小子。 (免费下载 )
3823890373686361 约了小伙伴一起去理发!
3823890378201539 今天约了姐妹去逛街吃美食,周末玩得很开心啊!
3823890382081678 这几天一直在约,因为感冒发烧了,所以和老公约好了陪我去打针,求九阳安慰,我想喝豆浆,药好苦的
3823890399188850 和吃货的约会么就是吃
3823890419856548 全国包邮!九阳
3823890436963972 我亲爱的
2. Code:
FirstMapper:
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

//Counts how many times each word appears in one post (the TF in the formula)
//and emits one "count" record per post so that N (the total number of posts) can be tallied
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] v = value.toString().split("\t");
        if (v.length >= 2) {
            String id = v[0].trim();
            String content = v[1].trim();
            StringReader sr = new StringReader(content);
            //IK segmenter in smart mode splits the post content into words
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                //key: word_postId, value: 1 -> summed by the reducer to get TF
                context.write(new Text(w + "_" + id), new IntWritable(1));
            }
            sr.close();
            //one "count" record per post -> summed to get N in the formula
            context.write(new Text("count"), new IntWritable(1));
        } else {
            System.out.println(value.toString() + "------------------------------");
        }
    }
}
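If the segmentation results look odd, it helps to run IKSegmenter outside of MapReduce first. The sketch below is a standalone check, assuming the same IK Analyzer jar is on the classpath; SegmentDemo and the choice of the first test post as input are only for illustration:

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

//Prints the tokens IKSegmenter produces for one post, outside of MapReduce
public class SegmentDemo {
    public static void main(String[] args) throws IOException {
        String content = "今天是今年最暖和的一天,果断出来逛街!";
        StringReader sr = new StringReader(content);
        IKSegmenter seg = new IKSegmenter(sr, true); //true = smart mode, same as FirstMapper
        Lexeme word = null;
        while ((word = seg.next()) != null) {
            System.out.println(word.getLexemeText());
        }
        sr.close();
    }
}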
FirstReduce:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Sums the 1s for each key: word_postId keys yield TF, the "count" key yields N
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "__________" + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}
FirstPartition:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
//Importing the wrong HashPartitioner caused several test runs to produce no output -- take special care:
//import org.apache.hadoop.mapred.lib.HashPartitioner;

//Routes the "count" key to its own reducer (partition 3); all other keys are hashed across partitions 0-2
public class FirstPartition extends HashPartitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (key.equals(new Text("count")))
            return 3;
        else
            //default HashPartitioner behaviour: hash of the key modulo the number of reducers (here 3)
            return super.getPartition(key, value, numReduceTasks - 1);
    }
}
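The routing can also be exercised locally before submitting the job. A rough sketch, assuming the Hadoop client jars are on the classpath; PartitionDemo and the sample keys are made up for illustration:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

//Feeds a few keys through FirstPartition with 4 reducers, as FirstJob configures it
public class PartitionDemo {
    public static void main(String[] args) {
        FirstPartition p = new FirstPartition();
        IntWritable one = new IntWritable(1);
        //"count" always lands in partition 3
        System.out.println(p.getPartition(new Text("count"), one, 4));
        //ordinary word_postId keys are hashed into partitions 0-2
        System.out.println(p.getPartition(new Text("今天_3823890335901756"), one, 4));
        System.out.println(p.getPartition(new Text("逛街_3823890335901756"), one, 4));
    }
}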
FirstJob:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
//do NOT import this class by mistake:
//import org.apache.hadoop.examples.SecondarySort.FirstPartitioner;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstJob {
    public static void main(String[] args) throws ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo1");
            job.setJarByClass(FirstJob.class);
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            //number of reducers: 0-2 for word_postId keys, 3 for the "count" key
            job.setNumReduceTasks(4);
            //setting the wrong partitioner class here led to no results being written
            //job.setPartitionerClass(FirstPartitioner.class);
            job.setPartitionerClass(FirstPartition.class);
            job.setMapperClass(FirstMapper.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/input/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo1"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 1 finished successfully");
                TwoJob.mainJob();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
TwoMapper:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

//Counts the DF of each word: the number of posts it appears in
public class TwoMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        //part-r-00003 only holds the "count" record (N), so skip it here
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    //one record per (word, post) pair -> summed by the reducer to get DF
                    context.write(new Text(w), new IntWritable(1));
                }
            } else {
                System.out.println(value.toString() + "---------------");
            }
        }
    }
}
TwoReduce:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Sums the 1s per word to produce its DF
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
TwoJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoJob {
    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo2");
            job.setJarByClass(TwoJob.class);
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/output/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo2"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 2 finished successfully");
                LastJob.mainJob();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
LastMapper:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

//Combines TF (from the map input), N (from part-r-00003) and DF (from job 2's output)
//to compute the weight TF * ln(N / DF) of each word in each post
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {
    public static Map<String, Integer> cmap = null; //holds N, keyed by "count"
    public static Map<String, Integer> df = null;   //word -> DF

    //runs once before the map calls; loads the cached files into memory
    protected void setup(Context context) throws IOException, InterruptedException {
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];
                    if (uri.getPath().endsWith("part-r-00003")) {
                        //this file holds the single "count" record, i.e. N;
                        //cached files are localized into the task's working directory, so open them by file name
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = br.readLine();
                        if (line.startsWith("count")) {
                            String[] ls = line.split("\t");
                            cmap = new HashMap<String, Integer>();
                            cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    } else if (uri.getPath().endsWith("part-r-00000")) {
                        //this file holds one "word \t DF" line per word
                        df = new HashMap<String, Integer>();
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = null;
                        while ((line = br.readLine()) != null) {
                            String[] ls = line.split("\t");
                            df.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    }
                }
            }
        }
    }

    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        //skip part-r-00003, which only holds N
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                int tf = Integer.parseInt(v[1].trim());
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    String id = ss[1];
                    //weight = TF * ln(N / DF); multiply by 1.0 to force floating-point division
                    double s = tf * Math.log(cmap.get("count") * 1.0 / df.get(w));
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMaximumFractionDigits(5);
                    context.write(new Text(id), new Text(w + ":" + nf.format(s)));
                }
            } else {
                System.out.println(value.toString() + "-----------------");
            }
        }
    }
}
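Note that cmap.get("count") and df.get(w) are both Integer, so the * 1.0 in the weight line matters: without it the division is carried out on ints and the ratio is truncated before Math.log is applied, which often collapses the weight to 0. A tiny illustration with made-up counts (DivisionDemo is hypothetical):

//Made-up counts: N = 9 posts, the word appears in 6 of them
public class DivisionDemo {
    public static void main(String[] args) {
        int n = 9, dfCount = 6;
        System.out.println(Math.log(n / dfCount));       //0.0 -- 9/6 is truncated to 1 before the log
        System.out.println(Math.log(n * 1.0 / dfCount)); //~0.405 -- the intended ln(N/DF)
    }
}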
LastReduce:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Concatenates all word:weight pairs of one post into a single output line
public class LastReduce extends Reducer<Text, Text, Text, Text> {
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text text : value) {
            sb.append(text.toString() + "\t");
        }
        context.write(key, new Text(sb.toString()));
    }
}
LastJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LastJob {
    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo3");
            job.setJarByClass(LastJob.class);
            //cache the N file (job 1, partition 3) and the DF file (job 2) on every node
            job.addCacheFile(new Path("/output/weibo1/part-r-00003").toUri());
            job.addCacheFile(new Path("/output/weibo2/part-r-00000").toUri());
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setCombinerClass(LastReduce.class);
            job.setReducerClass(LastReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/output/weibo1/"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo3"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 3 finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
3. Results: