【大数据实验2】note4：Hadoop统计单词频数JAVA类编写

note4：Hadoop统计单词频数JAVA类编写

1 WCMapper类
2 WCReducer类
- 报错
3 WordCount类

直接在默认package下写代码啦：【New】 → 【Class】

1 WCMapper类

WCMapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper of the word-count job.
 *
 * <p>The four type parameters of Hadoop's {@code Mapper} are KEYIN (K1), VALUEIN (V1),
 * KEYOUT (K2) and VALUEOUT (V2). Here an input record {@code <K1, V1>} looks like
 * {@code <0, "hello tom">} and an output record {@code <K2, V2>} looks like
 * {@code <"hello", 1>}. Hadoop uses its own Writable types rather than JDK
 * serialization, so numeric keys/values are declared as {@code LongWritable} and
 * strings as {@code Text}.
 *
 * <p>Each input line is split on single spaces and every non-empty token is emitted
 * with a count of 1. Empty tokens (from blank lines, leading spaces or consecutive
 * spaces) are skipped so that the empty string is never counted as a word.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
      // Constant count emitted for every word occurrence; never mutated.
      private static final LongWritable ONE = new LongWritable(1);

      // Reused output key, so that no new Text is allocated per word.
      private final Text outputWord = new Text();

      /**
       * Emits {@code <word, 1>} for every non-empty space-separated token of the line.
       *
       * @param key     the input key (K1); not used by this mapper
       * @param value   one line of input text (V1)
       * @param context the Hadoop context the output records are written to
       * @throws IOException          if writing an output record fails
       * @throws InterruptedException if the task is interrupted while writing
       */
      @Override
      protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                                              throws IOException, InterruptedException {
             // Convert the incoming line (V1) to a plain String.
             String line = value.toString();
             // Split on single spaces, exactly as before; empty tokens are filtered below.
             String[] words = line.split(" ");
             for (String word : words) {
                 // split(" ") produces "" for blank lines and for leading/repeated spaces.
                 if (word.isEmpty()) {
                     continue;
                 }
                 outputWord.set(word);
                 context.write(outputWord, ONE);
             }
       }
}

2 WCReducer类

WCReducer类

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer of the word-count job: sums the counts emitted for each word.
 *
 * <p>The class is named {@code WCReducer} because that is the name the job driver
 * registers via {@code job.setReducerClass(WCReducer.class)}.
 */
public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
            /**
             * Adds up all values received for {@code key} and writes {@code <key, total>},
             * for example {@code <"hello", 5>}.
             *
             * @param key     the word (K2)
             * @param v2s     the counts collected for this word
             * @param context the Hadoop context the result record is written to
             * @throws IOException          if writing the output record fails
             * @throws InterruptedException if the task is interrupted while writing
             */
            @Override
            protected void reduce(Text key, Iterable<LongWritable> v2s,
                              Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
                    // Running total of occurrences of this word.
                    long counter = 0;
                    // Sum the actual values rather than counting elements, so the
                    // result stays correct even if a value is greater than 1.
                    for (LongWritable count : v2s) {
                          counter += count.get();
                    }
                    // Emit <K3, V3>.
                    context.write(key, new LongWritable(counter));
            }
}

报错

错误原因：Java编译器版本太低
解决方法

【properties】
【Java Compiler】 → 【Compiler compliance level：高于1.5（这里选了1.7）】

3 WordCount类

勾上√【public static void main】
WordCount类

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver of the word-count job: configures the MapReduce job, submits it, waits for
 * it to finish and prints the elapsed wall-clock time.
 */
public class WordCount {
             /**
              * Configures and runs the job.
              *
              * <p>The process exits with status 0 if the job succeeded, 1 if it failed,
              * and 2 if the required arguments are missing.
              *
              * @param args {@code args[0]} is the input path, {@code args[1]} the output path
              * @throws Exception if job setup or submission fails
              */
             public static void main(String[] args) throws Exception {
                   // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
                   if (args.length < 2) {
                         System.err.println("Usage: WordCount <input path> <output path>");
                         System.exit(2);
                   }

                   long startTime = System.currentTimeMillis();

                   // The Mapper and Reducer are already defined; the MapReduce job
                   // itself is represented by a Job object.
                   Job job = Job.getInstance(new Configuration());

                   // The class containing main() must be registered here.
                   job.setJarByClass(WordCount.class);

                   // Mapper settings: mapper class and its output key/value (K2/V2) types.
                   job.setMapperClass(WCMapper.class);
                   job.setMapOutputKeyClass(Text.class);
                   job.setMapOutputValueClass(LongWritable.class);
                   // Input location, passed in as the first argument.
                   FileInputFormat.setInputPaths(job, new Path(args[0]));

                   // Reducer settings: reducer class and the final key/value (K3/V3) types.
                   job.setReducerClass(WCReducer.class);
                   job.setOutputKeyClass(Text.class);
                   job.setOutputValueClass(LongWritable.class);
                   // Output location, passed in as the second argument.
                   FileOutputFormat.setOutputPath(job, new Path(args[1]));

                   // Submit the job and wait; passing true prints progress and details.
                   boolean succeeded = job.waitForCompletion(true);
                   long endTime = System.currentTimeMillis();
                   System.out.println("程序运行时间：" + (endTime - startTime) + "ms");

                   // Propagate the job result so callers/scripts can detect failure.
                   System.exit(succeeded ? 0 : 1);
            }
}