统计文件中每个单词出现的次数,输入文件为 /hello/input/wc/text
1、java 代码,WcMapper.class继承Mapper可执行业务代码
package com.jxl.mr;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//每次调用map方法会传入split中的一行数据key:改行数据所在文件中的位置下标,value 是这行数据
protected void map(LongWritable key, Text value,
Context context)
throws IOException, InterruptedException {
final String line = value.toString();
final StringTokenizer stringTokenizer = new StringTokenizer(line);
while(stringTokenizer.hasMoreTokens()){
final String word = stringTokenizer.nextToken();
context.write(new Text(word), new IntWritable(1));//map输出
}
}
}
2、java 代码,WcReduce.class继承Reducer可执行业务代码
package com.jxl.mr;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WcReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
//重写reduce
protected void reduce(Text key, Iterable<IntWritable> iterable,
Context context)
throws IOException, InterruptedException {
int sum = 0;
for(IntWritable i:iterable){
sum = sum+i.get();
}
//输出
context.write(key, new IntWritable(sum));
}
}
3、java 代码,JobRun.class 是作业的驱动类(入口),负责配置 Job 并将其提交到 JobTracker 执行
package com.jxl.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the word-count job: configures a {@link Job} with {@link WcMapper}
 * and {@link WcReduce}, submits it, waits for it to finish and exits the JVM
 * with a status that reflects the outcome.
 */
public class JobRun {

    /**
     * Configures and runs the job, then terminates the JVM.
     *
     * <p>Exit status is 0 when {@code waitForCompletion} returns {@code true};
     * it is 1 when it returns {@code false} or when any exception is thrown
     * while setting up or running the job.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {
        System.err.println("Job开始执行");
        final Configuration config = new Configuration();
        config.set("mapred.job.tracker", "centos-node6:9001");

        // Assume failure until the job reports otherwise, so an exception can
        // never be reported as a successful exit.
        int exitCode = 1;
        try {
            // NOTE(review): newer Hadoop releases deprecate this constructor in
            // favour of Job.getInstance(config); kept because the
            // "mapred.job.tracker" key suggests an older API — confirm the target version.
            final Job job = new Job(config);
            job.setJarByClass(JobRun.class);
            job.setMapperClass(WcMapper.class);
            job.setReducerClass(WcReduce.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // Declare the final (reducer) output types explicitly; they match
            // what WcReduce is parameterized to emit.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // The number of reduce tasks is not set here, so the configured default applies.

            // Input file or directory for the job.
            FileInputFormat.addInputPath(job, new Path("/hello/input/wc/text"));
            // Directory that receives the job's output.
            FileOutputFormat.setOutputPath(job, new Path("/hello/output/wc/"));

            // Wait for the job first, then report completion: the message used
            // to sit after System.exit and therefore never printed.
            exitCode = job.waitForCompletion(true) ? 0 : 1;
            System.err.println("Job执行完成");
        } catch (Exception e) {
            // Top-level boundary: report the failure; exitCode is still 1.
            e.printStackTrace();
        }
        System.exit(exitCode);
    }
}
4、打包为 wc.jar,然后执行以下命令
./hadoop jar /wc.jar com.jxl.mr.JobRun