需求:在给定的文本文件中统计输出每一个单词出现的总次数
数据格式准备如下:
cd /export/servers
vim wordcount.txt(加入以下内容)
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
定义一个mapper类:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper for the word-count job.
 *
 * <p>Splits every input line on commas and emits a {@code (word, 1)} pair for
 * each non-empty word, so that the reducer can add the ones up per word.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** Separator between words on one input line. */
    private static final String SEPARATOR = ",";

    /** Constant count emitted for every single word occurrence. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key, so that no new Text is allocated per emitted word. */
    private final Text word = new Text();

    /**
     * Emits {@code (word, 1)} for every non-empty, whitespace-trimmed word on the line.
     *
     * @param key     input key of the record; not used here (presumably the line's
     *                byte offset when the default text input format is used)
     * @param value   one line of input text, words separated by commas
     * @param context sink that receives the emitted {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String token : value.toString().split(SEPARATOR)) {
            String trimmed = token.trim();
            if (trimmed.isEmpty()) {
                // A blank line or two consecutive separators produce empty tokens;
                // they are not words and must not be counted.
                continue;
            }
            word.set(trimmed);
            context.write(word, ONE);
        }
    }
}
定义一个reducer类:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer for the word-count job: adds up all counts received for one word
 * and writes a single {@code (word, total)} pair.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Writes the key together with the sum of all of its values.
     *
     * @param key     the word being reduced
     * @param values  the counts collected for that word
     * @param context sink that receives the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        LongWritable total = new LongWritable(sumOf(values));
        context.write(key, total);
    }

    /** Returns the sum of every value in {@code counts}; 0 when there are none. */
    private static long sumOf(Iterable<LongWritable> counts) {
        long sum = 0L;
        java.util.Iterator<LongWritable> it = counts.iterator();
        while (it.hasNext()) {
            sum += it.next().get();
        }
        return sum;
    }
}
定义一个主类,用来描述job并提交job:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the word-count job: configures the job with
 * {@link WordCountMapper} and {@link WordCountReducer}, submits it and waits
 * for it to finish.
 *
 * <p>Usage: {@code hadoop jar <jar> JobMain [inputPath [outputPath]]}. When a
 * path argument is omitted, the corresponding default below is used.
 */
public class JobMain {

    /** Input path used when none is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/wordcount";

    /** Output path used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/wordcount_out";

    /**
     * Builds and runs the job, then exits the JVM with status 0 when
     * {@code waitForCompletion} returns {@code true} and 1 otherwise.
     *
     * @param args optional: {@code args[0]} is the input path,
     *             {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        Job job = Job.getInstance(new Configuration(), "Demo01");

        // Main class of the program; used to locate the jar that contains it.
        job.setJarByClass(JobMain.class);

        // Map and reduce implementations.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Key/value types emitted by the reduce phase (the final output).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Where to read the input from.
        FileInputFormat.addInputPath(job, new Path(inputPath));

        // Where to write the final result to.
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Submit the job and block until it completes.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
代码编写完毕后将代码打成jar包放到服务器上面去运行
hadoop jar jar包名 主类的全限定类名(即包含main方法的类,例如 JobMain)