WordCount
需求: 在一堆给定的文本文件中统计输出每一个单词出现的总次数
Step 1. 数据格式准备
- 创建一个新的文件
cd /export/servers
vim wordcount.txt
- 向其中放入以下内容并保存
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
- 上传到 HDFS
hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
Step 2. Mapper
/**
 * Mapper for WordCount: splits each comma-separated input line into words
 * and emits (word, 1) for every token.
 *
 * Input key   : byte offset of the line in the file (LongWritable) — unused.
 * Input value : one line of text (Text).
 * Output      : (Text word, LongWritable 1) pairs consumed by the reducer.
 */
public class WordCountMapper extends Mapper<LongWritable,Text,Text,LongWritable> {

    // Reuse Writable instances across map() calls instead of allocating new
    // objects per record — the standard Hadoop idiom to reduce GC pressure,
    // since map() is invoked once per input line.
    private final Text outWord = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // The sample data is comma-delimited (e.g. "hello,world,hadoop").
        String[] split = line.split(",");
        for (String word : split) {
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}
Step 3. Reducer
/**
 * Reducer for WordCount: receives one word together with all of its partial
 * counts from the mappers, sums them, and writes (word, totalCount).
 *
 * @see org.apache.hadoop.mapreduce.Reducer
 */
public class WordCountReducer extends Reducer<Text,LongWritable,Text,LongWritable> {
    /**
     * Sums the occurrence counts for a single word.
     *
     * @param key     the word
     * @param values  the per-mapper counts for this word
     * @param context used to emit the (word, total) result
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long total = 0L;
        for (LongWritable partial : values) {
            total = total + partial.get();
        }
        context.write(key, new LongWritable(total));
    }
}
Step 4. 定义主类, 描述 Job 并提交 Job
public class JobMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(