输入数据
E:\input\wordcount\hello.txt
hadoop xue
hadoop xue hive
xue yankai
xue
xue yankai
hive
hbase
运行结果
E:\output\wordcount\part-r-00000
hadoop 2
hbase 1
hive 2
xue 5
yankai 2
WordCountMapper.class
/**
 * Mapper&lt;LongWritable, Text, Text, IntWritable&gt;
 * Mapper&lt;input key, input value, output key, output value&gt;
 *
 * Emits one &lt;word, 1&gt; pair for every whitespace-separated token on each
 * input line; the reducer later sums the 1s into per-word totals.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused across map() calls to avoid allocating a new object per record.
    private final Text word = new Text();
    // The count contribution of a single occurrence is always 1.
    private final IntWritable one = new IntWritable(1);

    /**
     * @param key     byte offset of the line within the split (unused)
     * @param value   one line of input text
     * @param context Hadoop context used to emit <word, 1> pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on runs of whitespace. A plain split(" ") yields empty
        // tokens for consecutive spaces or tabs, which would then be
        // counted as "words"; \\s+ collapses them.
        for (String token : value.toString().split("\\s+")) {
            if (token.isEmpty()) {
                // split("\\s+") still returns one leading empty token when
                // the line starts with whitespace — skip it.
                continue;
            }
            word.set(token);
            context.write(word, one);
        }
    }
}
WordCountReduce.class
/**
 * Reducer&lt;Text, IntWritable, Text, IntWritable&gt;
 * Reducer&lt;input key, input value, output key, output value&gt;
 *
 * Receives all the 1s emitted by the mapper for a given word and writes
 * a single &lt;word, total&gt; pair.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * @param key     the word being aggregated
     * @param values  every partial count (each a 1 from the mapper)
     * @param context Hadoop context used to emit the final <word, total> pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum every partial count for this word.
        int total = 0;
        for (IntWritable partial : values) {
            total += partial.get();
        }
        // Emit the aggregated count for the word.
        context.write(key, new IntWritable(total));
    }
}
WordCountDriver
/**
 * Driver that configures and submits the WordCount job.
 *
 * Implements {@link Tool} so ToolRunner can parse generic Hadoop options
 * (-D, -files, ...) and inject the resulting Configuration via setConf().
 */
public class WordCountDriver implements Tool {

    /** Configuration injected by ToolRunner before run() is called. */
    private Configuration conf;

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration configuration) {
        this.conf = configuration;
    }

    /**
     * Builds and submits the job.
     *
     * @param args args[0] = input path, args[1] = output path (must not exist)
     * @return 0 on success, 1 on job failure, 2 on bad usage
     */
    @Override
    public int run(String[] args) throws Exception {
        // Guard against a missing input/output path; the original code would
        // otherwise die with an ArrayIndexOutOfBoundsException below.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            return 2;
        }
        // Create the job from the ToolRunner-supplied configuration.
        Job job = Job.getInstance(conf);
        // Locate the jar to ship to the cluster via this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Wire up the mapper and reducer implementations.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // Intermediate (map-side) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reduce-side) key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output locations (HDFS paths when run on a cluster).
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and block until completion; true = stream progress to stdout.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Example local-run arguments:
        // args = new String[]{"e:/input/wordcount", "e:/output/wordcount"};
        WordCountDriver driver = new WordCountDriver();
        // ToolRunner handles generic options and supplies the Configuration.
        int exitCode = ToolRunner.run(driver, args);
        System.exit(exitCode);
    }
}
注意:
- 在windows环境上配置HADOOP_HOME环境变量。运行程序时,携带输入目录/输出目录的参数
- 在Linux运行,先打成jar包,上传到linux,执行运行命令。
运行jar包
[root@hadoop83 ~]# hadoop jar wc.jar vip.abatt.wordcount.WordCountDriver /root/input/wordcount /root/output/wordcount
其中/root/input/wordcount并非操作系统下的文件,而是HDFS系统中的文件,参考以下命令上传文件。
[root@hadoop83 ~]# hdfs dfs -mkdir -p /root/input/wordcount
[root@hadoop83 ~]# hdfs dfs -put hello.txt /root/input/wordcount
查看HDFS中运行结果
[root@hadoop83 ~]# hdfs dfs -cat /root/output/wordcount/part-r-00000
hadoop 2
hbase 1
hive 2
xue 5
yankai 2