package com.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map stage of the word-count job.
 *
 * <p>Input records arrive as key/value pairs:
 * keyIn:   {@link LongWritable} — byte offset of the line in the input split
 * valueIn: {@link Text} — the line content
 *
 * <p>Output handed to the reducer:
 * keyOut:  {@link Text} — a single word
 * valueOut: {@link IntWritable} — the constant count 1 per occurrence
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reuse output writables across map() calls — the framework serializes
    // them immediately, so per-record allocation is wasted garbage.
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    /**
     * Emits {@code <word, 1>} for every token on the input line.
     *
     * @throws IOException          on serialization/write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the raw input line.
        String line = value.toString();
        // 2. Split on runs of whitespace. The original split(" ") produced
        //    empty-string tokens for consecutive/leading spaces, which were
        //    then counted as a "" word — \s+ collapses the runs instead.
        String[] words = line.split("\\s+");
        // 3. Write <word, 1> for each non-empty token.
        for (String s : words) {
            if (s.isEmpty()) {
                continue; // split still yields one leading empty token when the line starts with whitespace
            }
            word.set(s);
            context.write(word, ONE);
        }
    }
}
|
package com.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce stage of the word-count job: receives the mapper's output grouped
 * by key, i.e. {@code <word, [1, 1, ...]>}.
 *
 * keyIn/valueIn:   {@link Text} / {@link IntWritable} — mapper output types
 * keyOut/valueOut: {@link Text} / {@link IntWritable} — word and its total count
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output writable — avoids allocating one IntWritable per key.
    private final IntWritable result = new IntWritable();

    /**
     * Sums all counts for one word and emits {@code <word, total>}.
     *
     * @param key    the word
     * @param values every count the mappers emitted for this word
     * @throws IOException          on serialization/write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Accumulate the occurrence counts.
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        // 2. Emit the aggregated total.
        result.set(sum);
        context.write(key, result);
    }
}
|
package com.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver: configures and submits the word-count MapReduce job.
 *
 * <p>Usage: {@code WordCountDriver <input path> <output path>}
 * (the output path must not already exist on HDFS).
 */
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1. Create the job.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        // 2. Locate the jar that contains this driver class.
        job.setJarByClass(WordCountDriver.class);
        // 3. Mapper implementation.
        job.setMapperClass(WordCountMapper.class);
        // 4. Reducer implementation.
        job.setReducerClass(WordCountReduce.class);
        // 5. Mapper output types (must match the reducer's input types).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 6. Final (reducer) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 7. Input path.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // 8. Output path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 9. Submit, wait, and propagate success/failure as the PROCESS exit
        //    code. The original printed the code to stdout, so shell scripts
        //    and schedulers could never detect a failed job.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
|
Package the project as a jar, copy it to the Hadoop cluster, and run it with:
hadoop jar HDFSTest001-1.0-SNAPSHOT.jar com.wordcount.WordCountDriver /merge.txt /output