Mapper implementation (WordCountMapper.java):
package os.unix.cn;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map phase of the word-count job: tokenizes each input line and emits
 * one {@code (word, 1)} pair per token.
 *
 * Input key is the byte offset of the line (unused); input value is the
 * line text. Output is {@code Text -> LongWritable}.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // map() is invoked once per input record, potentially millions of times;
    // reusing the output Writables avoids per-record allocations.
    private static final LongWritable ONE = new LongWritable(1);
    private final Text outWord = new Text();

    /**
     * Splits the line on whitespace and writes {@code (token, 1)} for each
     * non-empty token.
     *
     * @param key     byte offset of the line within the split (ignored)
     * @param value   the line of text to tokenize
     * @param context Hadoop context used to emit output pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // \\s+ collapses runs of whitespace; the original split(" ") produced
        // empty tokens for consecutive spaces, inflating a count for "".
        String[] words = value.toString().split("\\s+");
        for (String word : words) {
            if (word.isEmpty()) {
                // A leading separator still yields one empty leading token.
                continue;
            }
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}
Reducer implementation (WordCountReducer.java):
package os.unix.cn;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reduce phase of the word-count job: for each word, sums all the partial
 * counts emitted by the mappers and writes the total.
 *
 * Input and output are both {@code Text -> LongWritable}.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Accumulates the counts for one word and emits {@code (word, total)}.
     *
     * @param key     the word being aggregated
     * @param values  the per-mapper partial counts for this word
     * @param context Hadoop context used to emit the final pair
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0L;
        for (LongWritable partial : values) {
            total += partial.get();
        }
        context.write(key, new LongWritable(total));
    }
}
Job driver / main class (WordCountJob.java):
package os.unix.cn;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountJob {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration()
Job wordCountJob = Job.getInstance(conf)
// 指定本Job的 所在的jar包
wordCountJob.setJarByClass(WordCountJob.class)
//设置wordCountJob 所用的mapper逻辑类为那个类
wordCountJob.setMapperClass(WordCountMapper.class)
//设置wordCountJob 所用的mapper逻辑类为那个类
wordCountJob.setReducerClass(WordCountReducer.class)
// map阶段输出的kv类型
wordCountJob.setMapOutputKeyClass(Text.class)
wordCountJob.setMapOutputValueClass(LongWritable.class)
// reduce阶段输出的kv类型
wordCountJob.setOutputKeyClass(Text.class)
wordCountJob.setOutputValueClass(LongWritable.class)
//设置的要处理的文本数据存放路径
FileInputFormat.setInputPaths(wordCountJob, "hdfs://os-1:9000/word/srcdata/")
//设置最终结果所存放的路径
FileOutputFormat.setOutputPath(wordCountJob, new Path("hdfs://os-1:9000/wordcount/output/"))
wordCountJob.waitForCompletion(true)
}
}
After packaging into a jar, run:
hadoop jar wordCount.jar os.unix.cn.WordCountJob