1. Map
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * A simple word count: this input is read in and handed to map() by the
 * MapReduce framework automatically.
 *
 * KEYIN    by default, the byte offset at which the current line of text starts; a Long,
 *          wrapped in Hadoop's own serializable type, LongWritable
 * VALUEIN  by default, the content of the current line of text; Hadoop's serializable type is Text
 * KEYOUT   the key emitted after the user-defined logic runs, here the word itself (a String, emitted as Text)
 * VALUEOUT the value emitted by the user-defined logic, here the word's count (a Long, emitted as LongWritable)
 *
 * @author Administrator
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // the line of text MapReduce has read for this call
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            // emit the word as the key and 1 as the value;
            // these pairs are shuffled to the reducers
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
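One common refinement, not required for correctness: context.write() copies the key/value pair into the output buffer at the moment it is called, so the Text and LongWritable objects can safely be reused across calls instead of being allocated once per word. A minimal sketch of the same mapper with reused instances:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // reused across map() calls; safe because context.write() copies the data out
    private final Text outKey = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}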
2. Reduce
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * First Text:          the word, as passed in from the Mapper
 * First LongWritable:  the per-occurrence counts for that word, grouped by the framework
 *                      (e.g. if "hello" appeared 11 times, values holds eleven 1s)
 * Second Text:         the word as written to the output file
 * Second LongWritable: the total count as written to the output file
 *
 * @author Administrator
 */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // sum all the 1s collected for this word
        long count = 0;
        for (LongWritable num : values) {
            count += num.get();
        }
        context.write(key, new LongWritable(count));
    }
}
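The framework calls reduce() once per distinct key, with all of that key's values grouped into a single Iterable. One way to check this logic off-cluster is a small test with the (now retired) Apache MRUnit library; a sketch, assuming mrunit is on the classpath (it is normally driven from JUnit, but a plain main() works too):

import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;

public class WordCountReduceTest {
    public static void main(String[] args) throws Exception {
        // "hello" was emitted twice by the mappers, so reduce() sees
        // ("hello", [1, 1]) and should write ("hello", 2)
        ReduceDriver.newReduceDriver(new WordCountReduce())
                .withInput(new Text("hello"),
                        Arrays.asList(new LongWritable(1), new LongWritable(1)))
                .withOutput(new Text("hello"), new LongWritable(2))
                .runTest();
    }
}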
3. Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * The driver: effectively the client that submits the job to YARN.
 *
 * @author Administrator
 */
public class WordCountDriver {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // When the job is packaged as a jar and launched on the Linux cluster
        // itself, none of the settings below are needed. To submit to YARN from
        // a remote client, uncomment and adjust them:
        //conf.set("fs.defaultFS", "hdfs://192.168.186.231:9000");
        //conf.set("hadoop.job.user", "liukai");
        // run on YARN instead of the local runner
        //conf.set("mapreduce.framework.name", "yarn");
        // hostname of the ResourceManager
        //conf.set("yarn.resourcemanager.hostname", "192.168.186.231");
        Job job = Job.getInstance(conf);
        // lets Hadoop locate the jar that contains this class
        job.setJarByClass(WordCountDriver.class);
        // the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // the Reducer class
        job.setReducerClass(WordCountReduce.class);
        // output types of the Mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // final output types (what the Reducer writes)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // input location: either passed as an external argument (set under
        // Run Configurations -> Arguments) or hard-coded
        //FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.186.231:9000/wordcount/input"));
        // output location (must not already exist); again, argument or hard-coded
        //FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.186.231:9000/wordcount/output"));
        // submit the job to YARN and wait for it to finish;
        // passing true prints the job's progress and counters
        //job.submit();
        try {
            job.waitForCompletion(true);
        } catch (ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
        }
    }
}
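Because the reducer here only sums, and its input and output key/value types are identical (Text, LongWritable), the same class can double as a combiner to pre-aggregate counts on the map side and shrink the shuffle. A one-line addition to the driver, anywhere before the job is submitted:

        // optional: pre-aggregate counts on each map task before the shuffle;
        // correct here because addition is associative and commutative, and the
        // reducer's input and output types match
        job.setCombinerClass(WordCountReduce.class);

After packaging, the driver is typically launched on the cluster with hadoop jar, in which case the hard-coded HDFS paths above (or args[0]/args[1], if you switch to the commented-out lines) determine where the input is read from and the output written to.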