First use of MapReduce in Hadoop
The Mapper class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * The four type parameters of Mapper:
 * KEYIN:    the type of the key in the input KV pairs
 * VALUEIN:  the type of the value in the input KV pairs
 * KEYOUT:   the type of the key in the output KV pairs
 * VALUEOUT: the type of the value in the output KV pairs
 *
 * These are Hadoop's own writable types, designed for efficient serialization.
 * Created by hadoop on 17-2-18.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /*
     * The map method is called by the MapTask process: MapTask calls map once
     * for every line of text it reads.
     * Parameters:
     *   key:   the byte offset at which the line starts (LongWritable)
     *   value: the content of the line (Text)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the Text content to a String
        String line = value.toString();
        // Split the line into words; \s+ also handles runs of whitespace,
        // which a plain split(" ") would turn into empty "words"
        String[] words = line.split("\\s+");
        // Emit <word, 1> for each word
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
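
To see what one map call actually produces, here is a minimal standalone sketch of the same split-and-emit logic (plain Java, no Hadoop needed; the class name and sample line are made up for illustration):

public class MapLogicDemo {
    public static void main(String[] args) {
        String line = "hello world hello";            // hypothetical input line
        for (String word : line.split("\\s+")) {
            // stands in for context.write(new Text(word), new IntWritable(1))
            System.out.println("<" + word + ", 1>");
        }
        // Output: <hello, 1>  <world, 1>  <hello, 1>
    }
}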
The Reducer class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Created by hadoop on 17-2-18.
 * Reducer type parameters: <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /*
     * How it works: the framework first groups together all KV pairs that
     * share the same key, then calls reduce once per group.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int result = 0;
        for (IntWritable value : values) {
            result += value.get();
        }
        // Emit the final <word, count> pair
        context.write(key, new IntWritable(result));
    }
}
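
The reduce step is just a sum over the grouped values. A minimal standalone sketch of what one reduce call does (the key "hello" and its value list are made-up sample data):

import java.util.Arrays;
import java.util.List;

public class ReduceLogicDemo {
    public static void main(String[] args) {
        // After the shuffle, all values for one key arrive together,
        // e.g. key "hello" with grouped values [1, 1, 1]
        List<Integer> values = Arrays.asList(1, 1, 1);
        int result = 0;
        for (int v : values) {
            result += v;
        }
        // stands in for context.write(key, new IntWritable(result))
        System.out.println("<hello, " + result + ">");  // prints <hello, 3>
    }
}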
The main function (job driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* Created by hadoop on 17-2-18.
*/
public class WordCountJobSubmitter {
    public static void main(String[] args) throws Exception {
        // Get a Job instance
        Configuration conf = new Configuration();
        Job wordCountJob = Job.getInstance(conf);
        // Specify the jar that contains this job
        wordCountJob.setJarByClass(WordCountJobSubmitter.class);
        // Set the Mapper and Reducer classes the job uses
        wordCountJob.setMapperClass(WordCountMapper.class);
        wordCountJob.setReducerClass(WordCountReducer.class);
        // Set the KV types passed between the two phases and the final output types
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(IntWritable.class);
        wordCountJob.setOutputKeyClass(Text.class);
        wordCountJob.setOutputValueClass(IntWritable.class);
        // Set the input and output paths (the output directory must not exist yet,
        // or the job fails at submission)
        FileInputFormat.setInputPaths(wordCountJob, "hdfs://hadoop-virtual-machine:9000/wordcount/input");
        FileOutputFormat.setOutputPath(wordCountJob, new Path("hdfs://hadoop-virtual-machine:9000/wordcount/output"));
        // Submit the job to the Hadoop cluster and wait for it to finish;
        // waitForCompletion returns true on success
        System.exit(wordCountJob.waitForCompletion(true) ? 0 : 1);
    }
}
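
One optional tweak that is not part of the original driver: because the word counts are plain sums (addition is associative and commutative), the same reducer class can also be registered as a combiner, pre-aggregating counts on the map side and shrinking shuffle traffic. The one extra line in main() would be:

        // optional: run WordCountReducer as a map-side combiner
        wordCountJob.setCombinerClass(WordCountReducer.class);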
You can configure and run the job directly in IntelliJ IDEA, but running it this way on Windows reportedly causes problems (I have not tried it).
You can also package it as a runnable jar (see: how to create a runnable JAR), and then run it in the terminal:
hadoop jar myjar.jar
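
Before the run, the input directory has to exist on HDFS and contain some text. A sketch of preparing it and checking the result (the paths match the driver above; the local file name words.txt is made up):

hdfs dfs -mkdir -p /wordcount/input
hdfs dfs -put words.txt /wordcount/input
hadoop jar myjar.jar
hdfs dfs -cat /wordcount/output/part-r-00000

If the jar's manifest does not name a main class, pass it explicitly: hadoop jar myjar.jar WordCountJobSubmitter. Also remember to delete or rename the output directory before re-running, since FileOutputFormat refuses to overwrite an existing path.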