I. Basic environment:
This article assumes you already have some Java experience; that Java, Maven, and an IDE are installed on your machine with the relevant environment variables configured; that a usable Hadoop environment is available; that you have created a Java Maven project in IDEA; and that you have a Linux client machine from which you can run hadoop commands.
If any of the above is missing, look up setup instructions online (e.g. on Baidu) first.
II. Dependencies in pom.xml:
Add the following inside the <dependencies> element:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.3</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>
III. Prepare and upload the file to count
1. Create a file named word_test.txt:
I have searched a thousand years,And I have cried a thousand tears。
I found everything I need,You are everything to me。
2. Upload it to Hadoop
First transfer it to the Linux client with rz, then run the commands below to put it on HDFS:
hadoop fs -mkdir /tmp/mr_test/
hadoop fs -put ./word_test.txt /tmp/mr_test/
IV. The code (official WordCount V1)
package com.yixin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        // Split each input line into whitespace-delimited tokens and emit (word, 1) for each.
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable result = new IntWritable();

        // Sum all the 1s emitted for a given word and write (word, total).
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        // The reducer doubles as a combiner, so partial sums are computed map-side.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
V. Package, upload, run
1. Package the code into a jar with your IDE or with Maven (e.g. mvn clean package).
2. Upload the jar to the Linux client with rz.
3. Run the job:
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount /tmp/mr_test/word_test.txt /tmp/mr_test/output
4. View the result; each output line is a token followed by its count:
hadoop fs -cat /tmp/mr_test/output/*
VI. Official WordCount V2
The main differences from V1: the Mapper gains a setup(Context context) method, and two configuration options (wordcount.case.sensitive and wordcount.skip.patterns) control case handling and which characters to skip. The details are noted in the code comments.
In fact both map and reduce tasks have setup(Context context) and cleanup(Context context) hooks; the former is for initialization work, the latter for cleanup. They run before the first and after the last map or reduce call of a task, so use them as your business logic requires. A small sketch of the hooks follows.
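A minimal sketch of the lifecycle hooks (my own illustration, not part of the V2 code; LifecycleMapper is a made-up name, and the same imports as the V2 code below are assumed):

public static class LifecycleMapper
        extends Mapper<Object, Text, Text, IntWritable> {

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Runs once per task, before the first map() call:
        // read configuration, open connections, load caches, etc.
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Runs once per task, after the last map() call:
        // flush buffers, close connections, release resources.
    }
}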
1. The V2 code
package com.yixin;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class WordCount2 {

    /**
     * Fields and helper methods used by map and reduce can be declared here.
     * Note: each map or reduce task normally works on its own copy, so changes
     * made in one task are not visible to others, unless you use global
     * counters or the distributed cache.
     */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        /**
         * Fields and helper methods used by map can be declared here.
         * The same caveat applies: each map task has its own copy.
         */
        // A global counter: it is shared across map tasks, and its increments
        // are aggregated by the framework.
        static enum CountersEnum { INPUT_WORDS }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        private boolean caseSensitive;
        private Set<String> patternsToSkip = new HashSet<String>();

        private Configuration conf;
        private BufferedReader fis;

        /**
         * setup() can do initialization work before map() runs, such as
         * preparing fields, setting up database connections, or filtering
         * input paths. Here it reads the case-sensitivity switch and loads
         * the skip-pattern rules from the distributed cache.
         */
        @Override
        public void setup(Context context) throws IOException,
                InterruptedException {
            conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
            if (conf.getBoolean("wordcount.skip.patterns", true)) {
                URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
                // The null check covers runs without -skip, where no cache files exist.
                if (patternsURIs != null) {
                    for (URI patternsURI : patternsURIs) {
                        Path patternsPath = new Path(patternsURI.getPath());
                        String patternsFileName = patternsPath.getName().toString();
                        parseSkipFile(patternsFileName);
                    }
                }
            }
        }

        // Read the cached pattern file line by line; each line is one regex.
        private void parseSkipFile(String fileName) {
            try {
                fis = new BufferedReader(new FileReader(fileName));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file: "
                        + StringUtils.stringifyException(ioe));
            }
        }

        @Override
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            // Lower-case the line unless case sensitivity is requested, then
            // strip every skip pattern before tokenizing.
            String line = (caseSensitive) ?
                    value.toString() : value.toString().toLowerCase();
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        // Valid invocations have 2 args (<in> <out>) or 4 (<in> <out> -skip <file>).
        if (remainingArgs.length != 2 && remainingArgs.length != 4) {
            System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                job.addCacheFile(new Path(remainingArgs[++i]).toUri());
                job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
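A side note on the global counter: after the job finishes, the driver can read back the aggregated value. A hypothetical variation of the end of main() (my own sketch, not part of the official example):

boolean ok = job.waitForCompletion(true);
// INPUT_WORDS was incremented once per emitted word across all map tasks.
long words = job.getCounters()
        .findCounter(TokenizerMapper.CountersEnum.class.getName(),
                TokenizerMapper.CountersEnum.INPUT_WORDS.toString())
        .getValue();
System.out.println("INPUT_WORDS = " + words);
System.exit(ok ? 0 : 1);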
2. Run method A, with the same arguments as V1
Packaging and uploading work exactly as for V1, so I won't repeat them. Run:
hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output
Result:
hadoop fs -cat /tmp/mr_test/output/*
3. Create patterns.txt (the skip rules) and upload it to the cluster.
Each line of this file is a Java regular expression that the mapper strips from the input, which is why the punctuation below is escaped; see the illustration after the upload step. Create, edit, and save it:
vim patterns.txt
\.
\,
\!
to
\,
\。
Upload it:
hadoop fs -put patterns.txt /tmp/mr_test/
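A standalone illustration of why the escaping matters (my own example, not part of the job): each pattern line is handed to String.replaceAll as a regex, and an unescaped dot would match every character.

public class PatternDemo {
    public static void main(String[] args) {
        String s = "end. of. line.";
        // "\\." in Java source is the two-character regex \. , exactly what a
        // patterns.txt line like "\." contains: it matches a literal dot.
        System.out.println(s.replaceAll("\\.", "")); // "end of line": dots removed
        System.out.println(s.replaceAll(".", ""));   // "": "." matches any character
    }
}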
Run method B: pass two extra arguments to skip the characters that should not be counted
hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output -skip /tmp/mr_test/patterns.txt
Result:
hadoop fs -cat /tmp/mr_test/output/*
The listed characters have been filtered out of the counts.
VII. ToolRunner, a utility that simplifies MR configuration:
You can use this utility to handle the generic option parsing up front, which trims the boilerplate on the caller's side: the main method shrinks to about two lines. I won't work through a full example here; I recommend this article:
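Still, a minimal sketch of the idea (my own illustration, not from the recommended article; the class name WordCountTool is made up, and it reuses the V1 Mapper/Reducer):

package com.yixin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {

    // By the time run() is called, ToolRunner has already parsed the generic
    // options (-D, -files, -libjars, ...) into the Configuration behind getConf().
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "word count");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(WordCount.IntSumReducer.class);
        job.setReducerClass(WordCount.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // The "about two lines" mentioned above: delegate everything to ToolRunner.
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}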