写了一个简单的MapReduce程序,用于统计输入文本中每个汉字出现的次数,主要目的是记录一下正则表达式的使用
注意事项:
- 需要传入两个参数,参数1为输入文件路径,参数2为输出路径(注意:如果输出路径已存在,程序会先将其递归删除)
- 为方便测试MapReduce,避免经常修改代码并打包上传,可以搭建Windows版本的Hadoop环境
- 需要创建Maven工程
- 需要注意输入文件的编码格式,建议利用记事本更改为UTF-8,避免乱码
代码如下
package com.oracle.mrexample.b.wc;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
/**
 * MapReduce job that counts how often each Chinese character occurs in the input.
 *
 * <p>Usage: {@code ChWCApp <inputPath> <outputPath>}. If the output path already
 * exists it is deleted recursively before the job is submitted.
 */
public class ChWCApp {

    /**
     * Emits {@code (character, 1)} for every character of the input line that lies
     * in the range U+4E00 to U+9FA5; every other character is discarded.
     */
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Matches every character outside U+4E00 to U+9FA5; compiled once, not per line. */
        private static final Pattern NON_CHINESE = Pattern.compile("[^\\u4e00-\\u9fa5]");

        // Writable instances are reused across map() calls to avoid per-record allocation.
        private final Text outKey = new Text();
        private final IntWritable outValue = new IntWritable(1);

        /**
         * Strips all non-Chinese characters from the line and writes one record per
         * remaining character.
         *
         * @param key     input key supplied by the framework (not used here)
         * @param value   one line of input text
         * @param context sink for the {@code (character, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Keep only the characters in the Chinese range.
            String chineseOnly = NON_CHINESE.matcher(value.toString()).replaceAll("");

            // Walk the string by index instead of split(""): splitting an empty string
            // yields a single empty token, which would be shuffled to the reducer only
            // to be thrown away there. Every surviving character is a single UTF-16
            // unit (the range contains no surrogates), so one-char substrings are safe.
            for (int i = 0; i < chineseOnly.length(); i++) {
                outKey.set(chineseOnly.substring(i, i + 1));
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Sums the counts received for a character and writes {@code (character, total)}.
     * Input and output types are identical, so the class is also used as the combiner.
     */
    public static class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Kept as a field so that a new object is not created on every reduce() call.
        private final IntWritable outValue = new IntWritable();

        /**
         * Adds up all values for {@code key} and emits the total.
         *
         * @param key     the character being counted
         * @param values  partial counts for that character
         * @param context sink for the {@code (character, total)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Defensive guard: never output a count for an empty key.
            if (key.toString().equals("")) {
                return;
            }
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            outValue.set(count);
            context.write(key, outValue);
        }
    }

    /**
     * Configures and runs the job, then exits the JVM with a status code.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure();
        if (args == null || args.length < 2) {
            // Usage errors go to stderr with a non-zero status so that calling
            // scripts do not mistake a job that never ran for a successful one.
            System.err.println("参数个数不正确,必须输入两个路径参数!");
            System.exit(2);
            return;
        }
        Configuration conf = new Configuration();
        // Obtain the job object.
        Job job = Job.getInstance(conf);

        // 4 output types: map output key/value and final output key/value.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 3 stage classes: jar, mapper, reducer.
        job.setJarByClass(ChWCApp.class);
        job.setMapperClass(WCMapper.class);
        // The reducer only sums, so it can pre-aggregate map output as a combiner.
        job.setCombinerClass(WCReducer.class);
        job.setReducerClass(WCReducer.class);

        // 2 locations: where the data is read from and where results are written.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        // Resolve the filesystem from the path itself so that an output URI whose
        // scheme differs from the configured default filesystem is handled correctly.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) { // avoid the failure caused by a pre-existing output path
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 1 submission: block until the job finishes.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1); // 0 when the job succeeded, 1 when it failed
    }
}
pom.xml依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.oracle</groupId>
<artifactId>mrexample</artifactId>
<version>1.0-SNAPSHOT</version>
<name>mrexample</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.