1. Preface
This post walks through some common, classic MapReduce examples.
2. Deduplication
Idea: the shuffle phase of MapReduce groups identical keys together, so emitting each input line as a key is enough to deduplicate.
Core: Map -> context.write(new Text(line), new Text("")); Reduce -> context.write(key, new Text(""))
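As a quick illustration (this sample file is made up for this post, not taken from a real dataset), an input like

2012-3-1 a
2012-3-2 b
2012-3-1 a
2012-3-2 b
2012-3-3 c

comes out of the job as

2012-3-1 a
2012-3-2 b
2012-3-3 c

because every identical line becomes the same key and the reducer writes each key exactly once.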
package hadoop.v5;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import hadoop.utils.HDFSUtils;
/**
 * Deduplication: emit each input line as the key so the shuffle merges duplicates away.
 * @author : chenhaipeng
 * @date : 2015-09-06 02:00:50
 */
public class Duplication extends Configured implements Tool {
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
// the whole line is the key; duplicates collapse in the shuffle
context.write(new Text(line), new Text(""));
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text>{
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
// emit each distinct key exactly once; the value does not matter
context.write(key, new Text(""));
}
}
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new Duplication(), args);
System.exit(ret);
}
// delete the output directory if it already exists so the job can be rerun
public static void deletedir(String path){
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
/*
* @see org.apache.hadoop.util.Tool#run(java.lang.String[])
*/
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
job.setJarByClass(Duplication.class);
job.setJobName("Duplication");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// not strictly required; TextInputFormat/TextOutputFormat are the defaults anyway
// job.setInputFormatClass(TextInputFormat.class);
// job.setOutputFormatClass(TextOutputFormat.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// the reducer is idempotent, so it also works as a combiner
job.setCombinerClass(Reduce.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
deletedir(args[1]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
}
3. Sorting
Idea: Hadoop sorts map output automatically during the shuffle; to control the partitioning and the sort order we override the partitioner and the comparator.
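The job below consumes the output of an ordinary word count, one "word count" pair per line. As a made-up example, an input of

hadoop 1
hello 3
world 2

should come out roughly as

hello 3
world 2
hadoop 1

i.e. sorted by count from highest to lowest, spread across the two reducer output files configured below.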
package hadoop.v5;
import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.EJob;
import hadoop.utils.HDFSUtils;
/**
 * Scenario: sort the output of the previous word count from the highest count to the lowest.
 * Approach: Hadoop sorts map output during the shuffle, so we plug in our own comparator and partitioner.
 * @author : chenhaipeng
 * @date : 2015-09-08 23:58:32
 */
public class SortWordCount {
public static class SortMap extends Mapper<LongWritable, Text, IntWritable, Text> {
private Text word = new Text();
private IntWritable count = new IntWritable();
/*
 * Swap key and value so the count becomes the map output key (and is therefore what gets sorted).
 *
 * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
 * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
if (StringUtils.isNotEmpty(line)) {
	// each input line holds a word and its count
	StringTokenizer tokenizer = new StringTokenizer(line);
	while (tokenizer.hasMoreTokens()) {
		word.set(tokenizer.nextToken().trim());
		count.set(Integer.parseInt(tokenizer.nextToken().trim()));
		context.write(count, word);
}
}
}
}
public static class SortReduce extends Reducer<IntWritable, Text, Text, IntWritable> {
private Text result = new Text();
/*
 * The shuffle has already sorted the keys; here we just swap each pair back to (word, count).
 *
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object,
 * java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
result.set(value.toString());
context.write(result, key);
}
}
}
/**
 * The partitioner splits the keys across the reducers; when the split is uneven you may
 * see one reducer produce an empty output file.
 * @author : chenhaipeng
 * @date : 2015-09-09 00:38:54
 */
public static class SortPartition extends Partitioner<IntWritable, Text>{
/*
* @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
*/
@Override
public int getPartition(IntWritable key, Text value, int numPartitions) {
// treat 2 as the boundary of the key space
int maxValue = 2;
int keySection = 0;
// only split when there is more than one reducer and the key falls below maxValue; otherwise everything goes to partition 0
if(numPartitions > 1 && key.hashCode() < maxValue){
int sectionValue = maxValue / (numPartitions -1);
int count = 0;
while(key.hashCode() - sectionValue * count > sectionValue){
count++;
}
keySection = numPartitions -1 - count;
}
return keySection;
}
}
public static class SortComparator extends WritableComparator{
/**
 * Register IntWritable as the key type and have the comparator deserialize keys before comparing.
 */
protected SortComparator() {
	super(IntWritable.class, true);
}
/* Descending order: negate the default ascending comparison.
 * @see org.apache.hadoop.io.WritableComparator#compare(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.WritableComparable)
 */
@Override
public int compare(WritableComparable a, WritableComparable b) {
return -super.compare(a, b);
}
}
public static void deletedir(String path){
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// Package the compiled classes into a temporary jar so the job can be submitted straight from the IDE
File jarFile = EJob.createTempJar("bin");
// EJob.addClasspath("D:/hadoop-1.2.1/conf/conf");
ClassLoader classLoader = EJob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: SortWordCount <in> <out>");
System.exit(2);
}
// point the job at the cluster's JobTracker
conf.set("mapred.job.tracker", "192.168.100.150:9001");
Job job = new Job(conf);
job.setJobName("SortWordCount");
job.setJarByClass(SortWordCount.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(SortMap.class);
job.setReducerClass(SortReduce.class);
job.setPartitionerClass(SortPartition.class);
job.setSortComparatorClass(SortComparator.class);
// two reducers, so SortPartition actually has something to split
job.setNumReduceTasks(2);
// tell the JobClient which jar to ship to the cluster
((JobConf) job.getConfiguration()).setJar(jarFile.toString());
FileInputFormat.setInputPaths(job, new Path(args[0]));
deletedir(args[1]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)? 0 : 1);
}
}
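To see what SortPartition actually does with the two reducers configured above, here is a worked trace of the code (not something spelled out in the original post): with numPartitions = 2 and maxValue = 2, a key of 1 enters the branch because 1 < 2; sectionValue = 2 / (2 - 1) = 2, the while loop never runs because 1 - 2*0 > 2 is false, and keySection = 2 - 1 - 0 = 1. Any key of 2 or more skips the branch and returns 0. So words that appear only once land in part-r-00001 and everything else lands in part-r-00000, which is also why one reducer's output file can come out empty on small inputs.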
4. Single-Table Self-Join
Idea: a self-join of a table with itself. The shuffle groups records that share a key, so emit each record twice: once keyed by parent (tagged as the left table) and once keyed by child (tagged as the right table).
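For example, tracing the mapper below on two of the sample records given at the end of this section, each row is emitted twice:

Tom Lucy    ->  (Lucy, "1+Tom+Lucy")    and  (Tom, "2+Tom+Lucy")
Lucy Marry  ->  (Marry, "1+Lucy+Marry") and  (Lucy, "2+Lucy+Marry")

The reducer for key Lucy then receives both "1+Tom+Lucy" (left table: Tom is Lucy's child) and "2+Lucy+Marry" (right table: Marry is Lucy's parent), and their cross product yields the grandchild-grandparent pair Tom Marry.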
package hadoop.v5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.HDFSUtils;
/**
 * Self-join of a table with itself. The shuffle merges records that share a key: the left table
 * is keyed by parent and the right table by child, so for each person the reducer can take the
 * Cartesian product of that person's children and parents to get grandchild-grandparent pairs.
 *
 * @author : chenhaipeng
 * @date : 2015-09-09 01:48:20
 */
public class SelfJoin {
public static int time = 0;
/*
 * Map splits each record into child and parent and emits it twice:
 * keyed by parent with tag "1" (left table) and keyed by child with tag "2" (right table).
 */
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
/*
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
* java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
if (StringUtils.isNotEmpty(line)) {
String childName = new String();
String parentName = new String();
String relationType = new String();
int i = 0;
// find the tab that separates child from parent
while (line.charAt(i) != '\t') {
	i++;
}
String[] values = { line.substring(0, i), line.substring(i + 1) };
// skip the header row ("child parent")
if (values[0].compareTo("child") != 0) {
childName = values[0].trim();
parentName = values[1].trim();
// tag "1": left table, keyed by parent
relationType = "1";
context.write(new Text(parentName), new Text(relationType + "+" + childName + "+" + parentName));
// tag "2": right table, keyed by child, so both sides of the join meet in the shuffle
relationType = "2";
context.write(new Text(childName), new Text(relationType + "+" + childName + "+" + parentName));
}
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text> {
/*
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object,
* java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// write the output header exactly once (assumes a single reducer JVM)
if (time == 0) {
context.write(new Text("grandChild"), new Text("grandParent"));
time++;
}
int grandChildNum = 0;
String[] grandChild = new String[10];
int grandParentNum = 0;
String[] grandParent = new String[10];
Iterator iter = values.iterator();
while (iter.hasNext()) {
String record = iter.next().toString();
int len = record.length();
int i = 2;
if (len == 0)
continue;
char relationType = record.charAt(0);
String childname = new String();
String parnetname = new String();
while (record.charAt(i) != '+') {
i++;
}
childname = record.substring(2, i);
parnetname = record.substring(i + 1);
// System.out.println("childname-->"+childname);
// System.out.println("parnetname-->"+parnetname);
// tag '1': left-table record, its child field is a grandchild candidate
if (relationType == '1') {
grandChild[grandChildNum] = childname;
grandChildNum++;
} else {
	// tag '2': right-table record, its parent field is a grandparent candidate
	grandParent[grandParentNum] = parnetname;
grandParentNum++;
}
}
// Cartesian product of the grandchildren and grandparents collected for this key
if (grandParentNum != 0 && grandChildNum != 0) {
for (int m = 0; m < grandChildNum; m++) {
for (int n = 0; n < grandParentNum; n++) {
context.write(new Text(grandChild[m].trim()),new Text(grandParent[n].trim()));
}
}
}
}
}
public static void deletedir(String path) {
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: SelfJon <in> <out>");
System.exit(2);
}
Job job = new Job(conf);
job.setJarByClass(SelfJoin.class);
job.setJobName("SelfJoin");
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
deletedir(otherArgs[1]);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sample input:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Lucy Marry
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
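Worked out by hand from this input (the original post does not list the result), the job should produce, after the header row written by the reducer and roughly in key order:
grandChild grandParent
Tom Alice
Tom Jesse
Tom Marry
Tom Ben
Jone Marry
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse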
5. Multi-Table Join
Idea: similar to the single-table self-join, except there are two different tables, so the mapper must decide up front which record belongs to the left table and which to the right.
Core:
// advance i to the first digit, i.e. the start of the addressID
while(line.charAt(i) > '9' || line.charAt(i) < '0'){
	i++;
}
// a line that does not start with a digit comes from the left table (factoryname ... addressID)
if(line.charAt(0) > '9' || line.charAt(0) < '0'){
	int j = i-1;
	while(line.charAt(j) != ' ')j--;
	String[] values = {line.substring(0,j),line.substring(i)};
	context.write(new Text(values[1].trim()), new Text("1+"+values[0]));
}else{
	// a line that starts with a digit comes from the right table (addressID addressname)
	int j = i+1;
	while(line.charAt(j) != ' ')j++;
	String[] values = {line.substring(0,i+1),line.substring(j)};
	context.write(new Text(values[0].trim()), new Text("2+"+values[1]));
}
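Tracing this on the two sample records shown at the end of the section: for "Beijing Red Star 1", i stops at the trailing digit and the first character is a letter, so it is treated as a left-table (factory) record and the mapper emits key "1" with value "1+Beijing Red Star"; for "1 Beijing", the first character is already a digit, so it is treated as a right-table (address) record and the mapper emits key "1" with value "2+ Beijing". The reducer for key "1" then pairs the factory with the address.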
Full code:
package hadoop.v5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.HDFSUtils;
/**
 * Multi-table join; works like the self-join, with the factory table as the left side
 * and the address table as the right side.
 * @author : chenhaipeng
 * @date : 2015-09-10 00:33:26
 */
public class MTJoin {
public static int time = 0;
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
/*
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String line = value.toString();
int i = 0;
// skip the header rows of both tables
if(line.contains("factoryname") || line.contains("addressID")){
	return;
}
// advance i to the first digit, i.e. the start of the addressID
while(line.charAt(i) > '9' || line.charAt(i) < '0'){
i++;
}
// a line that does not start with a digit comes from the left table (factoryname ... addressID)
if(line.charAt(0) > '9' || line.charAt(0) < '0'){
int j = i-1;
while(line.charAt(j) != ' ')j--;
String[] values = {line.substring(0,j),line.substring(i)};
context.write(new Text(values[1].trim()), new Text("1+"+values[0]));
}else{
	// a line that starts with a digit comes from the right table (addressID addressname)
	int j = i+1;
while(line.charAt(j) != ' ')j++;
String[] values = {line.substring(0,i+1),line.substring(j)};
context.write(new Text(values[0].trim()), new Text("2+"+values[1]));
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text>{
/*
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(Text text, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// write the output header exactly once (assumes a single reducer)
if(time == 0){
context.write(new Text("factoryname"), new Text("addressname"));
time++;
}
int factorynum = 0;
String factory[] = new String[10];
int addressnum = 0;
String address[] = new String[10];
Iterator iter = values.iterator();
while(iter.hasNext()){
String record = iter.next().toString();
char type = record.charAt(0);
if(type == '1' ){ // left table: the payload is a factory name
factory[factorynum] = record.substring(2);
factorynum++;
}else{ // right table: the payload is an address name
address[addressnum] = record.substring(2);
addressnum++;
}
}
// Cartesian product of the factories and addresses that share this addressID
if(factorynum != 0 && addressnum!= 0){
for(int m = 0; m < factorynum; m++){
for(int n = 0; n < addressnum; n++){
context.write(new Text(factory[m]), new Text(address[n]));
}
}
}
}
}
public static void deletedir(String path) {
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MTJoin <in> <out>");
System.exit(2);
}
Job job = new Job(conf);
job.setJarByClass(MTJoin.class);
job.setJobName("MTJoin");
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
deletedir(otherArgs[1]);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sample input (two tables):
factoryname addressID
Beijing Red Star 1
addressID addressname
1 Beijing
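Worked out by hand (the post does not show it), the expected output for this tiny input is:
factoryname addressname
Beijing Red Star Beijing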