I. Create a Maven project and declare the JAR dependencies in pom.xml
pom.xml
<dependencies>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.7</version>
    </dependency>
</dependencies>
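Two notes on these dependencies: hadoop-client 2.7.7 already pulls in hadoop-common and hadoop-hdfs transitively, so the two explicit entries are redundant, though harmless while the versions match. Also, the log4j.properties below is a Log4j 1.x configuration (org.apache.log4j.*); it is read via the Log4j 1.2 that Hadoop itself ships transitively, not by the log4j-core 2.x artifact declared here.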
log4j.properties
log4j.rootLogger=INFO, stdout, logFile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logFile=org.apache.log4j.FileAppender
log4j.appender.logFile.File=target/spring.log
log4j.appender.logFile.layout=org.apache.log4j.PatternLayout
log4j.appender.logFile.layout.ConversionPattern=%d %p [%c] - %m%n
II. Usage steps
1. WordCount example
WordMapper
package com.hdfs.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text k = new Text();
    private final IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each input line on spaces and emit (word, 1) for every word
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
WordReducer
package com.hdfs.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the counts emitted for this word by the mappers (and the combiner)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
WordDriver
package com.hdfs.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for a quick run from the IDE
        String[] args1 = new String[2];
        args1[0] = "d:/hello.txt";
        args1[1] = "d:/output10";
        Configuration conf = new Configuration();
        // Enable compression of the map-side output
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(WordDriver.class);
        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);
        // 4. Declare the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the Combiner; the Reducer can be reused directly when its logic is the same
        job.setCombinerClass(WordReducer.class);
        // 7. Compress the reduce-side output
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        // 8. Set the input and output paths (the output directory must not exist yet)
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 9. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
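For reference, with a made-up d:/hello.txt such as

hello world
hello hadoop

the job should produce bzip2-compressed part files under d:/output10 whose decompressed content is the tab-separated counts:

hadoop	1
hello	2
world	1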
2. Serialization and custom partitioner example
FlowBean
package com.hdfs.mr;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FlowBean implements Writable {
    private long upload;
    private long download;
    private long sum;

    // A no-arg constructor is required so Hadoop can instantiate the bean via reflection
    public FlowBean() {
        super();
    }

    public FlowBean(long upload, long download) {
        super();
        this.upload = upload;
        this.download = download;
        this.sum = upload + download;
    }

    @Override
    public String toString() {
        return upload + "\t" + download + "\t" + sum;
    }

    // Serialization: the field order here must match readFields() exactly
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upload);
        dataOutput.writeLong(download);
        dataOutput.writeLong(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.upload = dataInput.readLong();
        this.download = dataInput.readLong();
        this.sum = dataInput.readLong();
    }

    public long getUpload() {
        return upload;
    }

    public void setUpload(long upload) {
        this.upload = upload;
    }

    public long getDownload() {
        return download;
    }

    public void setDownload(long download) {
        this.download = download;
    }

    public long getSum() {
        return sum;
    }

    public void setSum(long a, long b) {
        this.sum = a + b;
    }
}
FlowMapper
package com.hdfs.mr;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    Text k = new Text();
    FlowBean flowBean = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Expected input columns (tab-separated): phone number, upload, download
        String line = value.toString();
        String[] fields = line.split("\t");
        flowBean.setUpload(Long.parseLong(fields[1]));
        flowBean.setDownload(Long.parseLong(fields[2]));
        flowBean.setSum(Long.parseLong(fields[1]), Long.parseLong(fields[2]));
        k.set(fields[0]);
        context.write(k, flowBean);
    }
}
FlowReducer
package com.hdfs.mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    FlowBean flowBean = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        // Accumulate upload/download totals across all records for this phone number
        long sumUp = 0;
        long sumDown = 0;
        for (FlowBean ff : values) {
            sumUp += ff.getUpload();
            sumDown += ff.getDownload();
        }
        flowBean.setUpload(sumUp);
        flowBean.setDownload(sumDown);
        flowBean.setSum(sumUp, sumDown);
        context.write(key, flowBean);
    }
}
FlowDriver
package com.hdfs.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for a quick run from the IDE
        String[] args1 = new String[2];
        args1[0] = "d:/phone.txt";
        args1[1] = "d:/output4";
        Configuration conf = new Configuration();
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(FlowDriver.class);
        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        // 4. Declare the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // 6. Optional: custom partitioner (leave unset to use the default hash partitioner)
        // job.setPartitionerClass(CustomPartitioner.class);
        // job.setNumReduceTasks(2);
        // 7. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 8. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
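FlowMapper splits each line on tabs and reads the phone number, upload and download from the first three columns, so d:/phone.txt is expected to look like the following (made-up numbers, tab-separated):

13800000001	100	200
13912345678	300	400
13800000001	50	60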
CustomPartitioner
package com.hdfs.shuffle;
import com.hdfs.mr.FlowBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class CustomPartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        // First three digits of the phone number
        String phoneNo = text.toString().substring(0, 3);
        // Partition 0 for the 138 prefix, partition 1 for everything else
        int partition = 1;
        if ("138".equals(phoneNo)) {
            partition = 0;
        }
        return partition;
    }
}
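To activate this partitioner, uncomment job.setPartitionerClass(CustomPartitioner.class) and job.setNumReduceTasks(2) in FlowDriver above. The reduce-task count must cover every partition number the partitioner can return (here 0 and 1); with fewer reducers the job fails with an "Illegal partition" error, except that a single reducer bypasses the partitioner entirely.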
3. Map Join example
1 MapJoinMapper
package com.hdfs.mapjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    HashMap<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        FileSystem fs = null;
        try {
            fs = FileSystem.get(new URI("hdfs://s201:9000"), conf, "root");
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
        // Cache the small table (product id -> product name) in memory
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        System.out.println("path:=============" + path);
        FSDataInputStream fin = fs.open(new Path(path));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
        // If the cached file were on the local file system, a FileInputStream would do instead:
        // BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        String line;
        // isNotEmpty also returns false for the null that readLine() yields at EOF
        while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
            String[] fields = line.split(" ");
            pdMap.put(fields[0], fields[1]);
        }
        IOUtils.closeStream(fin);
        IOUtils.closeStream(bufferedReader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        // Look up the product name by product id (the join key)
        String pid = fields[1];
        String pname = pdMap.get(pid);
        System.out.println(pid + "pid==================pname" + pname);
        String line = fields[0] + "\t" + pname + "\t" + fields[2];
        k.set(line);
        context.write(k, NullWritable.get());
    }
}
2 MapJoinDriver
package com.hdfs.mapjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        String[] args1 = new String[2];
        // With fs.defaultFS set below, these are HDFS paths, not Linux file-system paths
        args1[0] = "/root/order.txt";
        args1[1] = "/output9";
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://s201:9000");
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(MapJoinDriver.class);
        // 3. Wire up the Mapper (a map-side join needs no Reducer)
        job.setMapperClass(MapJoinMapper.class);
        // job.setReducerClass(OrderCompareReducer.class);
        // 4. Mapper output types default to the final output types in a map-only job
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(NullWritable.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Optional: custom partitioner (needed for sorting within partitions;
        //    for a single global sort, do not set one)
        // job.setPartitionerClass(CustomTowPartitioner.class);
        // job.setNumReduceTasks(2);
        // 7. Optional: grouping comparator class
        // job.setGroupingComparatorClass(OrderGroupingComparator.class);
        // 8. Distribute the small table as a cache file
        job.addCacheFile(new URI("hdfs://s201:9000/pd.txt"));
        // The commented variant below would be a Linux file-system path:
        // job.addCacheFile(new Path("/root/pd.txt").toUri());
        // Zero reduce tasks makes this a map-only job
        job.setNumReduceTasks(0);
        // 9. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 10. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3 Data files
order.txt
1001 1 1
1002 2 2
1003 3 3
1004 1 4
1005 2 5
1006 3 6
pd.txt
1 小米
2 华为
3 格力
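Since each order line is matched against the in-memory pdMap, the map-only job should write one joined line per order, in the form order id, product name, amount:

1001	小米	1
1002	华为	2
1003	格力	3
1004	小米	4
1005	华为	5
1006	格力	6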
4. Grouping comparator (secondary sort)
OrderGroupingComparator
package com.hdfs.shuffle;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class OrderGroupingComparator extends WritableComparator {
    public OrderGroupingComparator() {
        // Register the key class; true tells the parent to create instances for comparison
        super(OrderCompareBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group by order_id only, so all records of one order reach the
        // same reduce() call regardless of the rest of the key
        OrderCompareBean aBean = (OrderCompareBean) a;
        OrderCompareBean bBean = (OrderCompareBean) b;
        int result;
        if (aBean.getOrder_id() > bBean.getOrder_id()) {
            result = 1;
        } else if (aBean.getOrder_id() < bBean.getOrder_id()) {
            result = -1;
        } else {
            result = 0;
        }
        return result;
    }
}
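OrderCompareBean itself is not listed in this section. For orientation, here is a minimal sketch of what such a key could look like, assuming it carries an order id and a price and sorts by order id ascending, then price descending (the usual secondary-sort layout). Everything beyond getOrder_id(), which the comparator above relies on, is an assumption:

package com.hdfs.shuffle;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical sketch: only getOrder_id() is confirmed by the comparator above;
// the price field and the chosen sort order are assumptions for illustration.
public class OrderCompareBean implements WritableComparable<OrderCompareBean> {
    private long order_id;
    private double price;

    // No-arg constructor required for reflection-based deserialization
    public OrderCompareBean() {
    }

    public OrderCompareBean(long order_id, double price) {
        this.order_id = order_id;
        this.price = price;
    }

    public long getOrder_id() {
        return order_id;
    }

    public double getPrice() {
        return price;
    }

    // Primary sort: order_id ascending; secondary sort: price descending
    @Override
    public int compareTo(OrderCompareBean o) {
        int cmp = Long.compare(this.order_id, o.order_id);
        if (cmp == 0) {
            cmp = -Double.compare(this.price, o.price);
        }
        return cmp;
    }

    // Field order in write() must match readFields() exactly
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(order_id);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readLong();
        this.price = in.readDouble();
    }

    @Override
    public String toString() {
        return order_id + "\t" + price;
    }
}

With a key like this, the grouping comparator makes all records of one order arrive in a single reduce() call, and under the assumed sort the first value of that call would be the order's highest-priced record.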