第 1 章 MapReduce 概述
1.1 MapReduce 定义
MapReduce 是一个分布式运算程序的编程框架,是用户开发“基于 Hadoop 的数据分析
应用”的核心框架。
MapReduce 核心功能是将用户编写的业务逻辑代码和自带默认组件整合成一个完整的
分布式运算程序,并发运行在一个 Hadoop 集群上。
1.2 MapReduce 编程规范
用户编写的程序分成三个部分:Mapper、Reducer 和 Driver。
1.Mapper阶段
(1)用户自定义的Mapper要继承Hadoop提供的Mapper父类
(2)Mapper的输入数据是KV对的形式(KV的类型可自定义)
(3)Mapper中的业务逻辑写在map()方法中
(4)Mapper的输出数据是KV对的形式(KV的类型可自定义)
(5)map()方法(MapTask进程)对每一个<K,V>调用一次
2.Reducer阶段
(1)用户自定义的Reducer要继承Hadoop提供的Reducer父类
(2)Reducer的输入数据类型对应Mapper的输出数据类型,也是KV
(3)Reducer的业务逻辑写在reduce()方法中
(4)ReduceTask进程对每一组相同k的<k,v>组调用一次reduce()方法
3.Driver阶段
相当于YARN集群的客户端,用于把整个程序提交到YARN集群运行,提交的是封装了MapReduce程序相关运行参数的Job对象
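下面给出一个由 Mapper、Reducer、Driver 组成的极简骨架,仅用于说明三部分之间的关系(类名、泛型参数均为假设,完整可运行示例见 1.3 节):
// Mapper:四个泛型依次为输入 key、输入 value、输出 key、输出 value 的类型
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 业务逻辑:解析 value,按需调用 context.write(outK, outV)
    }
}
// Reducer:输入 KV 类型对应 Mapper 的输出 KV 类型
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // 对同一 key 的一组 value 做聚合后写出
    }
}
// Driver:组装并提交 Job(写在 main 方法中)
Job job = Job.getInstance(new Configuration());
job.setJarByClass(MyDriver.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);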
1.3 MapReduce案例实操
1)环境准备
(1)创建 maven 工程,MapReduceDemo
(2)在 pom.xml 文件中添加如下依赖
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.30</version>
</dependency>
</dependencies>
(3)在项目的 src/main/resources 目录下,新建一个文件,命名为“log4j.properties”,在文件中填入以下内容。
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
(4)创建包名:com.atxsz.mapreduce.wordcount(与后面代码中的包名保持一致)
4)编写WordCount 程序
(1)编写 Driver 驱动类
package com.atxsz.mapreduce.wordcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDriver {
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
// 1 获取配置信息以及获取 job 对象
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2 关联本 Driver 程序的 jar(设置jar包路径)
job.setJarByClass(WordCountDriver.class);
// 3 关联 Mapper 和 Reducer 的 jar
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
// 4 设置 Mapper 输出的 kv 类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 5 设置最终输出 kv 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6 设置输入和输出路径
FileInputFormat.setInputPaths(job, new Path("D:\\xx.txt"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\hadoop\\test1"));
// 7 提交 job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
(2)编写 Mapper 类
package com.atxsz.mapreduce.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text,
IntWritable>{
Text outk = new Text();
IntWritable outv = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 1 获取一行
String line = value.toString();
// 2 切割
String[] words = line.split(" ");
// 3 循环写出
for (String word : words) {
//封装outk
outk.set(word);
//写出
context.write(outk, outv);
}
}
}
(3)编写 Reducer 类
package com.atxsz.mapreduce.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text,
IntWritable>{
int sum;
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context
context) throws IOException, InterruptedException {
// 1 累加求和
sum = 0;
for (IntWritable count : values) {
sum += count.get();
}
// 2 输出
v.set(sum);
context.write(key,v);
}
}
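为了直观理解 WordCount 的执行结果,下面给出一份假设的输入文件内容和对应的输出(数据仅为示意):
xx.txt(输入文件):
hello world
hello hadoop
输出目录下 part-r-00000 的内容:
hadoop	1
hello	2
world	1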
5) Partition 分区案例
编写ProvincePartitioner
package com.atxsz.mapereduce.pattitioner2;
//分区
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
@Override
public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
//text是手机号
String phone = text.toString();
String prePhone = phone.substring(0,3);
//手机号前三位
int partition ;//初始化分区
if ("136".equals(prePhone)){
partition = 0 ;
}else if ("137".equals(prePhone)){
partition = 1 ;
}else if ("138".equals(prePhone)){
partition = 2 ;
}else if ("139".equals(prePhone)){
partition = 3 ;
}else {
partition = 4 ;
}
return partition;
}
}
编写FlowReducer
package com.atxsz.mapereduce.pattitioner2;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text , FlowBean,Text, FlowBean> {
private FlowBean outv = new FlowBean();
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
long totalUp = 0 ;
long totaldown = 0 ;
for (FlowBean value : values) {
totalUp += value.getUpFlow();
totaldown += value.getDownFlow();
}
outv.setUpFlow(totalUp);
outv.setDownFlow(totaldown);
outv.setSumFlow();
context.write(key,outv);
}
}
编写 FlowMapper
package com.atxsz.mapereduce.pattitioner2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text,Text, FlowBean> {
private Text outk = new Text() ;
private FlowBean outv = new FlowBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[]split =line.split("\t");
String phone =split[1];
String up = split[split.length - 3];
String down =split[split.length -2];
outk.set(phone);
outv.setUpFlow(Long.parseLong(up));
outv.setDownFlow(Long.parseLong(down));
outv.setSumFlow();
context.write(outk,outv);
}
}
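FlowMapper 中各个下标的含义,可以用一条假设的输入行来说明(字段值仅为示意,实际以 inputflow 数据文件为准):
1	13736230513	192.196.100.1	www.atguigu.com	2481	24681	200
// split[1]                = 13736230513(手机号)
// split[split.length - 3] = 2481(上行流量)
// split[split.length - 2] = 24681(下行流量)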
编写FlowDriver
package com.atxsz.mapereduce.pattitioner2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
//1 获取 job 对象
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2 关联本 Driver 类
job.setJarByClass(FlowDriver.class);
//3 关联 Mapper 和 Reducer
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//4 设置 Map 端输出 KV 类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
//5 设置程序最终输出的 KV 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
job.setPartitionerClass(ProvincePartitioner.class);//指定自定义分区器
job.setNumReduceTasks(5);//设置 ReduceTask 数量,与分区数保持一致
//6 设置程序的输入输出路径
FileInputFormat.setInputPaths(job , new Path("D:\\11_input\\inputflow"));
FileOutputFormat.setOutputPath(job ,new Path("D:\\hadoop\\hadoop\\test4"));
//7 提交 Job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
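关于 setNumReduceTasks 与自定义分区数的配合,可以参考下面几种写法(结论基于 Hadoop 的分区机制):
job.setNumReduceTasks(1); // 等于 1 时不会执行自定义分区逻辑,只产生一个输出文件
job.setNumReduceTasks(5); // 与 ProvincePartitioner 返回的 0~4 共 5 个分区一一对应
job.setNumReduceTasks(6); // 大于分区数,会多出内容为空的输出文件
job.setNumReduceTasks(3); // 大于 1 但小于分区数,运行时会因出现非法分区号而报错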
编写 FlowBean
package com.atxsz.mapereduce.pattitioner2;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
1.定义类实现writable接口
2.重写序列化和反序列化方法
3.重写空参构造
4.toString方法
*/
public class FlowBean implements Writable {
private long upFlow;//上行流量
private long downFlow ; //下行流量
private long sumFlow ; //总流量
//空参构造
public FlowBean() {
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow + this.upFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
//重写序列化(上)和反序列化方法(下)
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
//toString重写
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
}
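如果想在本地快速验证 write() 与 readFields() 的字段顺序是否一致,可以写一个简单的测试类(仅为示意代码,不属于案例本身):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class FlowBeanTest {
    public static void main(String[] args) throws IOException {
        FlowBean in = new FlowBean();
        in.setUpFlow(100);
        in.setDownFlow(200);
        in.setSumFlow();
        // 序列化到内存中的字节数组
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));
        // 反序列化并打印,期望输出:100	200	300
        FlowBean out = new FlowBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(out);
    }
}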
6) WritableComparable 排序(全排序)
编写FlowBean
package com.atxsz.mapereduce.writableComparable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
1.定义类实现WritableComparable接口
2.重写序列化和反序列化方法
3.重写空参构造
4.重写toString方法
5.重写compareTo方法,指定排序规则
*/
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;//上行流量
private long downFlow ; //下行流量
private long sumFlow ; //总流量
//空参构造
public FlowBean() {
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow + this.upFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
//重写序列化(上)和反序列化方法(下)
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
//toString重写
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
@Override
public int compareTo(FlowBean flowBean) {
//总流量的倒序
if (this.sumFlow > flowBean.sumFlow) {
return -1;
} else if (this.sumFlow < flowBean.sumFlow) {
return 1;
} else {
//二次排序
//按照上行流量的正序排
if (this.upFlow > flowBean.upFlow){
return 1;
}else if (this.upFlow < flowBean.upFlow){
return -1 ;
}else {
//按照下行流量的正序排
if (this.downFlow > flowBean.downFlow){
return 1;
}else if (this.downFlow < flowBean.downFlow){
return -1 ;
}else {
return 0;
}
}
}
}
}
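compareTo() 的排序效果可以用几条假设的数据在本地验证:总流量大的排在前面,总流量相同时再按上行流量升序(仅为示意代码):
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class FlowBeanSortTest {
    private static FlowBean bean(long up, long down) {
        FlowBean b = new FlowBean();
        b.setUpFlow(up);
        b.setDownFlow(down);
        b.setSumFlow();
        return b;
    }
    public static void main(String[] args) {
        List<FlowBean> list = new ArrayList<>();
        list.add(bean(100, 200)); // 总流量 300
        list.add(bean(500, 100)); // 总流量 600
        list.add(bean(50, 250));  // 总流量 300
        Collections.sort(list);
        // 期望顺序:总流量 600 的在最前;两个 300 中上行流量 50 的排在 100 之前
        for (FlowBean b : list) {
            System.out.println(b);
        }
    }
}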
编写 FlowDriver
package com.atxsz.mapereduce.writableComparable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
//1 获取 job 对象
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2 关联本 Driver 类
job.setJarByClass(FlowDriver.class);
//3 关联 Mapper 和 Reducer
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//4 设置 Map 端输出 KV 类型
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
//5 设置程序最终输出的 KV 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//6 设置程序的输入输出路径
FileInputFormat.setInputPaths(job , new Path("D:\\hadoop\\hadoop\\test2"));
FileOutputFormat.setOutputPath(job ,new Path("D:\\hadoop\\hadoop\\test6"));
//7 提交 Job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
编写FlowMapper
package com.atxsz.mapereduce.writableComparable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text,FlowBean,Text > {
private FlowBean outK = new FlowBean();
private Text outV = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//获取一行
String line = value.toString();
//切割
String[] split = line.split("\t");
//封装
outV.set(split[0]);
outK.setUpFlow(Long.parseLong(split[1]));
outK.setDownFlow(Long.parseLong(split[2]));
outK.setSumFlow();
//写出
context.write(outK,outV);
}
}
编写FlowReducer
package com.atxsz.mapereduce.writableComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer< FlowBean,Text ,Text, FlowBean> {
@Override
protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(value,key);
}
}
}
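全排序的最终效果可以用假设数据说明:Mapper 以 FlowBean 作为 key 写出后,框架按 compareTo() 的规则对 key 排序,Reducer 再把手机号换回 key 的位置写出,输出形如(手机号与流量均为假设数据):
13800000001	900	1100	2000
13800000002	300	700	1000
13800000003	100	400	500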
7)WritableComparable 排序案例实操(区内排序)
编写ProvincePartitioner2
package com.atxsz.mapereduce.partitionerandwritableComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner2 extends Partitioner<FlowBean , Text> {
@Override
public int getPartition(FlowBean flowBean, Text text, int numPartitions) {
String phone = text.toString();
String prePhone = phone.substring(0, 3);
int partition;
if ("136".equals(prePhone)) {
partition = 0;
} else if ("137".equals(prePhone)) {
partition = 1;
} else if ("138".equals(prePhone)) {
partition = 2;
} else if ("139".equals(prePhone)) {
partition = 3;
} else {
partition = 4;
}
return partition ;
}
}
编写FlowMapper
package com.atxsz.mapereduce.partitionerandwritableComparable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, FlowBean,Text > {
private FlowBean outK = new FlowBean();
private Text outV = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//获取一行
String line = value.toString();
//切割
String[] split = line.split("\t");
//封装
outV.set(split[0]);
outK.setUpFlow(Long.parseLong(split[1]));
outK.setDownFlow(Long.parseLong(split[2]));
outK.setSumFlow();
//写出
context.write(outK,outV);
}
}
编写FlowReducer
package com.atxsz.mapereduce.partitionerandwritableComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<FlowBean,Text ,Text, FlowBean> {
@Override
protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(value,key);
}
}
}
编写FlowDriver
package com.atxsz.mapereduce.partitionerandwritableComparable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
//1 获取 job 对象
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2 关联本 Driver 类
job.setJarByClass(FlowDriver.class);
//3 关联 Mapper 和 Reducer
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//4 设置 Map 端输出 KV 类型
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
//5 设置程序最终输出的 KV 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//关联分区
job.setPartitionerClass(ProvincePartitioner2.class);
job.setNumReduceTasks(5);
//6 设置程序的输入输出路径
FileInputFormat.setInputPaths(job , new Path("D:\\hadoop\\hadoop\\test2"));
FileOutputFormat.setOutputPath(job ,new Path("D:\\hadoop\\hadoop\\test7"));
//7 提交 Job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
编写FlowBean
package com.atxsz.mapereduce.partitionerandwritableComparable;
//二次排序
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
1.定义类实现WritableComparable接口
2.重写序列化和反序列化方法
3.重写空参构造
4.重写toString方法
5.重写compareTo方法,指定排序规则
*/
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;//上行流量
private long downFlow ; //下行流量
private long sumFlow ; //总流量
//空参构造
public FlowBean() {
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow + this.upFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
//重写序列化(上)和反序列化方法(下)
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
//toString重写
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
@Override
public int compareTo(FlowBean flowBean) {
//总流量的倒序
if (this.sumFlow > flowBean.sumFlow) {
return -1;
} else if (this.sumFlow < flowBean.sumFlow) {
return 1;
} else {
//二次排序
//按照上行流量的正序排
if (this.upFlow > flowBean.upFlow){
return 1;
}else if (this.upFlow < flowBean.upFlow){
return -1 ;
}else {
//按照下行流量的正序排
if (this.downFlow > flowBean.downFlow){
return 1;
}else if (this.downFlow < flowBean.downFlow){
return -1 ;
}else {
return 0;
}
}
}
}
}
8)Combiner 合并
编写WordCountCombiner
package com.atxsz.mapereduce.combiner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
//combiner聚合
public class WordCountCombiner extends Reducer<Text, IntWritable ,Text ,IntWritable> {
private IntWritable outV = new IntWritable() ;
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0 ;
for (IntWritable value : values) {
sum += value.get();
}
outV.set(sum);
context.write(key,outV);
}
}
编写WordCountMapper
package com.atxsz.mapereduce.combiner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapper extends Mapper<LongWritable, Text, Text,
IntWritable>{
Text outk = new Text();
IntWritable outv = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 1 获取一行
String line = value.toString();
// 2 切割
String[] words = line.split(" ");
// 3 循环写出
for (String word : words) {
//封装outk
outk.set(word);
//写出
context.write(outk, outv);
}
}
}
编写WordCountReducer
package com.atxsz.mapereduce.combiner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text, IntWritable, Text,
IntWritable>{
int sum;
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context
context) throws IOException, InterruptedException {
// 1 累加求和
sum = 0;
for (IntWritable count : values) {
sum += count.get();
}
// 2 输出
v.set(sum);
context.write(key,v);
}
}
编写WordCountDriver
package com.atxsz.mapereduce.combiner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountDriver {
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
// 1 获取配置信息以及获取 job 对象
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2 关联本 Driver 程序的 jar(设置jar包路径)
job.setJarByClass(WordCountDriver.class);
// 3 关联 Mapper 和 Reducer 的 jar
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
// 4 设置 Mapper 输出的 kv 类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 5 设置最终输出 kv 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountCombiner.class);
// 6 设置输入和输出路径
FileInputFormat.setInputPaths(job, new Path("D:\\11_input\\inputword"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\hadoop\\test8"));
// 7 提交 job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
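由于本例中 Combiner 的求和逻辑与 Reducer 完全相同,也可以不单独编写 WordCountCombiner,直接复用 Reducer 作为 Combiner(前提是 Combiner 的输出 KV 类型与 Reducer 的输入 KV 类型一致):
job.setCombinerClass(WordCountReducer.class);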
9)OutputFormat 数据输出
编写 LogMapper
package com.atxsz.mapereduce.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class LogMapper extends Mapper<LongWritable , Text , Text , NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//http://www.baidu.com
//http://www.google.com
//(http://www.google.com, NullWritable)
//不做任何处理
context.write(value,NullWritable.get());
}
}
编写 LogReducer
package com.atxsz.mapereduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class LogReducer extends Reducer<Text , NullWritable ,Text ,NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
//http://www.baidu.com
//http://www.baidu.com
//同一个网址可能出现多次,为防止丢数据,这里遍历 values 把每条都写出;若只需要去重,则不必用 for 循环,直接写出一次即可
for (NullWritable value : values) {
context.write(key ,NullWritable.get());
}
}
}
编写 LogRecordWriter
package com.atxsz.mapereduce.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {
private FSDataOutputStream atguiguOut;
private FSDataOutputStream otherOut;
public LogRecordWriter(TaskAttemptContext job) {
//创建两条流
try {
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
atguiguOut = fileSystem.create(new Path("D:\\hadoop\\hadoop\\atguigu.log"));
otherOut = fileSystem.create(new Path("D:\\hadoop\\hadoop\\other.log"));
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void write(Text key, NullWritable value) throws IOException, InterruptedException {
String log = key.toString();
//具体写
if(log.contains("atguigu")){
atguiguOut.writeBytes(log +"\n");
}else {
otherOut.writeBytes(log + "\n");
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
//关流
IOUtils.closeStream(atguiguOut);
IOUtils.closeStream(otherOut);
}
}
编写 LogOutputFormat
package com.atxsz.mapereduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class LogOutputFormat extends FileOutputFormat<Text , NullWritable> {
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
LogRecordWriter lrw = new LogRecordWriter(job) ;
return lrw;
}
}
编写 LogDriver
package com.atxsz.mapereduce.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class LogDriver {
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(LogDriver.class);
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//设置自定义的 outputformat
job.setOutputFormatClass(LogOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("D:\\11_input\\inputoutputformat"));
// 虽然我们自定义了 OutputFormat,但是因为我们的 OutputFormat 继承自 FileOutputFormat,
// 而 FileOutputFormat 要输出一个 _SUCCESS 文件,所以在这里还得指定一个输出目录
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\hadoop\\test09"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
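LogOutputFormat 的分流效果可以用假设的输入说明(网址仅为示意):
输入文件中的几行:
http://www.atguigu.com
http://www.baidu.com
http://www.google.com
运行后 atguigu.log 中只保留包含 atguigu 的行:
http://www.atguigu.com
other.log 中是其余的行:
http://www.baidu.com
http://www.google.com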
10) Join 应用
ReduceJoin
编写TableBean
package com.atxsz.mapereduce.reduceJoin;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TableBean implements Writable {
private String id ; //订单id
private String pid ; //商品id
private int amount ; //商品数量
private String pname;//商品名字
private String flag ; // 标记是什么表 order还是pd
public TableBean(){
//空参构造
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override//序列化方法
public void write(DataOutput out) throws IOException {
out.writeUTF(id);
out.writeUTF(pid);
out.writeInt(amount);
out.writeUTF(pname);
out.writeUTF(flag);
}
@Override//反序列化
public void readFields(DataInput in) throws IOException {
this.id = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readInt();
this.pname = in.readUTF();
this.flag = in.readUTF();
}
@Override//重写toString,未来需打印
public String toString() {
return id + "\t" + pname + "\t" + amount;
}
}
编写TableMapper
package com.atxsz.mapereduce.reduceJoin;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.File;
import java.io.IOException;
public class TableMapper extends Mapper<LongWritable , Text , Text , TableBean> {
private String fileName ;
private Text outK = new Text();
private TableBean outV = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
//初始化:获取当前切片对应的文件名,用于区分 order 表和 pd 表
FileSplit split = (FileSplit) context.getInputSplit();//获取切片信息
fileName = split.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//1 获取一行
String line = value.toString();
//2 判断是哪个文件,然后针对文件进行不同的操作
if (fileName.contains("order")) {
//处理的是订单表
String[] split = line.split("\t");
//封装 outK
outK.set(split[1]);
//封装 outV
outV.setId(split[0]);
outV.setPid(split[1]);
outV.setAmount(Integer.parseInt(split[2]));
outV.setPname("");
outV.setFlag("order");
}else {//商品表的处理
String[] split = line.split("\t");
//封装 outK
outK.set(split[0]);
//封装 outV
outV.setId("");
outV.setPid(split[0]);
outV.setAmount(0);
outV.setPname(split[1]);
outV.setFlag("pd");
}
//写出 KV
context.write(outK,outV);
}
}
编写TableReducer
package com.atxsz.mapereduce.reduceJoin;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
public class TableReducer extends Reducer<Text , TableBean ,TableBean , NullWritable> {
@Override
protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
ArrayList<TableBean> orderBeans = new ArrayList<>();
TableBean pdBean = new TableBean();
//循环遍历
for (TableBean value : values) {
//判断数据来自哪个表
if("order".equals(value.getFlag())){ //订单表
//创建一个临时 TableBean 对象接收 value
TableBean tmpOrderBean = new TableBean();
try {
BeanUtils.copyProperties(tmpOrderBean,value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
//将临时 TableBean 对象添加到集合 orderBeans
orderBeans.add(tmpOrderBean);
}else { //商品表
try {
BeanUtils.copyProperties(pdBean,value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
//遍历集合 orderBeans,
// 替换掉每个 orderBean 的 pid 为 pname,然后写出
for (TableBean orderBean : orderBeans) {
orderBean.setPname(pdBean.getPname());
context.write(orderBean,NullWritable.get());
}
}
}
编写TableDriver
package com.atxsz.mapereduce.reduceJoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class TableDriver {
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(TableDriver.class);
job.setMapperClass(TableMapper.class);
job.setReducerClass(TableReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TableBean.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("D:\\11_input\\inputtable"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\hadoop\\test11"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
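Reduce Join 的输入输出可以用一组假设的数据说明(id、pid、pname 均为示意):
order.txt(订单表:id	pid	amount):
1001	01	1
1002	02	2
pd.txt(商品表:pid	pname):
01	小米
02	华为
输出文件中的内容(id	pname	amount):
1001	小米	1
1002	华为	2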
MapJoin
编写 MapJoinDriver
package com.atxsz.mapereduce.mapjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapJoinDriver {
public static void main(String[] args) throws IOException,
URISyntaxException, ClassNotFoundException, InterruptedException {
// 1 获取 job 信息
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2 设置加载 jar 包路径
job.setJarByClass(MapJoinDriver.class);
// 3 关联 mapper
job.setMapperClass(MapJoinMapper.class);
// 4 设置 Map 输出 KV 类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
// 5 设置最终输出 KV 类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// 加载缓存数据
job.addCacheFile(new URI("file:///D:/11_input/tablecache/pd.txt"));
// Map 端 Join 的逻辑不需要 Reduce 阶段,设置 reduceTask 数量为 0
job.setNumReduceTasks(0);
// 6 设置输入输出路径
FileInputFormat.setInputPaths(job, new Path("D:\\11_input\\inputtable2"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\hadoop\\test12"));
// 7 提交
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
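这里 addCacheFile 使用的是本地 file:/// 路径,便于在 Windows 上本地调试;如果打包提交到集群运行,缓存的小表文件需要先放到 HDFS 上,路径写法类似下面(主机名、端口与路径均为假设):
job.addCacheFile(new URI("hdfs://hadoop102:8020/cache/pd.txt"));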
编写MapJoinMapper
package com.atxsz.mapereduce.mapjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text,
NullWritable> {
private Map<String, String> pdMap = new HashMap<>();
private Text text = new Text();
//任务开始前将 pd 数据缓存进 pdMap
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
//通过缓存文件得到小表数据 pd.txt
URI[] cacheFiles = context.getCacheFiles();
Path path = new Path(cacheFiles[0]);
//获取文件系统对象,并开流
FileSystem fs = FileSystem.get(context.getConfiguration());
FSDataInputStream fis = fs.open(path);
//通过包装流转换为 reader,方便按行读取
BufferedReader reader = new BufferedReader(new
InputStreamReader(fis, "UTF-8"));
//逐行读取,按行处理
String line;
while (StringUtils.isNotEmpty(line = reader.readLine())) {
//切割一行
//01 小米
String[] split = line.split("\t");
pdMap.put(split[0], split[1]);
}
//关流
IOUtils.closeStream(reader);
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//读取大表数据
//1001 01 1
String[] fields = value.toString().split("\t");
//通过大表每行数据的 pid,去 pdMap 里面取出 pname
String pname = pdMap.get(fields[1]);
//将大表每行数据的 pid 替换为 pname
text.set(fields[0] + "\t" + pname + "\t" + fields[2]);
//写出
context.write(text,NullWritable.get());
}
}