- shuffle: partitioning, sorting, combiner, grouping
I. MapReduce
- Source code analysis: FileInputFormat.getSplits()
// bytesRemaining: bytes of the file not yet assigned to a split
for (bytesRemaining = length; (double) bytesRemaining / (double) splitSize > 1.1D; bytesRemaining -= splitSize) {
    blkIndex = this.getBlockIndex(blkLocations, length - bytesRemaining);
    splits.add(this.makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
}
Interview point: when computing input splits, MapReduce checks whether the remaining bytes divided by the split size are greater than 1.1; only then does it carve out another split, and each split gets its own map task.
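To make the 1.1 rule concrete, here is a minimal standalone sketch (not Hadoop source; it only reproduces the same arithmetic with the 1.1 slop factor) showing when the trailing remainder becomes its own split:

public class SplitSlopDemo {
    private static final double SPLIT_SLOP = 1.1; // same threshold used in FileInputFormat

    // Returns how many splits (and therefore map tasks) a file of the given length is cut into.
    static int countSplits(long fileLength, long splitSize) {
        int splits = 0;
        long bytesRemaining = fileLength;
        while ((double) bytesRemaining / (double) splitSize > SPLIT_SLOP) {
            splits++;                 // carve off a full-sized split
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits++;                 // the leftover (at most 1.1 * splitSize) becomes the last split
        }
        return splits;
    }

    public static void main(String[] args) {
        // 130 MB file, 128 MB split size: 130/128 ≈ 1.02 <= 1.1, so only ONE split.
        System.out.println(countSplits(130L << 20, 128L << 20)); // prints 1
        // 260 MB file: one full 128 MB split, then 132 MB (ratio ≈ 1.03) as the second split.
        System.out.println(countSplits(260L << 20, 128L << 20)); // prints 2
    }
}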
II. Partition exercise (case 1: the number of partitions is known in advance; case 2: it is unknown, so set the number of reduce tasks to an upper bound := total data volume / the maximum data volume one reducer can handle)
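A rough illustration of the sizing heuristic in the heading above (a sketch under assumed numbers, not a Hadoop API; the helper name is made up):

public class ReducerCountEstimate {
    // Hypothetical helper: reducer count = input size / per-reducer capacity, rounded up.
    static int estimateReducers(long inputBytes, long bytesPerReducer) {
        return (int) ((inputBytes + bytesPerReducer - 1) / bytesPerReducer);
    }

    public static void main(String[] args) {
        long inputBytes = 10L * 1024 * 1024 * 1024;     // assume 10 GB of input
        long bytesPerReducer = 1L * 1024 * 1024 * 1024; // assume one reducer handles ~1 GB
        // The driver would then pass this value to job.setNumReduceTasks(...).
        System.out.println(estimateReducers(inputBytes, bytesPerReducer)); // prints 10
    }
}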
1. pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hlzq</groupId>
    <artifactId>day10</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.7.25</version>
        </dependency>
    </dependencies>
</project>
Why the job fails when there are too few reduce tasks: if the partitioner returns a partition index greater than or equal to the number of reduce tasks, the framework rejects it, so the reduce task count must be at least the number of partitions. Driver code:
package com.hlzq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class Jobmain {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        // 1. Create a job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "mypartioner_covid19");
        // 2. Specify the jar that contains this job
        job.setJarByClass(Jobmain.class);
        // 3. Specify the input format (read line by line) and the input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///G:\\input\\us-covid19-counties.dat"));
        // 4. Specify the custom Mapper class and the K2/V2 types
        job.setMapperClass(MyPartiMapper.class);
        job.setMapOutputKeyClass(Text.class);            // K2
        job.setMapOutputValueClass(NullWritable.class);  // V2
        // 5. Specify the custom partitioner (if any)
        job.setPartitionerClass(MyParti.class);
        // Set the number of reduce tasks (default is 1); it must not be smaller than the number of partitions
        job.setNumReduceTasks(55);
        // 6. Specify a custom grouping comparator (if any) -- not used here
        // 7. Specify the custom Reducer class and the K3/V3 types
        job.setReducerClass(MyPartiReducer.class);
        job.setOutputKeyClass(Text.class);               // K3
        job.setOutputValueClass(NullWritable.class);     // V3
        // Delete the output directory if it already exists
        Path outputPath = new Path("file:///G:\\output");
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        // 8. Specify the output format and the output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);
        // 9. Submit the job to the YARN cluster (true: print the job's progress)
        boolean bl = job.waitForCompletion(true);
        // 10. Exit the process
        System.exit(bl ? 0 : 1);
    }
}
package com.hlzq;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MyPartiMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // K1: byte offset of the line, V1: the full line of text
    // K2: the full line (Text),    V2: NullWritable -- the mapper simply forwards each line
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}
package com.hlzq;
// Custom partitioning rule: extend Partitioner and override getPartition
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;

public class MyParti extends Partitioner<Text, NullWritable> {
    private static int index = -1;
    // key: state name, value: partition number
    private static HashMap<String, Integer> map = new HashMap<String, Integer>();

    // Returns the partition number for a state name, assigning the next free number
    // the first time a state is seen (keys in the map are unique)
    public static int getPar(String stateName) {
        if (!map.containsKey(stateName)) {
            map.put(stateName, ++index);
            return index;
        } else {
            return map.get(stateName);
        }
    }

    // text: K2, nullWritable: V2, i: the number of reduce tasks set in Jobmain
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int i) {
        // Split the line of text and take the state name (third field)
        String[] split = text.toString().split(",");
        return getPar(split[2]);
    }
}
package com.hlzq;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyPartiReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
III. Default partitioning
- The default HashPartitioner partitions by the key's hash value.
- If a large share of the keys are identical (data skew), a common trick is to append a timestamp (milliseconds since 1970) to the key so the records spread across partitions.
- To keep the partition number from going negative, the hash code is bitwise-ANDed with 2147483647 (Integer.MAX_VALUE) before the modulo.
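For reference, a minimal sketch of the same logic the default org.apache.hadoop.mapreduce.lib.partition.HashPartitioner applies, written here as a custom partitioner over the Text/NullWritable pair used above:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hash the key, clear the sign bit by ANDing with Integer.MAX_VALUE so the result is non-negative,
// then take it modulo the number of reduce tasks.
public class HashLikePartitioner extends Partitioner<Text, NullWritable> {
    @Override
    public int getPartition(Text key, NullWritable value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}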
IV. Sorting and serialization
1. Serialization: converting a structured object into a byte stream.
In plain Java this is done by implementing the Serializable interface; in MapReduce, a JavaBean implements Hadoop's Writable interface instead, as below.
package com.hlzq;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// A JavaBean used in MapReduce must be serializable, i.e. implement Hadoop's Writable interface
public class Student implements Writable {
    private String name;
    private int age;

    public Student() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    // Serialization: write the fields to the output stream
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(age);
    }

    // Deserialization: read the fields back; the read order must match the write order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.age = in.readInt();
    }
}
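A quick round-trip check (a sketch using standard java.io streams and made-up sample values, not part of the notes' job code) showing that readFields restores exactly what write emitted, in the same order:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class StudentRoundTrip {
    public static void main(String[] args) throws IOException {
        Student in = new Student();
        in.setName("tom"); // sample values
        in.setAge(18);

        // Serialize: Student -> byte stream
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // Deserialize: byte stream -> Student, reading fields in the order they were written
        Student out = new Student();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(out.getName() + "," + out.getAge()); // prints tom,18
    }
}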
- Sorting: you specify the sort rule yourself (compareTo); without one, objects have no meaningful order (only object identity), so a JavaBean used as K2 must implement WritableComparable:
package com.hlzq;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * A JavaBean used in MapReduce must be serializable (Writable).
 * If the JavaBean is also used as K2 and needs a custom sort order, it must implement
 * WritableComparable, which covers both requirements:
 *   public interface WritableComparable<T> extends Writable, Comparable<T> { }
 */
public class Student2 implements WritableComparable<Student2> {
    private String name;
    private int age;

    public Student2() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    // Serialization: write the fields
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(age);
    }

    // Deserialization: read the fields in the same order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.age = in.readInt();
    }

    // Comparator: return a value >0, <0, or 0; here students are ordered by age (ascending)
    @Override
    public int compareTo(Student2 o) {
        return this.getAge() - o.getAge();
    }
}
- Example: input data in the following format (the mapper below splits each line on a tab):
a 1
a 9
b 3
a 7
b 8
b 10
a 5
a 1
a 5
a 7
a 9
b 3
b 8
b 10
Requirements:
Sort by the first column in lexicographic (dictionary) order.
When the first column is equal, sort by the second column in ascending order.
package com.hlzq.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class SortBean implements WritableComparable<SortBean> {
    private String word;
    private int num;

    @Override
    public String toString() {
        return word + "\t" + num;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    // Sort rule: first by word (lexicographic), then by num (ascending) when the words are equal.
    // The shuffle phase applies this rule with quicksort in memory and merge sort when merging spill files.
    @Override
    public int compareTo(SortBean o) {
        int i = this.word.compareTo(o.getWord());
        if (i == 0) {
            return Integer.compare(this.num, o.num);
        }
        return i;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(num);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.word = in.readUTF();
        this.num = in.readInt();
    }
}
package com.hlzq.sort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class SortBeanMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Split the V1 line of text
        String[] split = value.toString().split("\t");
        // 2. Wrap the fields into a SortBean to get K2
        SortBean sortBean = new SortBean();
        sortBean.setWord(split[0]);
        sortBean.setNum(Integer.parseInt(split[1]));
        // 3. V2 is NullWritable
        // 4. Write K2 and V2 to the context
        context.write(sortBean, NullWritable.get());
    }
}
package com.hlzq.sort;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class SortBeanReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {
    @Override
    protected void reduce(SortBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
package com.hlzq.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class Jobmain {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        // 1. Create a job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "mysort");
        // 2. Specify the jar that contains this job
        job.setJarByClass(Jobmain.class);
        // 3. Specify the input format (read line by line) and the input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///G:\\input"));
        // 4. Specify the custom Mapper class and the K2/V2 types
        job.setMapperClass(SortBeanMapper.class);
        job.setMapOutputKeyClass(SortBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 5. Specify the Reducer class and the K3/V3 types
        job.setReducerClass(SortBeanReducer.class);
        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Delete the output directory if it already exists
        Path outputPath = new Path("file:///G:\\output");
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        // 7. Specify the output format and the output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);
        // 8. Submit the job to the YARN cluster (true: print the job's progress)
        boolean bl = job.waitForCompletion(true);
        // 9. Exit the process
        System.exit(bl ? 0 : 1);
    }
}
V. Combiner (local aggregation)
- Each map task can produce a large amount of local output. A combiner merges the map-side output first, which reduces the volume of data transferred between the map and reduce nodes and improves I/O performance; it is one of MapReduce's optimization techniques.
- Code
package com.hlzq.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the counts for this key on the map side
        long count = 0L;
        for (LongWritable value : values) {
            count += value.get();
        }
        // Write the partially aggregated K3/V3 pair to the context
        context.write(key, new LongWritable(count));
    }
}
Specify the combiner class in Jobmain:
job.setCombinerClass(MyCombiner.class);
If the combiner logic is identical to the reducer, you do not need a separate combiner class; reuse the reducer:
job.setCombinerClass(MyReduce.class);
VI. Custom grouping
- Grouping: by default, all values with the same K2 are put into one collection and handled by a single reduce() call.
- Example: given the following order data
Order ID  Product ID  Amount
Order_0000001 Pdt_01 222.8
Order_0000001 Pdt_05 25.8
Order_0000002 Pdt_03 522.8
Order_0000002 Pdt_04 122.4
Order_0000002 Pdt_05 722.4
Order_0000003 Pdt_01 2221.8
Find, for each order, the single transaction with the largest amount (a Top-N problem); a grouping-comparator sketch follows.
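A minimal sketch of the grouping side of this exercise, under assumptions: the composite key OrderBean below is made up for illustration (the notes do not define it), sorted by order ID and then by amount descending, so the first value a reducer sees in each group is that order's largest transaction. The grouping comparator puts all records with the same order ID into one reduce group.

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical composite key: sorted by order ID, then by amount descending.
class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private double amount;

    public String getOrderId() { return orderId; }
    public void setOrderId(String orderId) { this.orderId = orderId; }
    public double getAmount() { return amount; }
    public void setAmount(double amount) { this.amount = amount; }

    @Override
    public int compareTo(OrderBean o) {
        int byOrder = this.orderId.compareTo(o.orderId);
        return byOrder != 0 ? byOrder : Double.compare(o.amount, this.amount); // amount descending
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.amount = in.readDouble();
    }

    @Override
    public String toString() { return orderId + "\t" + amount; }
}

// Grouping comparator: two keys belong to the same reduce group when their order IDs match,
// regardless of the amount used during sorting.
public class OrderGroupComparator extends WritableComparator {
    public OrderGroupComparator() {
        super(OrderBean.class, true); // true: create OrderBean instances for comparison
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((OrderBean) a).getOrderId().compareTo(((OrderBean) b).getOrderId());
    }
}
// In the driver: job.setGroupingComparatorClass(OrderGroupComparator.class);
// The reducer then only needs to emit the first key of each group (the order's maximum).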