1、Shuffle Mechanism
1)、Partitioning
How records get assigned to partitions is a key question, so let's look at the source of HashPartitioner, the default partitioner.
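Its core is a one-liner (this is essentially the class from org.apache.hadoop.mapreduce.lib.partition, minus the license header and javadoc):

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    public int getPartition(K key, V value, int numReduceTasks) {
        // mask off the sign bit so the result is never negative, then take the modulus
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}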
So by default a record's partition is its key's hashCode modulo the number of ReduceTasks, and the user has no control over which key ends up in which partition.
The problem this can cause: one partition may receive far more data than another, which is the well-known data skew problem.
2)、Custom Partitioner
To control the mapping yourself, the steps are:
a、Write a class that extends Partitioner<K, V> (using the Mapper's output key/value types) and override getPartition().
b、Register it in the Driver with job.setPartitionerClass(...).
c、Set a matching number of reduce tasks with job.setNumReduceTasks(...).
3)、Custom Partitioner Example
Write the traffic statistics to different files (partitions) according to the province a phone number belongs to: numbers starting with 136, 137, 138, and 139 each go into their own file (4 separate files), and all other prefixes go into a fifth file.
①、MyPartitoner.java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitoner extends Partitioner<Text, FlowBean> {
    /**
     * Returns the partition number for each <k, v> pair.
     *
     * @param text          the phone number
     * @param flowBean      the traffic record
     * @param numPartitions the number of partitions (reduce tasks)
     * @return a partition number in [0, numPartitions)
     */
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        // take the first three digits of the phone number
        String phoneHead = text.toString().substring(0, 3);
        switch (phoneHead) {
            case "136":
                return 0;
            case "137":
                return 1;
            case "138":
                return 2;
            case "139":
                return 3;
            default:
                return 4;
        }
    }
}
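A quick sanity check of the mapping (a hypothetical snippet for illustration, not part of the original job):

MyPartitoner p = new MyPartitoner();
int part = p.getPartition(new Text("13736230513"), new FlowBean(), 5);
// part == 1, because the number starts with "137"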
②、NewFlowDriver.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewFlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(NewFlowDriver.class);
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // number of reduce tasks; matches the 5 partitions returned by MyPartitoner
        job.setNumReduceTasks(5);
        job.setPartitionerClass(MyPartitoner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        FileInputFormat.setInputPaths(job, new Path("d:/DATA/input"));
        FileOutputFormat.setOutputPath(job, new Path("d:/DATA/output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
③、Input File Contents
phone_data.txt (file name)
1 13736230513 192.196.100.1 www.atguigu.com 2481 24681 200
2 13846544121 192.196.100.2 264 0 200
3 13956435636 192.196.100.3 132 1512 200
4 13966251146 192.168.100.1 240 0 404
5 18271575951 192.168.100.2 www.atguigu.com 1527 2106 200
6 84188413 192.168.100.3 www.atguigu.com 4116 1432 200
7 13590439668 192.168.100.4 1116 954 200
8 15910133277 192.168.100.5 www.hao123.com 3156 2936 200
9 13729199489 192.168.100.6 240 0 200
10 13630577991 192.168.100.7 www.shouhu.com 6960 690 200
11 15043685818 192.168.100.8 www.baidu.com 3659 3538 200
12 15959002129 192.168.100.9 www.atguigu.com 1938 180 500
13 13560439638 192.168.100.10 918 4938 200
14 13470253144 192.168.100.11 180 180 200
15 13682846555 192.168.100.12 www.qq.com 1938 2910 200
16 13992314666 192.168.100.13 www.gaga.com 3008 3720 200
17 13509468723 192.168.100.14 www.qinghua.com 7335 110349 404
18 18390173782 192.168.100.15 www.sogou.com 9531 2412 200
19 13975057813 192.168.100.16 www.baidu.com 11058 48243 200
20 13768778790 192.168.100.17 120 120 200
21 13568436656 192.168.100.18 www.alibaba.com 2481 24681 200
22 13568436656 192.168.100.19 1116 954 200
④、Output
Five output files are produced, part-r-00000 through part-r-00004, one per partition (136, 137, 138, 139, and everything else). Note the relationship between partitions and reduce tasks: if setNumReduceTasks is larger than the number of partitions actually returned, the extra files are simply empty; if it is smaller (but greater than 1), records assigned to a nonexistent partition make the job fail; if it is exactly 1, the Partitioner is bypassed and everything goes into a single file.
4)、Custom WritableComparable Sorting Example
Requirement: sort the records by total traffic.
①、FlowBean.java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Implements the WritableComparable interface, so the bean can be
 * serialized by the framework AND used as a sortable key.
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private long upFlow;
    private long downFlow;
    private long sumFlow;

    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /**
     * Serialization: write the object's fields out to wherever the framework directs.
     *
     * @param dataOutput the data sink
     * @throws IOException
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    /**
     * Deserialization: read the fields back, in the same order they were written.
     *
     * @param dataInput the data source
     * @throws IOException
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.upFlow = dataInput.readLong();
        this.downFlow = dataInput.readLong();
        this.sumFlow = dataInput.readLong();
    }

    /**
     * Comparison method: sort by total traffic, descending
     * (the operands are reversed to invert the natural ascending order).
     *
     * @param o the bean to compare against
     * @return negative/zero/positive per the Comparable contract
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumFlow, this.sumFlow);
    }
}
②、CompareMapper.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CompareMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
    private FlowBean flow = new FlowBean();
    private Text phone = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // one line of input
        String line = value.toString();
        // split on tabs
        String[] fields = line.split("\t");
        // populate; FlowBean is the OUTPUT KEY so the shuffle sorts by it
        phone.set(fields[0]);
        flow.setUpFlow(Long.parseLong(fields[1]));
        flow.setDownFlow(Long.parseLong(fields[2]));
        flow.setSumFlow(Long.parseLong(fields[3]));
        // emit <flow, phone>
        context.write(flow, phone);
    }
}
③、CompareReducer.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CompareReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
    /**
     * The data arriving at the Reducer is already sorted; just swap key and
     * value back and write out. The loop handles phones that happen to share
     * the same total traffic and therefore land in the same group.
     *
     * @param key     the FlowBean sort key
     * @param values  the phone numbers with this exact traffic
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(value, key);
        }
    }
}
④、CompareDriver.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

/**
 * Sorts records by total traffic, descending.
 */
public class CompareDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure(); // minimal log4j setup for local runs
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(CompareDriver.class);
        job.setMapperClass(CompareMapper.class);
        job.setReducerClass(CompareReducer.class);
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // input is the output of the previous flow-summing job
        FileInputFormat.setInputPaths(job, new Path("d:/DATA/output"));
        FileOutputFormat.setOutputPath(job, new Path("d:/DATA/output1"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
⑤、Input Data (txt)
13470253144 180 180 360
13509468723 7335 110349 117684
13560439638 918 4938 5856
13568436656 3597 25635 29232
13590439668 1116 954 2070
13630577991 6960 690 7650
13682846555 1938 2910 4848
13729199489 240 0 240
13736230513 2481 24681 27162
13768778790 120 120 240
13846544121 264 0 264
13956435636 132 1512 1644
13966251146 240 0 240
13975057813 11058 48243 59301
13992314666 3008 3720 6728
15043685818 3659 3538 7197
15910133277 3156 2936 6092
15959002129 1938 180 2118
18271575951 1527 2106 3633
18390173782 9531 2412 11943
84188413 4116 1432 5548
⑥、Output
13509468723 7335 110349 117684
13975057813 11058 48243 59301
13568436656 3597 25635 29232
13736230513 2481 24681 27162
18390173782 9531 2412 11943
13630577991 6960 690 7650
15043685818 3659 3538 7197
13992314666 3008 3720 6728
15910133277 3156 2936 6092
13560439638 918 4938 5856
84188413 4116 1432 5548
13682846555 1938 2910 4848
18271575951 1527 2106 3633
15959002129 1938 180 2118
13590439668 1116 954 2070
13956435636 132 1512 1644
13470253144 180 180 360
13846544121 264 0 264
13729199489 240 0 240
13768778790 120 120 240
13966251146 240 0 240
5)、Custom SortComparator Example
We reuse the problem above and simply add a custom SortComparator.
①、FlowComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class FlowComparator extends WritableComparator {
    protected FlowComparator() {
        // "true" tells the parent to create FlowBean instances for deserialization
        super(FlowBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        FlowBean fa = (FlowBean) a;
        FlowBean fb = (FlowBean) b;
        // descending by total traffic
        return Long.compare(fb.getSumFlow(), fa.getSumFlow());
    }
}
②、Add the following to the CompareDriver above:
job.setSortComparatorClass(FlowComparator.class);
The two approaches are functionally equivalent. Most code uses the first (compareTo on the key class itself), since the sort order is visible right on the key; note that when both exist, the registered SortComparator takes precedence over the key's own compareTo.
6)、Combiner
In the shuffle diagram, the combiner is invoked at two points on the map side: when the in-memory buffer spills to disk, and it may run again when spill files are merged. It effectively applies the Reduce logic early, shrinking the data before it crosses the network, and may only be used when this does not change the final result, i.e. when the aggregation is commutative and associative (sums and counts are fine; averages are not).
①、Steps to implement a custom Combiner:
a、Define a Combiner class that extends Reducer and override the reduce method:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. sum the partial counts
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        // 2. write out the partial sum
        context.write(key, new IntWritable(count));
    }
}
b、Set it in the Job driver class:
job.setCombinerClass(WordcountCombiner.class);
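Since a combiner is just a Reducer run early, when the reduce logic's input and output types match (as in word count) you can often skip the separate class and reuse the Reducer itself; assuming the job's reducer class is named WordcountReducer, this one line would do:

job.setCombinerClass(WordcountReducer.class);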
7)、GroupingComparator (Secondary Sort)
Groups the data entering the Reduce phase by one or more fields of the key.
①、Grouping comparator steps
a、Define a class that extends WritableComparator
b、Override the compare() method:
@Override
public int compare(WritableComparable a, WritableComparable b) {
// business logic deciding which records belong to the same group
return result;
}
c、Provide a constructor that passes the key class being compared to the superclass:
protected OrderGroupingComparator() {
super(OrderBean.class, true);
}
8)、GroupingComparator Example
Requirement: find the most expensive product in each order.
①、Input Data (columns: order id, product id, transaction amount)
0000001 Pdt_01 222.8
0000002 Pdt_05 722.4
0000001 Pdt_02 33.8
0000003 Pdt_06 232.8
0000003 Pdt_02 33.8
0000002 Pdt_03 522.8
0000002 Pdt_04 122.4
②、Approach
Pack all three fields into an OrderBean and use it as the map output key. compareTo sorts first by order id, then by price descending, so within each order the most expensive record comes first. A GroupingComparator that compares only the order id then puts every record of an order into the same reduce group, and the Reducer only has to write the first key it sees in each group, which is that order's maximum.
③、Implementation
a、OrderBean.java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private String productId;
    private double price;

    @Override
    public String toString() {
        return orderId + "\t" + productId + "\t" + price;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getProductId() {
        return productId;
    }

    public void setProductId(String productId) {
        this.productId = productId;
    }

    public double getPrice() {
        return price;
    }

    public void setPrice(double price) {
        this.price = price;
    }

    /**
     * Sort logic: first by order id; within the same order, by price descending.
     *
     * @param o the bean to compare against
     * @return negative/zero/positive per the Comparable contract
     */
    @Override
    public int compareTo(OrderBean o) {
        int compare = this.orderId.compareTo(o.orderId);
        if (compare != 0) {
            return compare;
        } else {
            return Double.compare(o.price, this.price);
        }
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(orderId);
        dataOutput.writeUTF(productId);
        dataOutput.writeDouble(price);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.productId = dataInput.readUTF();
        this.price = dataInput.readDouble();
    }
}
b、OrderMapper.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Parses each input line into an OrderBean.
 */
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    private OrderBean order = new OrderBean();

    /**
     * The map step only populates the OrderBean; all the interesting
     * work (sorting and grouping) happens in the shuffle.
     *
     * @param key
     * @param value
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the line
        String[] fields = value.toString().split("\t");
        // populate the bean
        order.setOrderId(fields[0]);
        order.setProductId(fields[1]);
        order.setPrice(Double.parseDouble(fields[2]));
        context.write(order, NullWritable.get());
    }
}
c、OrderComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Groups records by order id.
 */
public class OrderComparator extends WritableComparator {
    protected OrderComparator() {
        super(OrderBean.class, true);
    }

    /**
     * Grouping comparison: records with the same order id compare as equal
     * and therefore enter the same reduce group.
     *
     * @param a
     * @param b
     * @return
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean oa = (OrderBean) a;
        OrderBean ob = (OrderBean) b;
        return oa.getOrderId().compareTo(ob.getOrderId());
    }
}
d、OrderReducer.java
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Emits the highest-priced record of each order.
 */
public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    /**
     * Records in each group arrive sorted by price descending, so the key as
     * first seen by reduce() is already the group's maximum; write it and stop.
     *
     * @param key     the order record (the group's maximum)
     * @param values  NullWritable placeholders carrying no payload
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
e、OrderDriver.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);
        // register the grouping comparator
        job.setGroupingComparatorClass(OrderComparator.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:/DATA/input"));
        FileOutputFormat.setOutputPath(job, new Path("D:/DATA/output3"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
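Given the sample input above, the job should produce (order id, product id, highest price):

0000001	Pdt_01	222.8
0000002	Pdt_05	722.4
0000003	Pdt_06	232.8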
④、Extension: the two highest prices in each order
The only code that changes is OrderReducer:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    /**
     * Emits the two highest prices in each order. The second call to
     * iterator.next() deserializes the NEXT record's fields into the same
     * key object, so writing key inside the loop outputs a different record
     * each time (see the note below).
     *
     * @param key
     * @param values
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<NullWritable> iterator = values.iterator();
        for (int i = 0; i < 2; i++) {
            if (iterator.hasNext()) {
                NullWritable value = iterator.next();
                context.write(key, value);
            }
        }
    }
}
Note: why does this work? The data handed to the Reducer is serialized; the framework deserializes it on demand into a single pre-allocated <key, value> pair of objects. The first record of a group is deserialized into those objects up front; each subsequent iterator.next() deserializes the next record into the SAME objects (their addresses never change). So after calling next(), key already holds the next record's fields, and writing key inside the loop emits the second-highest record rather than the first one again.
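The flip side of this object reuse is a classic pitfall: holding on to the key across iterations without copying it. A hypothetical sketch of a reduce body that gets it wrong, plus the fix (ReflectionUtils.copy is a real Hadoop utility from org.apache.hadoop.util; the lists here are illustrative and need java.util imports):

// WRONG: every list element is a reference to the single reused key object,
// so after the loop all entries show the LAST record's fields.
List<OrderBean> wrong = new ArrayList<>();
for (NullWritable v : values) {
    wrong.add(key);
}

// RIGHT: copy the current fields into a fresh object before storing it.
List<OrderBean> right = new ArrayList<>();
for (NullWritable v : values) {
    right.add(ReflectionUtils.copy(context.getConfiguration(), key, new OrderBean()));
}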
2、Performing File Operations on HDFS Directly from the Web UI
Add the following property to core-site.xml (and restart HDFS):
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
This makes web-UI requests act as the static user root; set the value according to which user actually owns your HDFS directories on your virtual machine.
3、Map vs. MapTask vs. Mapper vs. Mapper.map
Map phase: an abstract concept; what actually runs during this phase is a MapTask.
MapTask: calling its run method constitutes one Map phase (it is the concrete implementation of the concept above).
Mapper: inside MapTask, the Mapper you wrote is invoked (MapTask's run builds the Mapper object; if you didn't write one, the framework's default Mapper is used).
Mapper.map: Mapper.run calls the Mapper object's map method once per input record.
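The call chain is easy to see in Mapper.run() itself (this is essentially the method's source in org.apache.hadoop.mapreduce.Mapper):

public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        // one map() call per input record
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}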
4、How MapTask Works
In outline: a MapTask reads input splits through the RecordReader (Read phase), runs your map() on each record (Map phase), collects output into a circular buffer by partition (Collect phase), spills sorted data to disk when the buffer fills (Spill phase), and finally merges the spill files into one sorted, partitioned output (Merge phase).
5、How ReduceTask Works
In outline: a ReduceTask copies its partition's data from each completed MapTask (Copy phase), merges and sorts the pieces (Merge/Sort phase), then runs your reduce() on each group and writes the results out (Reduce phase).
Question: how many reducers should be set?
The number of ReduceTasks is set manually, based on the actual data volume; for example, to use ten: job.setNumReduceTasks(10).