一.mapreduce框架的设计思想:
二.简单的单词统计:
map:
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountMapper.java
 * @create: 2019.09.24.09.08
 * @author: Administrator
 * @description:
 */
/*
 * 1: 默认情况下,是mr框架所读到的一行文本的起始偏移量,Long
 *    但是在hadoop中有更精简的序列化接口,所以不能直接用Long,而用LongWritable
 * 2: 默认情况下是mr框架所读到的一行文本内容,String,同上Text
 * 3: 是用户自定义逻辑处理完成之后输出数据中的key,在此处是单词,String,同上Text
 * 4: 是用户自定义逻辑处理完成之后输出数据中的value,在此处是单词次数,Integer,同上IntWritable
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
/- map阶段的业务逻辑就写在自定义的map方法中
- maptask会对每一行输入数据调用一次我们自定义的map()方法
- */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//将maptask传给我们的文本内容转换成String
String line = value.toString();
//根据空格切割
String[] words = line.split(" ");
//将单词输出<单词,1>
for (String word: words){
//将单词作为key,次数作为value,以便后续的数据分发,以便于相同的单词会到相同的reducetask
context.write(new Text(word),new IntWritable(1));
}
}
}
reduce:
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
- @program:bigdata
- @package:hadoop.mapreduce.wordcount
- @filename:WordCountReduce.java
- @create:2019.09.24.09.47
- @author:Administrator
- @descrption.
*/
/*
 * 1,2: 对应map的输出
 * 3,4: 自定义reduce程序的输出结果
 */
public class WordCountReduce extends Reducer<Text, IntWritable,Text,IntWritable> {
/- <a,1><a,1><a,1><a,1><a,1>
<b,1><b,1><b,1><b,1><b,1>- key是一组单词的key
- */
@Override
protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
int count =0;
// 可以用迭代器
// Iterator iterator = values.iterator();
// 也可以用for
for (IntWritable value:values){
count+=value.get();
}
context.write(new Text(key),new IntWritable(count));
}
}
- <a,1><a,1><a,1><a,1><a,1>
driver类
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
-
@program:bigdata
-
@package:hadoop.mapreduce.wordcount
-
@filename:WordCountDriver.java
-
@create:2019.09.24.10.09
-
@author:Administrator
-
@descrption.相当于一个yarn集群的客户端
-
需要再此封装我们的mr程序的相关参数,指定jar包
-
最后提交给yarn
*/
public class WordCountDriver {
    /**
     * Acts as a YARN client: bundles the MR job's parameters and jar and
     * submits them to the cluster.
     *
     * @param args args[0] = input directory, args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Mapper/Reducer business classes for this job.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // Mapper output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input directory of the raw files and output directory of the results.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Hand the configured job (and its jar) to YARN and wait for completion.
        // job.submit(); // fire-and-forget: no visible progress/result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
普通的jar运行:
java -cp bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output
这时classpath中没有hadoop的jar包,所以要用hadoop jar:
hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output
三.wordcount运行过程的解析
四:流量汇总程序
package hadoop.mapreduce.flowsum;
import hadoop.mapreduce.wordcount.WordCountDriver;
import hadoop.mapreduce.wordcount.WordCountMapper;
import hadoop.mapreduce.wordcount.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
-
@program:bigdata
-
@package:hadoop.mapreduce.flowsum
-
@filename:FlowCountMapper.java
-
@create:2019.09.24.16.25
-
@author:Administrator
-
@descrption.
*/
public class FlowCount {

    /** Mapper: parses one log line and emits <phone number, FlowBean(up, down)>. */
    static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Convert the line to a String and split on tabs.
            String line = value.toString();
            String[] fields = line.split("\t");
            // Phone number is the second field.
            String phoneNbr = fields[1];
            // Upstream traffic is the 3rd-from-last field, downstream the 2nd-from-last.
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            long dFlow = Long.parseLong(fields[fields.length - 2]);
            context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
        }
    }

    /** Reducer: sums up/down traffic across all records of one phone number. */
    static class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {
        // NOTE: values must be Iterable<FlowBean>; a raw Iterable would make
        // the enhanced for over FlowBean fail to compile.
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            long sumUpFlow = 0;
            long sumDFlow = 0;
            for (FlowBean bean : values) {
                sumUpFlow += bean.getUpFlow();
                sumDFlow += bean.getdFlow();
            }
            context.write(key, new FlowBean(sumUpFlow, sumDFlow));
        }
    }

    /**
     * YARN client entry point.
     *
     * @param args args[0] = input directory, args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(FlowCount.class);
        // Mapper/Reducer business classes for this job.
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);
        // Mapper output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Input directory of the raw files and output directory of the results.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Hand the configured job (and its jar) to YARN and wait for completion.
        // job.submit(); // fire-and-forget: no visible progress/result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
运行: hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.flowsum.FlowCount /flowsum/input /flowsum/output
4.1:在四的基础上自定义分区
按省分区
package hadoop.mapreduce.province;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
/**
-
@program:bigdata
-
@package:hadoop.mapreduce.province
-
@filename:ProvincePartitioner.java
-
@create:2019.09.25.09.27
-
@author:Administrator
-
@descrption.
*/
/*
 * 泛型1,2是map输出的kv类型
 */
public class ProvincePartitioner extends Partitioner<Text,FlowBean> {
public static HashMap<String,Integer> prId=new HashMap<String, Integer>();
static {
prId.put(“136”,0);
prId.put(“137”,1);
prId.put(“138”,2);
prId.put(“139”,3);
}public int getPartition(Text text, FlowBean flowBean, int i) {
String prefix = text.toString().substring(0, 3);
Integer provinceId = prId.get(prefix);
return (provinceId==null)?4:provinceId;
}
}
其他代码与四一致,只需在main方法中添加以下代码:
//指定我们自定义的数据分区
job.setPartitionerClass(ProvincePartitioner.class);
//同时指定相应数据分区数量的reducetask
job.setNumReduceTasks(5);
五:maptask任务分配切片机制
1.3.1 mapTask并行度的决定机制
一个job的map阶段并行度由客户端在提交job时决定
而客户端对map阶段并行度的规划的基本逻辑为:
将待处理数据执行逻辑切片(即按照一个特定切片大小,将待处理数据划分成逻辑上的多个split),然后每一个split分配一个mapTask并行实例处理
这段逻辑及形成的切片规划描述文件,由FileInputFormat实现类的getSplits()方法完成,其过程如下图:
1.3.2 FileInputFormat切片机制
1、切片定义在InputFormat类中的getSplit()方法
2、FileInputFormat中默认的切片机制:
a) 简单地按照文件的内容长度进行切片
b) 切片大小,默认等于block大小
c) 切片时不考虑数据集整体,而是逐个针对每一个文件单独切片
比如待处理数据有两个文件:
file1.txt 320M
file2.txt 10M
经过FileInputFormat的切片机制运算后,形成的切片信息如下:
file1.txt.split1 -- 0~128M
file1.txt.split2 -- 128~256M
file1.txt.split3 -- 256~320M
file2.txt.split1 -- 0~10M
如果剩下的文件大小/切片大小<1.1,切成一块
3、FileInputFormat中切片的大小的参数配置
通过分析源码,在FileInputFormat中,计算切片大小的逻辑:Math.max(minSize, Math.min(maxSize, blockSize)); 切片主要由这几个值来运算决定
minsize:默认值:1 配置参数: mapreduce.input.fileinputformat.split.minsize
maxsize:默认值:Long.MAX_VALUE 配置参数:mapreduce.input.fileinputformat.split.maxsize
blocksize
因此,默认情况下,切片大小=blocksize
maxsize(切片最大值):
参数如果调得比blocksize小,则会让切片变小,而且就等于配置的这个参数的值
minsize (切片最小值):
参数调的比blockSize大,则可以让切片变得比blocksize还大
选择并发数的影响因素:
1、运算节点的硬件配置
2、运算任务的类型:CPU密集型还是IO密集型
3、运算任务的数据量
六:MapReduce程序的整体提交流程