-
1. Deploying an MR app
- Official site: hadoop.apache.org
- https://github.com/apache/hadoop
- https://github.com/apache/project
[hadoop@bigdata13 wc]$ hadoop jar
Usage: hadoop jar <jar> [mainClass] args...

Ad-hoc test run:
hadoop jar /home/hadoop/project/wc/wc.jar com.bigdata.mapreduce.WCAPP \
/wc/input /wc/output

Shell script:
if [ $# -lt 2 ]; then
    echo "Usage: WCAPP <in> <out>"
    echo "<in>  hdfs input path"
    echo "<out> hdfs output path"
    echo "eg: $0 /wc/input /wc/out"
    exit 1   # exit so the job is not submitted with missing arguments
fi
inputpath=$1
outputpath=$2
hadoop jar /home/hadoop/project/wc/wc.jar com.bigdata.mapreduce.WCAPP \
${inputpath} ${outputpath}
-
2. What determines the number of map tasks
- Core pieces of MR:
- input : InputFormat => how the data is loaded
  - 1. getSplits : computes the input splits (how many pieces the data is cut into)
  - 2. createRecordReader : creates the RecordReader that turns each split into key/value records
- map : Mapper
- reduce : Reducer
- output : OutputFormat => how the results are written out
input file
- FileInputFormat
- TextInputFormat : <LongWritable, Text>
  - key   : byte offset of the line within the file
  - value : the text of the line
- 1. isSplitable : decides whether a file can be split
  - splittable     : the number of splits equals the number of map tasks
  - not splittable : the split count is 1, i.e. a single map task
- 2. createRecordReader
- Summary:
  1. Splittable: the number of splits equals the number of map tasks.
     (Note: the number of map tasks is determined by the number of input splits.)
  2. Not splittable: the split count is 1, which corresponds to a single map task.
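Not the course code, just a minimal sketch of where isSplitable hooks in: a custom input format (the class name WholeFileTextInputFormat is an illustrative assumption) can return false so that every file becomes exactly one split, i.e. one map task:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Force every input file to be unsplittable: one file => one split => one map task.
public class WholeFileTextInputFormat extends TextInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;   // never split, regardless of file size
    }
}
// driver (assumption): job.setInputFormatClass(WholeFileTextInputFormat.class);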
If the file can be split:
- 1. file size smaller than 128M
  - there is one split => one map task
- 2. file size larger than 128M
  - filesize / splitsize = num (number of full splits)
  - the leftover part of filesize is compared against splitsize * 10%:
    - larger  : one extra split is opened
    - smaller : no extra split
- Note: in the IDEA local environment the splitsize is 32M
- e.g.
  - input 130M => 1 split
    130 / 128 = 1
    130M - 128M = 2M, 128M * 10% = 12.8M
    2M < 12.8M, so no extra split
  - input 160M => 2 splits
    160 / 128 = 1
    160M - 128M = 32M, 128M * 10% = 12.8M
    32M > 12.8M, so split +1
  - input 560M => 5 splits
    560 / 128 = 4
    560M - 512M = 48M, 128M * 10% = 12.8M
    48M > 12.8M, so split +1
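A small standalone sketch of the rule above (written for these notes, not the course code; Hadoop's own FileInputFormat uses an equivalent SPLIT_SLOP = 1.1 check, which gives the same counts for these examples):

// Estimate the number of splits for a splittable file,
// following the "remainder vs. 10% of splitsize" rule from the notes.
public class SplitCountSketch {

    static long numSplits(long fileSize, long splitSize) {
        long full = fileSize / splitSize;              // number of full-sized splits
        long remainder = fileSize - full * splitSize;  // leftover bytes
        if (remainder > splitSize / 10) {
            return full + 1;                           // leftover is big enough: one extra split
        }
        return Math.max(full, 1);                      // leftover rides along with the last split
    }

    public static void main(String[] args) {
        long mb = 1024L * 1024L;
        System.out.println(numSplits(130 * mb, 128 * mb)); // 1
        System.out.println(numSplits(160 * mb, 128 * mb)); // 2
        System.out.println(numSplits(560 * mb, 128 * mb)); // 5
    }
}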
-
3. Number of reduce tasks
- 1. decided by the MR user
- 2. the default number of reduce tasks is 1
- 3. the number of reduce tasks determines how many files the final output has
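In the driver this is a single call on the Job object (standard Hadoop API):

// 3 reduce tasks => 3 output files: part-r-00000, part-r-00001, part-r-00002
job.setNumReduceTasks(3);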
-
4. Partitioning
- "pull the same keys together": keys are dispatched to the same place according to some rule
- Requirement
  Based on the phone result data, store the results in separate files:
    results whose phone number starts with 13 go into one file,
    results whose phone number starts with 15 go into one file,
    all other results go into a third file.
  - 1. set the number of reduce tasks to 3
  - 2. the partitioner has to be customized (see the sketch after the summary below)
- Summary
  - 1. reduce task count > number of partitions : some output files are empty
  - 2. reduce task count < number of partitions : the job fails
  - 3. reduce task count = number of partitions : works fine
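A minimal sketch of the custom partitioner for the phone requirement above (the class name PhonePartitioner and the Text/Text key-value types are illustrative assumptions; they have to match the actual map output types):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Route each phone key to one of 3 reduce tasks by its prefix.
public class PhonePartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String phone = key.toString();
        if (phone.startsWith("13")) {
            return 0;        // -> part-r-00000
        } else if (phone.startsWith("15")) {
            return 1;        // -> part-r-00001
        } else {
            return 2;        // -> part-r-00002
        }
    }
}
// driver: job.setPartitionerClass(PhonePartitioner.class);
//         job.setNumReduceTasks(3);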
- sql vs mr
- sql
- group by
- distinct
- join
- order by
- 1. group by => mr
  - map    => emit the group-by column as the map output key
  - reduce => rows with the same key arrive at the same reducer, where the aggregate is applied (see the sketch below)
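A minimal sketch of group by as MR, assuming a one-column input and a simple count(*) aggregate (the class names are illustrative, not the course code):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// select name, count(*) from user_info group by name;
public class GroupBySketch {

    public static class GroupByMapper extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // the group-by column becomes the map output key
            context.write(new Text(value.toString().trim()), new IntWritable(1));
        }
    }

    public static class GroupByReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // all rows with the same key arrive at the same reducer: apply the aggregate
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            context.write(key, new IntWritable(count));
        }
    }
}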
- 2. distinct
  - emp table: ename, job
  - dedup in SQL:
      select distinct(ename) from emp;
      -- equivalent to
      select name from user_info group by name;
  - data (name column):
      zs
      zs
      zs
  - map output (key = name):
      zs
      zs
      zs
  - reduce input after the shuffle:
      zs,<null,null,null>
/**
 * mapper
 */
public static class MyMapper extends Mapper<Object, Text, Text, NullWritable> {
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] names = line.split(",");
        for (String name : names) {
            context.write(new Text(name), NullWritable.get());
        }
    }
}

/**
 * reducer
 */
public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    /**
     * map:
     * zhangsan,null
     * zhangsan,null
     * zhangsan,null
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        /**
         * shuffle:
         * zhangsan,<null,null,null>
         */
        context.write(key, NullWritable.get());
        /**
         * zhangsan
         */
    }
}
-
3. order by
- mr
  - 1. global sort        : reduce task = 1
  - 2. per-partition sort : reduce tasks = 4
- mr : the keys are sorted at every stage
    map    kv
    reduce kv
- k : IntWritable
  override its compareTo method to implement the desired sort order
- Requirement:
  based on the phone data (day07), compute the traffic and sort by total traffic in descending order
  map: k => all (total traffic), v => the other columns; reducer: k, v
-
public class MyIntWriteAble extends IntWritable {

    public MyIntWriteAble() {
    }

    public MyIntWriteAble(int value) {
        super(value);
    }

    @Override
    public int compareTo(IntWritable o) {
        // negate the default (ascending) comparison so the keys sort in
        // descending order, as the requirement asks for total traffic descending
        return -super.compareTo(o);
    }
}
-
public class OrderByApp {
    /**
     * driver
     */
    public static void main(String[] args) throws Exception {
        String input = "out/phone/part-r-00000";
        String output = "out/orderby";

        Configuration conf = new Configuration();
        // 0. todo... delete the target path if it exists
        FileUtils.deletePath(conf, output);

        // 1. set the job name
        Job job = Job.getInstance(conf, "OrderByApp");

        // 2. set the main class and the map / reduce classes
        job.setJarByClass(OrderByApp.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 3. specify the output kv types
        job.setMapOutputKeyClass(MyIntWriteAble.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 4. set the input path and the output path
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 5. submit the mr job to yarn
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * mapper
     */
    public static class MyMapper extends Mapper<Object, Text, MyIntWriteAble, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // 13480253104	180	180	360
            String[] split = line.split("\t");
            String all = split[3];
            String phone = split[0];
            String up = split[1];
            String down = split[2];
            context.write(new MyIntWriteAble(Integer.parseInt(all)), new Text(phone + "\t" + up + "\t" + down));
        }
    }

    /**
     * reducer
     */
    public static class MyReducer extends Reducer<MyIntWriteAble, Text, Text, NullWritable> {
        /**
         * map:
         * 360,<(13480253104 180 180),(135 120 240)>
         */
        @Override
        protected void reduce(MyIntWriteAble key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(new Text(value + "\t" + key), NullWritable.get());
            }
        }
    }
}
-
- join
  - 1. regular (reduce-side) join
  - 2. map join (see the map-side sketch after the code below)
  - shuffle:
    in production, if an operation can be done without a shuffle, avoid the shuffle
  - input
    - emp.log
    - dept.log
  - map:
    dept: 10,ACCOUNTING,NEW YORK
    emp : 7369,SMITH,CLERK,7902,1980-12-17 00:00:00,800.00,,20
    on key:
      deptno, emp-table columns
      deptno, dept-table columns
  - reduce
    deptno,<(emp-table columns),(dept-table columns)>
    emp-table columns, dept-table columns => the required output
-
package com.bigdata.mapreduce;

import com.bigdata.mapreduce.writeable.MyIntWriteAble;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

/**
 * @author sxwang
 * 11 18 15:56
 */
public class JoinApp {
    /**
     * driver
     */
    public static void main(String[] args) throws Exception {
        String input = "data/join";
        String output = "out/join";

        Configuration conf = new Configuration();
        // 0. todo... delete the target path if it exists
        FileUtils.deletePath(conf, output);

        // 1. set the job name
        Job job = Job.getInstance(conf, "JoinApp");

        // 2. set the main class and the map / reduce classes
        job.setJarByClass(JoinApp.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 3. specify the output kv types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 4. set the input path and the output path
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 5. submit the mr job to yarn
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * mapper
     */
    public static class MyMapper extends Mapper<Object, Text, Text, Text> {

        String tableName;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // the split's file name tells us which table this record belongs to
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            tableName = inputSplit.getPath().getName();
        }

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            if (tableName.startsWith("dept")) {
                // dept table
                String line = value.toString();
                // 10,ACCOUNTING,NEW YORK
                String[] split = line.split(",");
                String deptno = split[0];
                String dname = split[1];
                String loc = split[2];
                context.write(new Text(deptno), new Text(dname + "\t" + loc + "\t" + "dept"));
            } else {
                // emp table
                String line = value.toString();
                String[] split = line.split(",");
                // 7566,JONES,MANAGER,7839,1981-04-02 00:00:00,2975.00,,20
                String deptno = split[split.length - 1];
                String ename = split[1];
                String job = split[2];
                context.write(new Text(deptno), new Text(ename + "\t" + job + "\t" + "emp"));
            }
        }
    }

    /**
     * reducer
     */
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * after map, before reduce:
         * dept + emp
         * 10,<(ACCOUNTING,NEW YORK,dept),(JONES,MANAGER,emp),(JONES,MANAGER,emp),(JONES,MANAGER,emp)>
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String deptTableColumns = "";
            ArrayList<String> empData = new ArrayList<>();
            for (Text value : values) {
                String[] split = value.toString().split("\t");
                String table = split[split.length - 1];
                if (table.equals("dept")) {
                    // dept
                    deptTableColumns = split[0] + "\t" + split[1];
                } else {
                    // emp
                    // JONES	MANAGER	emp
                    empData.add(split[0] + "\t" + split[1]);
                }
            }
            for (String emp : empData) {
                context.write(new Text(emp), new Text(key + "\t" + deptTableColumns));
            }
        }
    }
}
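For the "map join" mentioned above, a minimal map-side sketch, assuming the small dept.log is shipped to each map task through the distributed cache (the class name, the cache-file handling and the driver lines in the comments are illustrative assumptions, not the course code). Because the join happens entirely in the mapper, there is no shuffle and zero reduce tasks are enough:

// Extra imports on top of the ones already used by JoinApp:
// import java.io.BufferedReader;
// import java.io.FileReader;
// import java.util.HashMap;
// import java.util.Map;

// Map-side join: cache the small dept table in memory, stream the big emp table.
public static class MapJoinMapper extends Mapper<Object, Text, Text, NullWritable> {

    private final Map<String, String> deptByDeptno = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // driver (assumption): job.addCacheFile(new URI("hdfs:///data/join/dept.log"));
        // the cached file is available in the task's working directory under its file name
        Path cached = new Path(context.getCacheFiles()[0]);
        try (BufferedReader reader = new BufferedReader(new FileReader(cached.getName()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // 10,ACCOUNTING,NEW YORK
                String[] split = line.split(",");
                deptByDeptno.put(split[0], split[1] + "\t" + split[2]);
            }
        }
    }

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // 7369,SMITH,CLERK,7902,1980-12-17 00:00:00,800.00,,20
        String[] split = value.toString().split(",");
        String deptno = split[split.length - 1];
        String dept = deptByDeptno.get(deptno);   // look up the dept columns in memory
        if (dept != null) {
            context.write(new Text(split[1] + "\t" + split[2] + "\t" + deptno + "\t" + dept),
                    NullWritable.get());
        }
    }
}
// driver (assumption): job.setNumReduceTasks(0);   // no shuffle, no reduce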
- sql (the equivalent query):
    select e.ename, e.job, e.deptno, d.dname, d.loc
    from emp e
    join dept d on e.deptno = d.deptno;