一、入门案例
自定义一个mapreduce程序(自定义分区):
FlowBean.java(实现hadoop的序列化)
package lltj;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Custom bean carrying the upstream, downstream and total traffic of one record.
 * It implements Hadoop's {@code WritableComparable} so it can be serialized and
 * used as a sortable key.
 * Created by tianjun on 2017/3/14.
 */
public class FlowBean implements WritableComparable<FlowBean> {
    long upflow;
    long downflow;
    long sumflow;

    /** Creates an empty bean with all three counters at zero. */
    public FlowBean() {
    }

    /**
     * Creates a bean whose total is the sum of the two given counters.
     *
     * @param upflow   upstream traffic
     * @param downflow downstream traffic
     */
    public FlowBean(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    /**
     * Re-initializes this bean in place so a single instance can be reused
     * across records; the total is recomputed from the two arguments.
     *
     * @param upflow   upstream traffic
     * @param downflow downstream traffic
     */
    public void set(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    public long getUpflow() {
        return upflow;
    }

    /** Sets the upstream counter only; the total is NOT recomputed. */
    public void setUpflow(long upflow) {
        this.upflow = upflow;
    }

    public long getDownflow() {
        return downflow;
    }

    /** Sets the downstream counter only; the total is NOT recomputed. */
    public void setDownflow(long downflow) {
        this.downflow = downflow;
    }

    public long getSumflow() {
        return sumflow;
    }

    public void setSumflow(long sumflow) {
        this.sumflow = sumflow;
    }

    /** Renders the bean as {@code upflow<TAB>downflow<TAB>sumflow}. */
    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }

    /**
     * Custom descending order by total traffic: a larger total sorts first.
     *
     * <p>NOTE(review): this never returns 0, so two beans with equal totals are
     * never reported as equal, which breaks the {@code Comparable} contract.
     * It looks deliberate: FlowCountSortReducer emits only the first value of
     * each key group, so presumably returning 0 would merge records with equal
     * totals and drop all but one. Confirm and update that reducer before
     * changing this method.
     */
    @Override
    public int compareTo(FlowBean o) {
        return sumflow > o.getSumflow() ? -1 : 1;
    }

    /** Serialization: writes the three counters to the output stream. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(downflow);
        out.writeLong(sumflow);
    }

    /**
     * Deserialization: reads the three counters from the input stream, in the
     * same order in which {@link #write(DataOutput)} emitted them.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        downflow = in.readLong();
        sumflow = in.readLong();
    }
}
FlowCount.java
package lltj;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Mapper, reducer and job driver that sum upstream/downstream traffic per phone number.
 * Created by tianjun on 2017/3/14.
 */
public class FlowCount {

    /** Emits (phone number, traffic bean) for every well-formed input line. */
    static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");

            // The phone number is read from index 1 and the two counters from the
            // third-last and second-last columns, so at least 3 columns are needed.
            if (fields.length < 3) {
                System.err.println("Skipping malformed line (too few fields): " + line);
                return;
            }

            long upflow;
            long dflow;
            try {
                upflow = Long.parseLong(fields[fields.length - 3]);
                dflow = Long.parseLong(fields[fields.length - 2]);
            } catch (NumberFormatException e) {
                // Best-effort: a line with non-numeric counters is skipped, not fatal.
                System.err.println("Skipping malformed line (non-numeric flow): " + line);
                return;
            }

            // Kept outside the try block so that I/O failures while writing are
            // propagated to the framework instead of being silently swallowed.
            context.write(new Text(fields[1]), new FlowBean(upflow, dflow));
        }
    }

    /** Adds up the upstream and downstream traffic of all beans sharing one key. */
    static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sumUpFlow = 0;
            long sumDownFlow = 0;
            // Walk over every bean and accumulate both directions separately.
            for (FlowBean bean : values) {
                sumUpFlow += bean.getUpflow();
                sumDownFlow += bean.getDownflow();
            }
            context.write(key, new FlowBean(sumUpFlow, sumDownFlow));
        }
    }

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        conf.set("mapreduce.jobtracker.address", "local");
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowCount.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Custom partitioner together with a matching number of reduce tasks:
        // ProvincePartitioner returns partition numbers 0 through 4.
        job.setPartitionerClass(ProvincePartitioner.class);
        job.setNumReduceTasks(5);

        FileInputFormat.setInputPaths(job, new Path("hdfs://mini01:9000/input/flow.log"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://mini01:9000/wc/output/flow1"));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
ProvincePartitioner.java
package lltj;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
/**
 * Custom rule for distributing map output to reduce tasks: records are routed
 * by the province that the phone number (the key) belongs to, looked up from
 * the first three characters of the key.
 * The key/value types correspond to the map output types.
 * Created by tianjun on 2017/3/14.
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
    /** Number of leading characters of the key used for the lookup. */
    private static final int PREFIX_LENGTH = 3;
    /** Partition used for every key whose prefix is not in {@link #provinceMap}. */
    private static final int DEFAULT_PARTITION = 4;

    static HashMap<String, Integer> provinceMap = new HashMap<>();

    static {
        provinceMap.put("136", 0);
        provinceMap.put("137", 1);
        provinceMap.put("138", 2);
        provinceMap.put("139", 3);
    }

    /**
     * Chooses the partition for one map output record.
     *
     * @param key           map output key (the phone number)
     * @param bean          map output value; not consulted
     * @param numPartitions number of partitions; not consulted, this method
     *                      always answers in the range 0..4
     * @return the partition mapped to the key's three-character prefix, or 4
     *         when the prefix is unknown or the key is shorter than three characters
     */
    @Override
    public int getPartition(Text key, FlowBean bean, int numPartitions) {
        String phone = key.toString();
        // Guard against short keys: substring(0, 3) would otherwise throw.
        if (phone.length() < PREFIX_LENGTH) {
            return DEFAULT_PARTITION;
        }
        Integer code = provinceMap.get(phone.substring(0, PREFIX_LENGTH));
        return code == null ? DEFAULT_PARTITION : code;
    }
}
二、切片源码简析
源码简析:
首先由waitForCompletion–>
job.submit()–>
JobSubmitter(成员:(
================= Cluster(成员:
======================== proxy(其中一种就是yarnRunner,如果本地就是localJobRunner)
======================)
)–>
调用FileInputFormat.getSplits()获取切片规划的List—>
序列化生成job.split
—>
将job相关参数写到job.xml文件
原理图如下:
三、入门案例增加
随便打开一个最后的结果:
[root@mini01 ~]# hdfs dfs -cat /wc/output/flow1/part-r-00004
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
可以观察到,并没有按照总流量的大小排序,现在我们在这个基础上解决这个问题.
思路:
由于mapTask的输出是按key排序的,所以在map中我们可以把flowBean作为key,这样只需要重写flowBean所实现的WritableComparable接口中的compareTo()方法,即可轻松实现排序的要求。最后,我们让reduce把结果汇总到一个文件里,这样所有的数据就都按顺序排列了。
FlowCountSort.java
package lltj;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
/**
 * Sorts the per-phone traffic totals produced by the flow-count job.
 * The bean is used as the map output key so that records are ordered by
 * {@code FlowBean.compareTo}.
 * Created by tianjun on 2017/3/17.
 */
public class FlowCountSort {

    /** Turns each {@code phone<TAB>up<TAB>down...} line into (bean, phone). */
    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
        // Reused for every record; per the original author's note, write()
        // serializes the objects immediately, so reusing them does not make
        // all emitted records share the last value.
        FlowBean flowBean = new FlowBean();
        Text t = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input is the output of the traffic-summing job.
            String[] fields = value.toString().split("\t");
            long upFlow = Long.parseLong(fields[1]);
            long dFlow = Long.parseLong(fields[2]);

            flowBean.setUpflow(upFlow);
            flowBean.setDownflow(dFlow);
            flowBean.setSumflow(upFlow + dFlow);
            t.set(fields[0]);

            // Output is ordered by key, i.e. by the bean.
            context.write(flowBean, t);
        }
    }

    /** Swaps key and value back so the output reads (phone, bean). */
    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
        @Override
        protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Emit every phone in the group rather than only the first one, so
            // no record is lost if several phones ever share one key group.
            for (Text phone : values) {
                context.write(phone, key);
            }
        }
    }

    /**
     * Configures and runs the sort job, deleting a pre-existing output
     * directory first; exits with status 0 on success and 1 on failure.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        conf.set("mapreduce.jobtracker.address", "local");
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowCountSort.class);

        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // URI.create is used because the URI is a constant literal; it avoids the
        // checked URISyntaxException that main does not declare.
        FileSystem fs = FileSystem.get(URI.create("hdfs://mini01:9000"), new Configuration(), "root");
        Path outputPath = new Path("hdfs://mini01:9000/wc/output/flowSort");
        // Remove a previous run's output so the job can write to the same path again.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        FileInputFormat.setInputPaths(job, new Path("hdfs://mini01:9000/wc/output/flow1"));
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
结果如下:
[root@mini01 ~]# hdfs dfs -cat /wc/output/flowSort/*
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726230503 2481 24681 27162
13726238888 2481 24681 27162
18320173382 9531 2412 11943
13560439658 2034 5892 7926
13660577991 6960 690 7650
15013685858 3659 3538 7197
13922314466 3008 3720 6728
15920133257 3156 2936 6092
84138413 4116 1432 5548
13602846565 1938 2910 4848
18211575961 1527 2106 3633
15989002119 1938 180 2118
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13719199419 240 0 240
13760778710 120 120 240
13926251106 240 0 240
四、切片需要注意的问题
关于大量小文件的优化策略:
1)默认情况下,TextInputFormat对任务的切片机制是按照文件规划切片,不管文件多小,都会是一个单独的切片,都会交给一个maptask,这样如果有大量小文件,就会产生大量的maptask,处理效率极其低下
2)优化策略:
最好的办法:在数据处理系统的最前端(预处理/采集),就将小文件先合并成大文件,再上传到HDFS做后续分析。
补救方法:如果已经是大量小文件在hdfs上了,可以使用另外一种InputFormat来做切片(CombineFileInputFormat),它的切片逻辑和FileInputFormat不同,它可以将多个小文件从逻辑上规划到一个切片中,这样,多个小文件就可以交给一个maptask了。
使用方式:
在driver类中
// If no InputFormat is set, TextInputFormat.class is used by default.
// CombineFileInputFormat itself is abstract and cannot be instantiated by the
// framework, so its concrete text implementation is configured here
// (import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat).
wcjob.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(wcjob, 4194304); // 4 MiB
CombineTextInputFormat.setMinInputSplitSize(wcjob, 2097152); // 2 MiB