A Summary of Classic MapReduce Examples
First you need a working Hadoop environment; a single-node setup on Windows is enough for running these examples.
1. Per-phone traffic statistics: for each phone number, compute the upstream traffic, downstream traffic, and total traffic.
The sample data below is stored in a file named flow.log and uploaded to the HDFS directory /flow/input. One mapper plus one reducer completes the statistics. The job uses the default TextInputFormat, so the mapper is called once per input line; it takes the phone number as the key and packs the upstream and downstream traffic into a bean, which it emits as the value. The shuffle then groups and merge-sorts all records with the same phone number, so the reducer only has to add up the traffic of each phone number to obtain that phone's upstream, downstream, and total flow (a small standalone parsing sketch follows the sample data).
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
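To make the field indexing concrete, here is a small standalone sketch (plain Java, no Hadoop required) that applies the same split-and-index logic as the mapper to the two sample records of phone 13560439658 and adds them up the way the reducer will. It assumes the raw log is tab-separated, which is what the mapper's split("\t") expects (the sample lines above appear space-separated here only because of how they are rendered); the expected result is up = 1116 + 918 = 2034, down = 954 + 4938 = 5892, total = 7926.
//A standalone sanity check (not part of the MapReduce job): same parsing logic as FlowMapper below.
public class FlowParseDryRun {
    public static void main(String[] args) {
        //the two sample records of 13560439658, joined with tabs as the mapper expects
        String[] lines = {
                "1363157993055\t13560439658\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99\t18\t15\t1116\t954\t200",
                "1363157992093\t13560439658\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99\t15\t9\t918\t4938\t200"
        };
        long sumUp = 0, sumDown = 0;
        for (String line : lines) {
            String[] str = line.split("\t");
            //same indexing as FlowMapper: the 3rd- and 2nd-to-last columns are upFlow and downFlow
            sumUp += Long.parseLong(str[str.length - 3]);
            sumDown += Long.parseLong(str[str.length - 2]);
        }
        //prints: 13560439658 up=2034 down=5892 sum=7926
        System.out.println("13560439658 up=" + sumUp + " down=" + sumDown + " sum=" + (sumUp + sumDown));
    }
}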
The full MapReduce code is as follows:
package com.skymesh.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Iterator;
/**
 * Classic exercise: per-phone traffic statistics.
 */
public class FlowMapReduce {
static class FlowMapper extends Mapper<LongWritable, Text,Text,FlowBean> {
Text t = new Text();
FlowBean bean=new FlowBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("\t");
if (str.length>3){
String phone = str[1];
long upFlow = Long.parseLong(str[str.length - 3]);
long downFlow = Long.parseLong(str[str.length - 2]);
t.set(phone);
bean.setUpFlow(upFlow);
bean.setDownFlow(downFlow);
bean.setSumFlow(upFlow + downFlow);
//The Text and FlowBean objects are reused for every input line; this is safe because
//context.write() serializes the current field values immediately, so we avoid allocating
//a new pair of objects per record.
//context.write(new Text(phone),new FlowBean(upFlow, downFlow));
context.write(t,bean);
}
}
}
static class FlowReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
FlowBean bean=new FlowBean();
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
long sumUpFlow=0;
long sumDFlow=0;
Iterator<FlowBean> iterator = values.iterator();
while (iterator.hasNext()){
FlowBean flowBean = iterator.next();
sumUpFlow = sumUpFlow +flowBean.getUpFlow();
sumDFlow = sumDFlow + flowBean.getDownFlow();
}
bean.setUpFlow(sumUpFlow);
bean.setDownFlow(sumDFlow);
//without this, the reused bean would carry a stale (or zero) total into the output
bean.setSumFlow(sumUpFlow + sumDFlow);
context.write(key,bean);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//point the job at the target HDFS / YARN environment
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.136.128:9000");
conf.set("yarn.resourcemanager.hostname", "hadoop1");
Job job = Job.getInstance(conf);
job.setJarByClass(FlowMapReduce.class);
//set the mapper and reducer implementations
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//declare the map output and final output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//input and output paths on HDFS
FileInputFormat.setInputPaths(job,new Path("/flow/input"));
FileOutputFormat.setOutputPath(job,new Path("/flow/output"));
//submit the job configuration and the jar containing these classes to YARN, then wait for it to finish
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
//---------------------------bean-----------------------//
package com.skymesh.mapreduce;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Writable bean that carries the per-phone upstream, downstream and total flow.
 */
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;
private long downFlow;
private long sumFlow;
//deserialization instantiates the bean via reflection, so an explicit no-arg constructor is required
public FlowBean(){}
public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.downFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getUpFlow() {
return upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public long getSumFlow() {
return sumFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
//fields must be written in exactly the same order in which readFields() reads them back
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
upFlow = dataInput.readLong();
downFlow = dataInput.readLong();
sumFlow = dataInput.readLong();
}
@Override
public String toString() {
return "FlowBean{" +
"upFlow=" + upFlow +
", downFlow=" + downFlow +
", sumFlow=" + sumFlow +
'}';
}
@Override
public int compareTo(FlowBean o) {
//sort by total flow in descending order (and return 0 for equal totals, as the compareTo contract requires)
return Long.compare(o.getSumFlow(), this.sumFlow);
}
}
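One optional optimization for this job: because FlowReducer is a Reducer&lt;Text,FlowBean,Text,FlowBean&gt; whose input and output types match the map output types, and summing flows is associative, it can also be registered as a combiner so each map task pre-aggregates its per-phone flows locally and the shuffle carries far less data. A sketch of the single extra line in the driver (assuming the main() shown above) would be:
//optional: reuse the reducer as a combiner to pre-aggregate flows on the map side
job.setCombinerClass(FlowReducer.class);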
2. Joining large tables with MapReduce, for data volumes a traditional relational database cannot hold.
Raw data model: name the user table and department table files user*.txt and dept*.txt and upload them to the HDFS directory /table/input.
sys_user
userId userName age deptId
10 zhangsan 18 1000
11 lisi 19 1001
12 wangwu 24 1000
13 lier 23 1001
14 jd 32 1001

sys_dept
deptId deptName layerCode
1000 综合部 001
1001 网络安全 002
The Java code:
package com.skymesh.mapreduce;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * MapReduce reduce-side join of two tables.
 * sys_user
 * userId userName age deptId
 * 10 zhangsan 18 1000
 * 11 lisi 19 1001
 * 12 wangwu 24 1000
 * 13 lier 23 1001
 * 14 jd 32 1001
 *
 * sys_dept
 * deptId deptName layerCode
 * 1000 综合部 001
 * 1001 网络安全 002
 */
public class TableJoinMapReduce {
static class TableJoinMapper extends Mapper<LongWritable, Text,Text,TbaleJoinBean>{
TbaleJoinBean bean = new TbaleJoinBean();
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
//the input directory holds both tables, so use the split's file name to tell them apart
FileSplit fileSplit = (FileSplit) context.getInputSplit();
String fileName = fileSplit.getPath().getName();
//split on whitespace so either tab- or space-separated files work
String[] fields = line.split("\\s+");
//user table record: userId userName age deptId
if (fileName.contains("user")){
text.set(fields[3]);
bean.setUserId(fields[0]);
bean.setUserName(fields[1]);
bean.setAge(Integer.parseInt(fields[2]));
bean.setDeptId(fields[3]);
bean.setDeptName("");
bean.setLayerCode("");
bean.setFlag("user");
context.write(text,bean);
}else {//department table record: deptId deptName layerCode
text.set(fields[0]);
bean.setUserId("");
bean.setUserName("");
bean.setAge(0);
bean.setDeptId(fields[0]);
bean.setDeptName(fields[1]);
bean.setLayerCode(fields[2]);
bean.setFlag("dept");
context.write(text,bean);
}
}
}
}
static class TableJoinReducer extends Reducer<Text, TbaleJoinBean,Text,TbaleJoinBean>{
@Override
protected void reduce(Text key, Iterable<TbaleJoinBean> values, Context context) throws IOException, InterruptedException {
Iterator<TbaleJoinBean> iterator = values.iterator();
TbaleJoinBean deptBean = new TbaleJoinBean();
List<TbaleJoinBean> userList = new ArrayList<TbaleJoinBean>();
while (iterator.hasNext()){
TbaleJoinBean bean = iterator.next();
if ("user".equals(bean.getFlag())){ //用户
TbaleJoinBean userBean = new TbaleJoinBean();
try {
BeanUtils.copyProperties(userBean,bean);
userList.add(userBean);
} catch (Exception e) {
e.printStackTrace();
}
}else { //department record
try {
BeanUtils.copyProperties(deptBean,bean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
for (TbaleJoinBean bean: userList){
bean.setDeptName(deptBean.getDeptName());
bean.setLayerCode(deptBean.getLayerCode());
context.write(key,bean);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://192.168.136.128:9000");
conf.set("yarn.resoucemanager.hostname", "hadoop1");
Job job = Job.getInstance(conf);
job.setMapOutputValueClass(TbaleJoinBean.class);
job.setMapOutputKeyClass(Text.class);
job.setMapperClass(TableJoinMapper.class);
job.setReducerClass(TableJoinReducer.class);
job.setJarByClass(TableJoinMapReduce.class);
job.setUser("root");
FileInputFormat.setInputPaths(job,new Path("/table/input"));
FileOutputFormat.setOutputPath(job,new Path("/table/output"));
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
//----------------------------bean------------------------------//
package com.skymesh.mapreduce;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TbaleJoinBean implements Writable {
private String userId;
private String userName;
private int age;
private String deptName;
private String deptId;
private String layerCode;
//marks whether the record came from the user table or the dept table
private String flag;
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
public String getDeptName() {
return deptName;
}
public void setDeptName(String deptName) {
this.deptName = deptName;
}
public String getDeptId() {
return deptId;
}
public void setDeptId(String deptId) {
this.deptId = deptId;
}
public String getLayerCode() {
return layerCode;
}
public void setLayerCode(String layerCode) {
this.layerCode = layerCode;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public String toString() {
return
"userId='" + userId + '\'' +
", userName='" + userName + '\'' +
", age=" + age +
", deptName='" + deptName + '\'' +
", deptId='" + deptId + '\'' +
", layerCode='" + layerCode + '\''+
", flag='"+flag+'\'';
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(userId);
dataOutput.writeUTF(userName);
dataOutput.writeUTF(deptName);
dataOutput.writeUTF(deptId);
dataOutput.writeUTF(layerCode);
dataOutput.writeInt(age);
dataOutput.writeUTF(flag);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
userId=dataInput.readUTF();
userName=dataInput.readUTF();
deptName=dataInput.readUTF();
deptId= dataInput.readUTF();
layerCode =dataInput.readUTF();
age = dataInput.readInt();
flag=dataInput.readUTF();
}
}
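The job above is a reduce-side join, so both tables travel through the shuffle. Because the department table here is tiny, a common alternative is a map-side join with no reducer at all: load the dept table into memory in setup() and emit already-joined user records straight from the mapper. The following is only a sketch, written as if it lived inside TableJoinMapReduce so it can reuse that file's imports; the dept file location /table/input/dept.txt and the output value format are illustrative assumptions, not part of the original code. User lines have four fields, so the length check also skips any dept lines the mapper happens to read; in the driver you would then call job.setNumReduceTasks(0) so the map output is written directly.
static class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    //deptId -> "deptName layerCode", loaded once per map task in setup()
    private final java.util.Map<String, String> deptById = new java.util.HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException {
        //read the small department table straight from HDFS (assumed path)
        org.apache.hadoop.fs.FileSystem fs =
                org.apache.hadoop.fs.FileSystem.get(context.getConfiguration());
        java.io.BufferedReader reader = new java.io.BufferedReader(
                new java.io.InputStreamReader(fs.open(new Path("/table/input/dept.txt"))));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("\\s+");
            if (fields.length >= 3) {
                deptById.put(fields[0], fields[1] + " " + fields[2]);
            }
        }
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //user record: userId userName age deptId
        String[] fields = value.toString().split("\\s+");
        if (fields.length >= 4) {
            String dept = deptById.get(fields[3]);
            if (dept != null) {
                //emit the joined record directly; no reducer is needed
                context.write(new Text(fields[0]), new Text(fields[1] + " " + fields[2] + " " + dept));
            }
        }
    }
}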
3. Computing common friends
Raw data:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

For every pair of users that share common friends, output the pair and the friends they have in common, e.g.:
a-b : c,e
The Java code:
package com.skymesh.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Arrays;
/**
* Classic problem: computing the common friends of every pair of users
* A:B,C,D,F,E,O
* B:A,C,E,K
* C:F,A,D,I
* D:A,E,F,L
* E:B,C,D,M,L
* F:A,B,C,D,E,O,M
* G:A,C,D,E,F
* H:A,C,D,E,O
* I:A,O
* J:B,O
* K:A,C,D
* L:D,E,F
* M:E,F,G
* O:A,H,I,J
*
* For every pair of users that share common friends, output the pair and the friends they have in common.
* For example:
* a-b : c,e
*/
public class SameFriendMapReduce {
static class FriendMapper extends Mapper<LongWritable, Text,Text,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
//each line is "user:friend,friend,..."
String[] userAndFriends = line.split(":");
String user = userAndFriends[0];
String[] friends = userAndFriends[1].split(",");
//sort so that a given pair always produces the same key, e.g. always "B-C" and never "C-B"
Arrays.sort(friends);
for (int i = 0; i < friends.length-1; i++) {
for (int j = i+1; j < friends.length; j++) {
//key: a pair taken from this user's friend list, value: the user
context.write(new Text(friends[i]+"-"+friends[j]), new Text(user));
}
}
}
}
static class FriendReducer extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text pair, Iterable<Text> names, Context context) throws IOException, InterruptedException {
//concatenate everyone grouped under this pair key
StringBuilder buf = new StringBuilder();
for (Text name : names) {
if (buf.length() > 0) {
buf.append(",");
}
buf.append(name);
}
context.write(pair, new Text(buf.toString()));
}
}
public static void main(String[] args) throws Exception {
//point the job at the target HDFS / YARN environment
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.136.128:9000");
conf.set("yarn.resourcemanager.hostname", "hadoop1");
conf.set("mapreduce.reduce.maxattempts","3");
Job job = Job.getInstance(conf);
job.setJarByClass(SameFriendMapReduce.class);
//set the mapper and reducer implementations
job.setMapperClass(FriendMapper.class);
job.setReducerClass(FriendReducer.class);
//declare the map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//input and output paths on HDFS
FileInputFormat.setInputPaths(job,new Path("/friend/input"));
FileOutputFormat.setOutputPath(job,new Path("/friend/output"));
//submit the job configuration and the jar containing these classes to YARN, then wait for it to finish
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
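A caveat on correctness: the job above keys on pairs drawn from each user's own friend list, so for a key like A-B the reducer collects the users whose lists contain both A and B (here only F). That equals the common friends of A and B only when friendship is mutual, and the sample data is not fully symmetric, so the single job will not reproduce the "a-b : c,e" answer (the intersection of A's and B's lists) stated above. The classic solution chains two jobs: a first pass inverts every line into (friend, user) records, and a second pass, shaped just like SameFriendMapReduce, pairs up the users grouped under each friend. Below is a minimal sketch of the first pass (class names are illustrative); since TextOutputFormat separates key and value with a tab, the second job would split its input on "\t" instead of ":".
static class InvertMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //"A:B,C,D" -> emit (B,A), (C,A), (D,A): each friend points back to the user who listed them
        String[] parts = value.toString().split(":");
        String user = parts[0];
        for (String friend : parts[1].split(",")) {
            context.write(new Text(friend), new Text(user));
        }
    }
}

static class InvertReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text friend, Iterable<Text> users, Context context) throws IOException, InterruptedException {
        //output: friend <TAB> comma-separated users who all list this friend,
        //which is the shape the pair-building second pass consumes
        StringBuilder buf = new StringBuilder();
        for (Text user : users) {
            if (buf.length() > 0) buf.append(",");
            buf.append(user);
        }
        context.write(friend, new Text(buf.toString()));
    }
}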