标签(空格分隔): hadoop
简介
本节主要通过一些案例讲解hdfs在业务中的日常应用,用于训练我们使用hdfs(以及mr)的方式。(ps:mr程序的本质是先根据规则做数据拆分,之后根据key做好reduce端的分组操作)
1 案例
数据存放: 链接:https://pan.baidu.com/s/1nsAcNdWE_glFqyx4AJ-GVg
提取码:lkdr
1.1 使用mr实现 join
1.数据准备:join主要包含两方面的数据,班级和学员信息
班级信息
班级id | 班级 |
---|---|
1 | 1班 |
2 | 2班 |
学员信息
10,yifang,15,1 (字段依次为:学员id,学员名称,年龄,所属班级id)
2.案例分析
需要将这两种信息汇总到一起,那么如何实现join呢?其实只需要控制好key的流转即可:因为reduce本身会根据key做好分组,如果我们以班级id作为key,让同一个班级的班级信息和学员信息分到同一组,那么就可以实现学员信息和班级信息的join。
3.code
可去 github mr/join下查看代码
package com.lcy.hadoop.mr.join;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.lcy.hadoop.mr.flowsum.FlowBean;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Reduce-side join of class records with student records.
 *
 * <p>The mapper keys every record by its class id, so a single reduce call receives the class
 * record together with all students that belong to that class.
 *
 * <p>Created by luo on 2019/6/2.
 */
public class JoinDriver {

    /** Parses one comma-separated line of either input file into a {@link JoinBean} keyed by class id. */
    static class JoinMapper extends Mapper<LongWritable, Text, IntWritable, JoinBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // The input split carries the path of the file being read; its name tells us
            // whether the current line is a class record or a student record.
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();
            JoinBean joinBean;
            int cid;
            if (fileName.contains("class")) {
                // Class record: "classId,className"; the student fields get placeholder values.
                cid = Integer.parseInt(fields[0]);
                joinBean = new JoinBean(-1, "", -1, cid, fields[1], true);
            } else {
                // Student record: "studentId,studentName,age,classId".
                cid = Integer.parseInt(fields[3]);
                joinBean = new JoinBean(Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]), cid, "", false);
            }
            context.write(new IntWritable(cid), joinBean);
        }
    }

    /** Emits every student of one class id with the class name filled in. */
    static class JoinReducer extends Reducer<IntWritable, JoinBean, JoinBean, NullWritable> {
        @Override
        protected void reduce(IntWritable key, Iterable<JoinBean> joinBeans, Context context) throws IOException, InterruptedException {
            // Stays empty when the group has no class record, so unmatched students are
            // written with an empty class name instead of the text "null".
            String className = "";
            List<JoinBean> students = new ArrayList<>();
            for (JoinBean bean : joinBeans) {
                if (bean.isClassFlas()) {
                    className = bean.getCName();
                } else {
                    // The iterated bean is deserialized in place for every element, so a later
                    // element would overwrite an earlier one; keep an independent copy.
                    students.add(copyOf(bean));
                }
            }
            for (JoinBean student : students) {
                student.setCName(className);
                context.write(student, NullWritable.get());
            }
        }

        /** Returns a field-by-field copy of {@code source}. */
        private static JoinBean copyOf(JoinBean source) {
            return new JoinBean(source.getSId(), source.getSName(), source.getSAge(),
                    source.getCId(), source.getCName(), source.isClassFlas());
        }
    }

    /**
     * Configures and runs the join job on the local file system.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException if the job cannot be set up or fails with an I/O error
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: JoinDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        // No settings are needed for a purely local run; to submit to yarn, set the framework
        // to yarn and point the file system at hdfs instead.
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setReducerClass(JoinReducer.class);
        // The reducer emits JoinBean keys; the declared output key class must match.
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Removes a previous output file or directory so the job can recreate it.
     *
     * @param arg local path of the file or directory to delete; a missing path is ignored
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Deletes {@code target} and, for a directory, everything below it (best effort). */
    private static void deleteRecursively(File target) {
        if (!target.exists()) {
            return;
        }
        if (target.isDirectory()) {
            // listFiles() returns children already resolved against the directory itself,
            // and null when the directory cannot be read.
            File[] children = target.listFiles();
            if (children != null) {
                for (File child : children) {
                    deleteRecursively(child);
                }
            }
        }
        target.delete();
    }
}
JoinBean
package com.lcy.hadoop.mr.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import lombok.Data;
import org.apache.hadoop.io.Writable;
/**
-
Created by luo on 2019/6/2.
*/
@Data
public class JoinBean implements Writable {
//学员id 学员名称 年龄 所属班级id
private int sId;
private String sName;
private int sAge;
private int cId; //班级id
private String cName;//班级名称
private boolean isClassFlas;public JoinBean(int sId, String sName, int sAge, int cId,String cName,boolean isClassFlas) {
this.sId = sId;
this.sName = sName;
this.sAge = sAge;
this.cId = cId;
this.isClassFlas = isClassFlas;
this.cName = cName;}
public JoinBean() {
}@Override
public void write(DataOutput output) throws IOException {
output.writeInt(sId);
output.writeUTF(sName);
output.writeInt(sAge);
output.writeInt(cId);
output.writeUTF(cName);
output.writeBoolean(isClassFlas);
}@Override
public void readFields(DataInput input) throws IOException {
this.sId = input.readInt();
this.sName = input.readUTF();
this.sAge = input.readInt();
this.cId = input.readInt();
this.cName = input.readUTF();
this.isClassFlas = input.readBoolean();
}@Override
public String toString() {
return “” + sId + ‘\t’ +
sName + ‘\t’ +
sAge + ‘\t’ +
cId + ‘\t’ +
cName;
}
}
1.2 数据倾斜处理思路之去除reduce(map端join)
1.数据准备:数据还是刚刚那个数据
2.案例分析:仍以上面的join程序为例,假如学生信息非常多,我们会适当调大reduce的数量;但这时按key做hash分区之后,可能出现某些reducetask处理的数据特别多、而另一些reducetask处理的数据特别少的情况(即数据倾斜),那么我们怎样才能解决这种情况呢?(ps:这里假设班级信息有限,而学生信息非常多)思路就是:假如我们可以在map端直接实现join,那么就不用担心reduce端的数据倾斜了。maptask启动后会先调用setup,之后每读取一条数据调用一次map,最后调用cleanup;我们在setup中把班级信息加载到内存,就能完成这个针对数据倾斜的方案。
package com.lcy.hadoop.mr.mapjoin;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import com.lcy.hadoop.mr.join.JoinBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-side join: the class file is loaded into memory in {@code setup}, every student record is
 * completed with its class name inside the mapper, and the job runs without reduce tasks.
 *
 * <p>Created by luo on 2019/6/2.
 */
public class MapJoinDriver {

    /** Joins each student line against the in-memory class table. */
    static class MapJoinMapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {
        /** Class id to class name, filled once per task in {@link #setup}. */
        private final Map<Integer, String> classMap = new HashMap<>();

        /**
         * Loads {@code join_class.txt} ("classId,className" per line) from the task's working
         * directory into {@link #classMap}.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // try-with-resources closes the reader on every path and, unlike a manual
            // finally-close, cannot hit a null reader when opening the file fails.
            // NOTE(review): the file is decoded with the platform default charset — confirm it
            // matches the encoding of join_class.txt.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("join_class.txt")))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] fields = line.split(",");
                    classMap.put(Integer.valueOf(fields[0]), fields[1]);
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input split carries the path of the file being read; only lines coming from
            // a student file are joined, anything else is skipped.
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();
            if (!fileName.contains("student")) {
                return;
            }
            // Student record: "studentId,studentName,age,classId".
            String[] fields = value.toString().split(",");
            JoinBean joinBean = new JoinBean(Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]), Integer.parseInt(fields[3]), "", false);
            joinBean.setCName(classMap.get(joinBean.getCId()));
            context.write(joinBean, NullWritable.get());
        }
    }

    /**
     * Configures and runs the map-only join job on the local file system.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException if the job cannot be set up or fails with an I/O error
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException if waiting for the job is interrupted
     * @throws URISyntaxException if the cache file URI is malformed
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        // No settings are needed for a purely local run; to submit to yarn, set the framework
        // to yarn and point the file system at hdfs instead.
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapJoinDriver.class);
        job.addCacheFile(new URI("file:/E:/mr/join/input/join_class.txt"));
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(JoinBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Zero reduce tasks: the mapper output is the final output.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Removes a previous output file or directory so the job can recreate it.
     *
     * @param arg local path of the file or directory to delete; a missing path is ignored
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Deletes {@code target} and, for a directory, everything below it (best effort). */
    private static void deleteRecursively(File target) {
        if (!target.exists()) {
            return;
        }
        if (target.isDirectory()) {
            // listFiles() returns children already resolved against the directory itself,
            // and null when the directory cannot be read.
            File[] children = target.listFiles();
            if (children != null) {
                for (File child : children) {
                    deleteRecursively(child);
                }
            }
        }
        target.delete();
    }
}
1.3 求两两之间的共同好友
1.数据准备:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
求出哪些人两两之间有共同好友,及他俩的共同好友都是谁
比如:
a-b : c ,e
2.案例分析
这道题的思路有很多,我讲一下我这边的解题思路:1)第一个mr先找出每个人是哪些人的好友,即以某人为key,输出所有把他当作好友的人;2)之后起第二个mr,先对这些人进行排序(因为两两共同好友 a-b:c 和 b-a:c 是一致的,所以需要统一处理),再两两组合,以两人组合为key、对应的共同好友为value输出,最后在reduce端把这两人拥有的所有共同好友汇总输出。
3.code (这个案例的代码放在mr/friend里面)
第一步:获取以key为共同好友的所有人
package com.lcy.hadoop.mr.friend;
import java.io.IOException;
import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * First job of the common-friends example: inverts the "person:friend,friend,..." lists so that
 * each output line names a friend followed by everyone who lists that friend.
 *
 * <p>Created by luo on 2019/6/2.
 */
public class FriendOneDriver {

    /**
     * For an input line such as {@code A:B,C,D} emits {@code (B, A)}, {@code (C, A)} and
     * {@code (D, A)}: the key is the friend, the value is the person who has that friend.
     */
    static class FriendOneMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Both writables are reused across records; context.write serializes them immediately.
        private final Text friendKey = new Text();
        private final Text fValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split(":");
            if (parts.length < 2) {
                // Blank line or a person without a friend list: nothing to emit.
                return;
            }
            fValue.set(parts[0]);
            for (String friend : parts[1].split(",")) {
                friendKey.set(friend);
                context.write(friendKey, fValue);
            }
        }
    }

    /** Collects everyone who lists the key as a friend into one comma-terminated string. */
    static class FriendOneReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Writes the key together with all of its values, each followed by a comma.
         *
         * @param key the shared friend
         * @param values the people who list {@code key} as a friend
         * @param context job context used to emit the result
         * @throws IOException if writing the output fails
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text v : values) {
                // The trailing comma is kept on purpose so the output format stays unchanged
                // for whatever consumes this job's result.
                sb.append(v.toString()).append(",");
            }
            context.write(key, new Text(sb.toString()));
        }
    }

    /**
     * Configures and runs the first job on the local file system.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException if the job cannot be set up or fails with an I/O error
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        // No settings are needed for a purely local run; to submit to yarn, set the framework
        // to yarn and point the file system at hdfs instead.
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendOneDriver.class);
        job.setMapperClass(FriendOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(FriendOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileUtils.deleteFile(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }
}
2.第二步 获取所有的两两拥有的共同好友
package com.lcy.hadoop.mr.friend;
import java.io.IOException;
import java.util.Arrays;
import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Second job of the common-friends example: turns "friend TAB person,person,..." lines into
 * "personA--personB TAB friend,friend,..." lines.
 *
 * <p>Created by luo on 2019/6/2.
 */
public class FriendTwoDriver {

    /**
     * Pairs up the people of one input line; each pair becomes a key and the line's first
     * column becomes the value.
     */
    static class FriendTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text fValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] columns = value.toString().split("\t");
            fValue.set(columns[0]);
            String[] people = columns[1].split(",");
            // Sorting first makes every pair come out in one fixed order (a--b, never b--a).
            Arrays.sort(people);
            int count = people.length;
            for (int left = 0; left < count - 1; left++) {
                String prefix = people[left] + "--";
                for (int right = left + 1; right < count; right++) {
                    context.write(new Text(prefix + people[right]), fValue);
                }
            }
        }
    }

    /** Joins all values of one pair key into a single comma-separated string. */
    static class FriendTwoReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Writes the key with its values joined by commas.
         *
         * @param key the "personA--personB" pair
         * @param values the values collected for that pair
         * @param context job context used to emit the result
         * @throws IOException if writing the output fails
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text item : values) {
                joined.append(item.toString());
                joined.append(',');
            }
            // Drop the trailing comma whenever the buffer holds more than that single character.
            int length = joined.length();
            String result = length > 1 ? joined.substring(0, length - 1) : joined.toString();
            context.write(key, new Text(result));
        }
    }

    /**
     * Configures and runs the second job on the local file system.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException if the job cannot be set up or fails with an I/O error
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        // No settings are needed for a purely local run; to submit to yarn, set the framework
        // to yarn and point the file system at hdfs instead.
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendTwoDriver.class);

        // Map side.
        job.setMapperClass(FriendTwoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce side; the reducer doubles as combiner because pre-joining values does not
        // change the final result.
        job.setReducerClass(FriendTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setCombinerClass(FriendTwoReducer.class);

        String inputDir = args[0];
        String outputDir = args[1];
        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileUtils.deleteFile(outputDir);
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        if (job.waitForCompletion(true)) {
            System.exit(0);
        } else {
            System.exit(1);
        }
    }
}
2 总结
我觉得mr程序主要的难点就在于对key的定位,如果key定位好了,统计分析程序用一到两个mr就能实现。所以我们在做mr程序的时候一定要考虑好用什么作为key。