[MapReduce] Series Study Notes
1 Requirements
Write a MapReduce analysis program over the data file phone_data.txt that satisfies the following requirements:
1) For each phone number, compute the total upstream traffic, total downstream traffic, and overall total traffic.
2) Write the statistics to different output files according to the number segment (the first 3 digits of the phone number).
3) Sort the results produced by requirement 1) by total traffic.
4) Within each per-segment output file produced by requirement 2), sort records by total traffic.
Input data format:
Output data format:
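A hypothetical illustration of the two formats (field layout and values invented; only the field positions later read by the map step are load-bearing):

```
# input: one record per line, e.g. id, phone, MAC, ip, upFlow, downFlow, status
1  13736230513  00-FD-07-A4-72-B8  192.196.100.82  2481  24681  200

# output of the summation job: phone <TAB> "up down sum"
13736230513	2481 24681 27162
```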
2 Experiment Environment

| Environment | Version |
|---|---|
| Operating system | Linux (CentOS 7) |
| JDK | 1.8.0_271 |
| IDE | IntelliJ IDEA 2020.2 |
| Hadoop | 3.2.1 |
3 Experiment Content
3.1 Data Preparation
The experiment uses the provided phone_data file. Each line contains a phone number, MAC address, IP address, upstream traffic, downstream traffic, and other fields; see the hypothetical sample line in Section 1.
3.2 Analysis and Design
Mapping the four functional requirements onto the MapReduce processing model gives the following designs.
Summation
Key/value flow (a worked example follows this list):
K1: byte offset of the line  V1: the text of the line
K2: phone number  V2: a PhoneFlow object built from the upstream and downstream traffic
K3: phone number  V3: a PhoneFlow object built from the sums of all upstream and all downstream traffic
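With invented numbers, two map outputs for the same phone are grouped by the shuffle and summed by the reducer:

```
map:     ("13736230513", PhoneFlow(up=2481, down=24681))
         ("13736230513", PhoneFlow(up=100,  down=200))
shuffle: ("13736230513", [PhoneFlow(2481, 24681), PhoneFlow(100, 200)])
reduce:  ("13736230513", PhoneFlow(up=2581, down=24881, sum=27462))
```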
Partitioning
Unlike the summation task, this one adds a Partitioner class. It defines 7 partitions, one each for the segments 134, 135, 136, 137, 138 and 139 plus one for everything else, returning partition indices 0, 1, 2, 3, 4, 5 and 6 respectively.
Total Sort
This task takes the output of the summation task as its input. The PhoneFlow2 class implements WritableComparable and overrides compareTo to define the ordering, so wherever Hadoop compares keys of this type they are automatically sorted by that rule. Key/value flow (a worked example follows this list):
K1: byte offset of the line  V1: the text of the line
K2: PhoneFlow2 object  V2: phone number
K3: phone number  V3: PhoneFlow2 object
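With invented values, and assuming the tab-separated output of the summation job as input:

```
input:   13736230513	2481 24681 27162
map:     K2 = PhoneFlow2(2481, 24681), V2 = "13736230513"
shuffle: keys sorted by compareTo, i.e. ascending total flow
reduce:  K3 = "13736230513", V3 = "2481 24681 27162"
```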
Partitioned Sort
This task combines the Partitioner with the sorted key type, so that each number segment's output file is internally ordered by total traffic. Key/value flow:
K1: byte offset of the line  V1: the text of the line
K2: PhoneFlow2 object  V2: phone number
K3: phone number  V3: PhoneFlow2 object
4 Code
4.1 Summation
PhoneFlow
package com.tao.CountFlow;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * @author zhangtao
 * Encapsulates upstream traffic, downstream traffic, and total traffic.
 */
public class PhoneFlow implements Writable {

    private Long up;
    private Long down;
    private Long sum;

    public PhoneFlow() {
    }

    public PhoneFlow(Long up, Long down) {
        this.up = up;
        this.down = down;
        this.sum = up + down;
    }

    public Long getUp() {
        return up;
    }

    public void setUp(Long up) {
        this.up = up;
    }

    public Long getDown() {
        return down;
    }

    public void setDown(Long down) {
        this.down = down;
    }

    public Long getSum() {
        return sum;
    }

    public void setSum(Long sum) {
        this.sum = sum;
    }

    @Override
    public String toString() {
        return up + " " + down + " " + sum;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(up);
        dataOutput.writeLong(down);
        dataOutput.writeLong(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        up = dataInput.readLong();
        down = dataInput.readLong();
        sum = dataInput.readLong();
    }
}
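The Writable contract can be sanity-checked locally with a serialization round trip. A minimal sketch, assuming it sits in the same com.tao.CountFlow package (class name and test values invented):

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class PhoneFlowRoundTrip {
    public static void main(String[] args) throws IOException {
        PhoneFlow out = new PhoneFlow(2481L, 24681L);
        // serialize via write()
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        out.write(new DataOutputStream(buf));
        // deserialize via readFields()
        PhoneFlow in = new PhoneFlow();
        in.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(in); // 2481 24681 27162
    }
}
```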
Mapper
package com.tao.CountFlow;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @author zhangtao
*/
public class PhoneFlowMapper extends Mapper<LongWritable, Text, Text, PhoneFlow> {

    /**
     * Parameters:
     * key:     k1, the byte offset of the line
     * value:   v1, the text of the line
     * context: the task context
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Convert the line to a String and split it on whitespace
        String[] split = value.toString().split("\\s+");
        // 2. Extract the phone number, upstream traffic, and downstream traffic
        String phoneNumber = split[1];
        long up = Long.parseLong(split[split.length - 3]);
        long down = Long.parseLong(split[split.length - 2]);
        // 3. Write to the context
        context.write(new Text(phoneNumber), new PhoneFlow(up, down));
    }
}
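The indices are worth a dry run: the phone number is always the second field, while the flow fields are addressed from the end of the record so that lines with extra middle fields still parse. A hedged check on a hypothetical line (field values invented):

```java
public class SplitDemo {
    public static void main(String[] args) {
        // hypothetical record: id, phone, MAC, ip, upFlow, downFlow, status
        String line = "1\t13736230513\t00-FD-07-A4-72-B8\t192.196.100.82\t2481\t24681\t200";
        String[] split = line.split("\\s+");
        System.out.println(split[1]);                // 13736230513 (phone)
        System.out.println(split[split.length - 3]); // 2481        (up)
        System.out.println(split[split.length - 2]); // 24681       (down)
    }
}
```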
Reducer
package com.tao.CountFlow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @author zhangtao
*/
public class PhoneFlowReducer extends Reducer<Text, PhoneFlow, Text, PhoneFlow> {

    @Override
    protected void reduce(Text key, Iterable<PhoneFlow> values, Context context) throws IOException, InterruptedException {
        long sumUp = 0;
        long sumDown = 0;
        // 1. Sum all upstream and all downstream traffic for this phone
        for (PhoneFlow phoneFlow : values) {
            sumUp = sumUp + phoneFlow.getUp();
            sumDown = sumDown + phoneFlow.getDown();
        }
        // 2. Build the result; the constructor fills in the total
        PhoneFlow phoneFlow = new PhoneFlow(sumUp, sumDown);
        // 3. Write to the context
        context.write(key, phoneFlow);
    }
}
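One Hadoop detail worth flagging: the framework reuses a single PhoneFlow instance across iterations of values, so reading fields inside the loop (as above) is safe, but holding on to the references would not be. A hedged sketch of the pitfall, as it would appear inside reduce() (java.util imports assumed):

```java
// WRONG: every list element ends up pointing at the same reused object
List<PhoneFlow> kept = new ArrayList<>();
for (PhoneFlow f : values) {
    kept.add(f); // a safe copy would be: new PhoneFlow(f.getUp(), f.getDown())
}
```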
Main
package com.tao.CountFlow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
* @author zhangtao
*/
public class PhoneFlowMain extends Configured implements Tool {

    /**
     * Configures and submits the job.
     */
    public int run(String[] strings) throws Exception {
        // 1. Create a job
        Job job = Job.getInstance(super.getConf(), "phoneFlow");
        // a single setJarByClass on the driver class is enough for Hadoop to locate the jar
        job.setJarByClass(PhoneFlowMain.class);
        // 2. Configure the job (eight steps)
        // Step 1: input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.164.134:9000/PhoneData"));
        // Step 2: mapper class and map output types
        job.setMapperClass(PhoneFlowMapper.class);
        // k2 type
        job.setMapOutputKeyClass(Text.class);
        // v2 type
        job.setMapOutputValueClass(PhoneFlow.class);
        // Steps 3-6: shuffle, using the defaults
        // Step 7: reducer class and final output types
        job.setReducerClass(PhoneFlowReducer.class);
        // k3 type
        job.setOutputKeyClass(Text.class);
        // v3 type
        job.setOutputValueClass(PhoneFlow.class);
        // Step 8: output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("hdfs://192.168.164.134:9000/out/PhoneFlow_out");
        TextOutputFormat.setOutputPath(job, path);
        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.164.134:9000"), new Configuration());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // launch the job
        int run = ToolRunner.run(conf, new PhoneFlowMain(), args);
        System.exit(run);
    }
}
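The paths above are hardcoded. A hedged alternative sketch, assuming the same job setup, takes them from the command line instead (ToolRunner consumes the generic Hadoop options and passes the remainder through as strings):

```java
// inside run(String[] strings), replacing the hardcoded paths:
TextInputFormat.addInputPath(job, new Path(strings[0]));
Path path = new Path(strings[1]);
TextOutputFormat.setOutputPath(job, path);
// launched as (jar name hypothetical):
// hadoop jar flow.jar com.tao.CountFlow.PhoneFlowMain /PhoneData /out/PhoneFlow_out
```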
4.2 Partitioning
Partitioner
package com.tao.LocationPart;
import com.tao.CountFlow.PhoneFlow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * @author zhangtao
 * The key/value types match the map output (k2 = Text, v2 = PhoneFlow).
 */
public class MyPartitioner extends Partitioner<Text, PhoneFlow> {

    @Override
    public int getPartition(Text text, PhoneFlow phoneFlow, int i) {
        // 1. Take the first three digits of the phone number
        String num = text.toString().substring(0, 3);
        // 2. The data contains numbers starting with 134, 135, 136, 137, 138, 139, 182, 841, 159, 150, 183;
        //    partition into 134, 135, 136, 137, 138, 139 and "everything else"
        switch (num) {
            case "134": return 0;
            case "135": return 1;
            case "136": return 2;
            case "137": return 3;
            case "138": return 4;
            case "139": return 5;
            default:    return 6;
        }
    }
}
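The partition logic is easy to exercise without a cluster; a minimal check with invented numbers (getPartition never touches the PhoneFlow value, so null is passed):

```java
import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        MyPartitioner p = new MyPartitioner();
        System.out.println(p.getPartition(new Text("13412345678"), null, 7)); // 0
        System.out.println(p.getPartition(new Text("13912345678"), null, 7)); // 5
        System.out.println(p.getPartition(new Text("15012345678"), null, 7)); // 6 (everything else)
    }
}
```

Because indices 0 through 6 are returned unconditionally, the job must run with job.setNumReduceTasks(7); with fewer reduce tasks, an index at or above the task count makes the map tasks fail.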
Main
package com.tao.LocationPart;
import com.tao.CountFlow.PhoneFlow;
import com.tao.CountFlow.PhoneFlowMapper;
import com.tao.CountFlow.PhoneFlowReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
* @author zhangtao
*/
public class LocationPartMain extends Configured implements Tool {

    /**
     * Configures and submits the job.
     */
    public int run(String[] strings) throws Exception {
        // 1. Create a job
        Job job = Job.getInstance(super.getConf(), "locationPart");
        job.setJarByClass(LocationPartMain.class);
        // 2. Configure the job (eight steps)
        // Step 1: input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.164.134:9000/PhoneData"));
        // Step 2: mapper class and map output types (reused from the summation job)
        job.setMapperClass(PhoneFlowMapper.class);
        // k2 type
        job.setMapOutputKeyClass(Text.class);
        // v2 type
        job.setMapOutputValueClass(PhoneFlow.class);
        // Steps 3-6: shuffle, with a custom partitioner
        job.setPartitionerClass(MyPartitioner.class);
        // one reduce task per partition
        job.setNumReduceTasks(7);
        // Step 7: reducer class and final output types (reused from the summation job)
        job.setReducerClass(PhoneFlowReducer.class);
        // k3 type
        job.setOutputKeyClass(Text.class);
        // v3 type
        job.setOutputValueClass(PhoneFlow.class);
        // Step 8: output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("hdfs://192.168.164.134:9000/out/locationPart_out");
        TextOutputFormat.setOutputPath(job, path);
        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.164.134:9000"), new Configuration());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // launch the job
        int run = ToolRunner.run(conf, new LocationPartMain(), args);
        System.exit(run);
    }
}
4.3 Total Sort
PhoneFlow2
package com.tao.CountFlowSort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * @author zhangtao
 * Encapsulates upstream traffic, downstream traffic, and total traffic,
 * and defines the sort order on the total.
 */
public class PhoneFlow2 implements WritableComparable<PhoneFlow2> {

    private Long up;
    private Long down;
    private Long sum;

    public PhoneFlow2() {
    }

    public PhoneFlow2(Long up, Long down) {
        this.up = up;
        this.down = down;
        this.sum = up + down;
    }

    public Long getUp() {
        return up;
    }

    public void setUp(Long up) {
        this.up = up;
    }

    public Long getDown() {
        return down;
    }

    public void setDown(Long down) {
        this.down = down;
    }

    public Long getSum() {
        return sum;
    }

    public void setSum(Long sum) {
        this.sum = sum;
    }

    @Override
    public String toString() {
        return up + " " + down + " " + sum;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(up);
        dataOutput.writeLong(down);
        dataOutput.writeLong(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        up = dataInput.readLong();
        down = dataInput.readLong();
        sum = dataInput.readLong();
    }

    /**
     * Defines the sort order used during the shuffle.
     */
    @Override
    public int compareTo(PhoneFlow2 o) {
        // ascending by total flow; Long.compare avoids the overflow risk of
        // casting a long difference down to int
        return Long.compare(this.sum, o.getSum());
        // descending: return Long.compare(o.getSum(), this.sum);
    }
}
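Since WritableComparable extends java.lang.Comparable, the ordering can be sanity-checked locally; a minimal sketch with invented values:

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortDemo {
    public static void main(String[] args) {
        List<PhoneFlow2> flows = new ArrayList<>();
        flows.add(new PhoneFlow2(100L, 200L)); // sum = 300
        flows.add(new PhoneFlow2(1L, 2L));     // sum = 3
        Collections.sort(flows);               // uses compareTo: ascending by sum
        System.out.println(flows);             // [1 2 3, 100 200 300]
    }
}
```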
Mapper
package com.tao.CountFlowSort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * @author zhangtao
 * k2: PhoneFlow2
 * v2: phone number
 */
public class FlowSortMapper extends Mapper<LongWritable, Text, PhoneFlow2, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split v1 to recover the phone number and flow figures
        String[] data = value.toString().split("\\s+");
        String phone = data[0];
        long up = Long.parseLong(data[1]);
        long down = Long.parseLong(data[2]);
        // build k2 (the sort key)
        PhoneFlow2 phoneFlow = new PhoneFlow2(up, down);
        // build v2 (the phone number)
        Text text = new Text(phone);
        // write to the context
        context.write(phoneFlow, text);
    }
}
Reducer
package com.tao.CountFlowSort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * @author zhangtao
 * k2: PhoneFlow2  v2: phone number
 * k3: phone number  v3: PhoneFlow2
 */
public class FlowSortReducer extends Reducer<PhoneFlow2, Text, Text, PhoneFlow2> {

    @Override
    protected void reduce(PhoneFlow2 key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // phones whose totals compare equal arrive in the same call;
        // swap key and value so each phone is written with its flow
        for (Text value : values) {
            context.write(value, key);
        }
    }
}
Main
package com.tao.CountFlowSort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
* @author zhangtao
*/
public class FlowSortMain extends Configured implements Tool {

    /**
     * Configures and submits the job.
     */
    public int run(String[] strings) throws Exception {
        // 1. Create a job
        Job job = Job.getInstance(super.getConf(), "Flowsort");
        job.setJarByClass(FlowSortMain.class);
        // 2. Configure the job (eight steps)
        // Step 1: input format and input path (the summation job's output)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.164.134:9000/out/PhoneFlow_out"));
        // Step 2: mapper class and map output types
        job.setMapperClass(FlowSortMapper.class);
        // k2 type
        job.setMapOutputKeyClass(PhoneFlow2.class);
        // v2 type
        job.setMapOutputValueClass(Text.class);
        // Steps 3-6: shuffle, using the defaults (keys sorted by compareTo)
        // Step 7: reducer class and final output types
        job.setReducerClass(FlowSortReducer.class);
        // k3 type
        job.setOutputKeyClass(Text.class);
        // v3 type
        job.setOutputValueClass(PhoneFlow2.class);
        // Step 8: output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("hdfs://192.168.164.134:9000/out/FlowSort_out");
        TextOutputFormat.setOutputPath(job, path);
        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.164.134:9000"), new Configuration());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // launch the job
        int run = ToolRunner.run(conf, new FlowSortMain(), args);
        System.exit(run);
    }
}
4.4 In-Partition Sort
Partitioner
package com.tao.LocationPartSort;
import com.tao.CountFlowSort.PhoneFlow2;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * @author zhangtao
 * The key/value types match the map output (k2 = PhoneFlow2, v2 = Text);
 * the phone number now arrives as the value.
 */
public class MyPartitioner2 extends Partitioner<PhoneFlow2, Text> {

    @Override
    public int getPartition(PhoneFlow2 phoneFlow, Text text, int i) {
        // 1. Take the first three digits of the phone number (the value this time)
        String num = text.toString().substring(0, 3);
        // 2. Same segments as MyPartitioner: 134, 135, 136, 137, 138, 139 and "everything else"
        switch (num) {
            case "134": return 0;
            case "135": return 1;
            case "136": return 2;
            case "137": return 3;
            case "138": return 4;
            case "139": return 5;
            default:    return 6;
        }
    }
}
Main
package com.tao.LocationPartSort;
import com.tao.CountFlowSort.FlowSortMapper;
import com.tao.CountFlowSort.FlowSortReducer;
import com.tao.CountFlowSort.PhoneFlow2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
* @author zhangtao
*/
public class LocationPartSortMain extends Configured implements Tool {

    /**
     * Configures and submits the job.
     */
    public int run(String[] strings) throws Exception {
        // 1. Create a job
        Job job = Job.getInstance(super.getConf(), "LocationPartSort");
        job.setJarByClass(LocationPartSortMain.class);
        // 2. Configure the job (eight steps)
        // Step 1: input format and input path (the total-sort job's output)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.164.134:9000/out/FlowSort_out"));
        // Step 2: mapper class and map output types (reused from the total-sort job)
        job.setMapperClass(FlowSortMapper.class);
        // k2 type
        job.setMapOutputKeyClass(PhoneFlow2.class);
        // v2 type
        job.setMapOutputValueClass(Text.class);
        // Steps 3-6: shuffle, with a custom partitioner and sorted keys
        job.setPartitionerClass(MyPartitioner2.class);
        // one reduce task per partition
        job.setNumReduceTasks(7);
        // Step 7: reducer class and final output types (reused from the total-sort job)
        job.setReducerClass(FlowSortReducer.class);
        // k3 type
        job.setOutputKeyClass(Text.class);
        // v3 type
        job.setOutputValueClass(PhoneFlow2.class);
        // Step 8: output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("hdfs://192.168.164.134:9000/out/LocationPartSort_out");
        TextOutputFormat.setOutputPath(job, path);
        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.164.134:9000"), new Configuration());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // launch the job
        int run = ToolRunner.run(conf, new LocationPartSortMain(), args);
        System.exit(run);
    }
}