Preface
Hadoop provides wrapper classes for the Java primitive types. The mapping between Hadoop's Writable types and the corresponding Java types is shown in the table below:
Category | Hadoop type | Java type |
---|---|---|
Boolean | BooleanWritable | boolean |
Integer | IntWritable | int |
Long | LongWritable | long |
Float | FloatWritable | float |
Double | DoubleWritable | double |
Byte | ByteWritable | byte |
Text | Text | String |
Array | ArrayWritable | array |
How do we convert a Java type into a Hadoop type?
Construct the wrapper directly with new; for example, a Java long becomes a Hadoop LongWritable:
```java
new LongWritable(123L);
```
How do we convert a Hadoop type back into a Java type?
For Text, call its toString() method; for the other types, call get().
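A minimal round-trip sketch (plain local Java; nothing here requires a cluster):

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class TypeConversionDemo {
    public static void main(String[] args) {
        //Java ==> Hadoop: wrap the value with new
        LongWritable offset = new LongWritable(123L);
        IntWritable one = new IntWritable(1);
        Text word = new Text("hadoop");

        //Hadoop ==> Java: get() for primitives, toString() for Text
        long l = offset.get();      //123
        int i = one.get();          //1
        String s = word.toString(); //"hadoop"
        System.out.println(l + " " + i + " " + s);
    }
}
```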
WordCount: counting word frequencies
Suppose we have the following text in data/wc.data and want to count how often each word appears:

```
hadoop,hadoop,hadoop,hadoop
hbase,hbase,hbase
spark,spark
flink
```
Processing steps:
1. Read in the file contents.
2. Each line is delimited; here the delimiter is a comma.
3. Assign each word a count of 1:

```
hadoop,1
hadoop,1
········
flink,1
```
4. Shuffle: records with the same key are routed to the same reducer:

```
hadoop,<1,1,1,1>
hbase,<1,1,1>
spark,<1,1>
flink,<1>
```
5. Aggregate the values:

```
hadoop,4
hbase,3
spark,2
flink,1
```
The code is shown below; the meaning of each step is explained in the comments.
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * MapReduce word count.
 */
public class WordCountApp {
    /**
     * Driver: configures the job and submits it to the cluster.
     *
     * @param args
     * @throws Exception
     */
public static void main(String[] args) throws Exception {
        //every MapReduce job follows the same recipe and is submitted the same way
        //step1: create the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
String input = "data/wc.data";
String output = "output/";
        //delete the output directory if it already exists, otherwise the job will fail to start
        FileSystem fs = FileSystem.get(configuration);
        fs.delete(new Path(output), true);
        //step2: set the jar via the driver class
job.setJarByClass(WordCountApp.class);
        //step3: set the custom Mapper and Reducer
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
        //step4: set the key and value types of the Mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
        //step5: set the key and value types of the Reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
        //step6: set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
        //step7: submit the job and wait for it to finish
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
    /**
     * Custom Mapper: extends org.apache.hadoop.mapreduce.Mapper and implements the map method.
     *
     * Overall approach:
     * 1. Read the file.
     * 2. Each line is delimited; here the delimiter is a comma.
     * 3. Assign each word a count of 1 (map phase).
     * 4. Shuffle: records with the same key are sent to the same reducer.
     * 5. Reduce: merge and aggregate the values.
     */
    /**
     * KEYIN    input key type: the byte offset of each line
     * VALUEIN  input value type: the content of the line
     * KEYOUT   output key type
     * VALUEOUT output value type
     */
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        /**
         * The map phase:
         * 1) reads the file contents,
         * 2) splits each line on the comma delimiter,
         * 3) assigns each word a count of 1.
         */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //each line is comma-delimited; emit (word, 1) for every word
            String[] splits = value.toString().split(",");
for (String split : splits) {
context.write(new Text(split), ONE);
}
}
}
    /**
     * Custom Reducer: extends org.apache.hadoop.mapreduce.Reducer and implements the reduce method.
     */
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param word
         * @param values an Iterable, because the shuffle groups all values with the same key
         *               onto one reducer, e.g. a,<1,1,1> b,<1,1> c,<1>
         */
@Override
protected void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
context.write(word, new IntWritable(count));
}
}
}
```
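Running this locally against the four sample lines should produce output/part-r-00000 looking roughly like the following (the default TextOutputFormat tab-separates key and value, and keys come out sorted):

```
flink	1
hadoop	4
hbase	3
spark	2
```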
Serialization
Serialization turns an object into a byte array; deserialization turns a byte array back into an object.
Java's built-in mechanism (java.io.Serializable) performs poorly, so Hadoop ships its own serialization for MapReduce: the Writable interface.
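A small local sketch of the round trip (no cluster required):

```java
import org.apache.hadoop.io.IntWritable;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        //serialize: object ==> byte array
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        new IntWritable(42).write(new DataOutputStream(bos));
        byte[] bytes = bos.toByteArray(); //exactly 4 bytes for an int

        //deserialize: byte array ==> object
        IntWritable restored = new IntWritable();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(bytes)));
        System.out.println(restored.get()); //42
    }
}
```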
Custom MapReduce development
Suppose we have data like the following, where the second column is a phone number, the third-from-last column is the upstream traffic, and the second-from-last column is the downstream traffic:
```
1363157985066 13712345678 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13812345678 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157985066 13912345678 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157985066 13712345678 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 10000 20000 200
```
Our task: group by phone number and compute, for each phone, the total upstream traffic, the total downstream traffic, and their sum. (The requirement has no real-world meaning; it only serves to demonstrate custom MapReduce development.)
Steps:
1. Implement the Writable interface.
2. Override write and readFields; note that the field order in both methods must be identical.
3. Keep a default no-argument constructor.
4. Override toString (optional).
5. If your custom class needs to support sorting, also implement Comparable; see the sketch after the Access class below.
The final code is shown below.
Driver class:
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Per-phone traffic statistics from access logs.
 */
public class AccessApp {
public static void main(String[] args) throws Exception {
        //every MapReduce job follows the same recipe and is submitted the same way
        //step1: create the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
String input = "data/access.log";
        String output = "output/access/";
        //delete the output directory if it already exists, otherwise the job will fail to start
        FileSystem fs = FileSystem.get(configuration);
        fs.delete(new Path(output), true);
        //step2: set the jar via the driver class
job.setJarByClass(AccessApp.class);
        //step3: set the custom Mapper and Reducer
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
        //step4: set the key and value types of the Mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Access.class);
        //step5: set the key and value types of the Reducer output
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Access.class);
        //step6: set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
        //step7: submit the job and wait for it to finish
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, Text, Text, Access> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //columns are tab-separated; the phone number is the second column
            String[] splits = value.toString().split("\t");
            String phone = splits[1];
            //upstream traffic is the third-from-last column, downstream the second-from-last
            long up = Long.parseLong(splits[splits.length - 3]);
            long down = Long.parseLong(splits[splits.length - 2]);
context.write(new Text(phone), new Access(phone, up, down));
}
}
public static class MyReducer extends Reducer<Text, Access, NullWritable, Access> {
@Override
protected void reduce(Text key, Iterable<Access> values, Context context) throws IOException, InterruptedException {
long ups = 0;
long downs = 0;
for (Access access : values) {
ups += access.getUp();
downs += access.getDown();
}
context.write(NullWritable.get(), new Access(key.toString(), ups, downs));
}
}
}
```
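With the four sample lines above (assuming the columns are tab-separated, as the mapper expects), the output should look like this; the key is NullWritable, so only each Access value's toString() is written:

```
Access{phone='13712345678', up=12481, down=44681, sum=57162}
Access{phone='13812345678', up=264, down=0, sum=264}
Access{phone='13912345678', up=2481, down=24681, sum=27162}
```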
The class implementing the Writable interface:
```java
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Access implements Writable {
private String phone;
private long up;
private long down;
private long sum;
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phone);
out.writeLong(up);
out.writeLong(down);
out.writeLong(sum);
}
@Override
public void readFields(DataInput in) throws IOException {
        //NOTE: fields must be read in exactly the same order they are written in write()
this.phone = in.readUTF();
this.up = in.readLong();
this.down = in.readLong();
this.sum = in.readLong();
}
    //the no-argument constructor must be kept: Hadoop needs it for deserialization
public Access(){}
public Access(String phone,long up,long down){
this.phone = phone;
this.up = up;
this.down = down;
this.sum = up + down;
}
@Override
public String toString() {
return "Access{" +
"phone='" + phone + '\'' +
", up=" + up +
", down=" + down +
", sum=" + sum +
'}';
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public long getUp() {
return up;
}
public void setUp(long up) {
this.up = up;
}
public long getDown() {
return down;
}
public void setDown(long down) {
this.down = down;
}
public long getSum() {
return sum;
}
public void setSum(long sum) {
this.sum = sum;
}
}
```
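On step 5 above (sorting): when a custom type is used as a key, it must implement WritableComparable, which is simply Writable plus Comparable. A minimal, hypothetical sketch (TrafficKey is not part of the job above):

```java
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

//hypothetical key type that sorts by total traffic, largest first
public class TrafficKey implements WritableComparable<TrafficKey> {
    private long sum;

    public TrafficKey() {} //no-arg constructor, required for deserialization

    public TrafficKey(long sum) {
        this.sum = sum;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(sum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        //same order as write()
        this.sum = in.readLong();
    }

    @Override
    public int compareTo(TrafficKey other) {
        return Long.compare(other.sum, this.sum); //descending
    }
}
```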
Extending WordCount: the Combine operation
Let's use WordCount to illustrate the combiner in MapReduce.
After the input splits are processed by the map phase, the following records are produced:
```
hadoop,1
hadoop,1
········
flink,1
```
After the shuffle, the reducer receives the values grouped by key:
```
hadoop,<1,1,1,1>
hbase,<1,1,1>
spark,<1,1>
flink,<1>
```
The shuffle between map and reduce is expensive. With a combiner we can pre-aggregate on the map side, which reduces the amount of data that has to be shuffled.
The combiner runs inside the map task. It behaves like a reducer, but locally: the combiner performs a partial, per-map aggregation, while the reducer performs the global aggregation.
However, not every job can use a combiner; some computations break under pre-aggregation, for example computing an average.
Take the following data:

```
5,7,9
4,8
```

The true average is 33 / 6 = 5.5. But if we pre-aggregate each group, the average of 5, 7, 9 is 7 and the average of 4, 8 is 6, so the reducer computes (7 + 6) / 2 = 6.5, which is wrong.
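A common workaround (a sketch, not part of the job above; the class name SumCountCombiner and the "sum,count" encoding are illustrative assumptions): have the map emit "value,1" pairs, let the combiner merge partial sums and counts, and only divide in the final reducer:

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

//combiner-safe partial aggregation: merge sums and counts, but do NOT divide yet
public class SumCountCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (Text value : values) {
            String[] parts = value.toString().split(",");
            sum += Long.parseLong(parts[0]);
            count += Long.parseLong(parts[1]);
        }
        //pass the partial sum and count along; only the final reducer computes sum / count
        context.write(key, new Text(sum + "," + count));
    }
}
```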
While running the WordCount job above, the execution log shows the following counters:
```
Counters: 30
File System Counters
FILE: Number of bytes read=784
FILE: Number of bytes written=681113
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
Map-Reduce Framework
Map input records=4
Map output records=10
Map output bytes=104
Map output materialized bytes=130
Input split bytes=129
Combine input records=0
Combine output records=0
Reduce input groups=4
Reduce shuffle bytes=130
Reduce input records=10
Reduce output records=4
Spilled Records=20
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=0
Total committed heap usage (bytes)=514850816
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=63
File Output Format Counters
Bytes Written=45
```
In this log we can see:

```
Combine input records=0
Combine output records=0
```

These counters are 0 because the code above never registered a combiner.
We can register one while configuring the job; for WordCount, the reducer itself can serve as the combiner:

```java
//step3: set the custom Mapper and Reducer, and reuse the Reducer as the Combiner
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setCombinerClass(MyReducer.class);
```
The log after registering the combiner:

```
Combine input records=10
Combine output records=4
```