在 Hadoop 中,MapReduce 处理的所有数据类型都必须实现 Writable 接口;只有实现了该接口,对象才能被序列化,从而进行读取和写入。
测试数据(各字段之间以制表符 \t 分隔)
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985079 13823070001 20-7C-8F-70-68-1F:CMCC 120.196.100.99 6 3 360 180 200
1363157985069 13600217502 00-1F-64-E2-E8-B1:CMCC 120.196.100.55 18 138 1080 186852 200
针对上面的手机上网日志,按制表符切分后下标为 6~9(从 0 开始计数)的四个字段是流量信息。目标是按手机号统计每个用户的
upPackNum、downPackNum、upPayLoad 以及 downPayLoad 四个字段各自的总和
Writable 接口
Writable 接口实现的主要是 write 和 readFields 方法
/**
 * Contract for objects that can be written to a {@link DataOutput} and
 * restored from a {@link DataInput}.
 */
public interface Writable {
/**
 * Serialize the fields of this object to <code>out</code>.
 *
 * @param out the sink the fields are written to
 * @throws IOException if writing to <code>out</code> fails
 */
void write(DataOutput out) throws IOException;
/**
 * Deserialize the fields of this object from <code>in</code>.
 * <p>
 * Because <code>in</code> is consumed sequentially, an implementation is
 * expected to read its fields in the same order that
 * {@link #write(DataOutput)} emits them.
 *
 * @param in the source the fields are read from
 * @throws IOException if reading from <code>in</code> fails
 */
void readFields(DataInput in) throws IOException;
}
write:把对象的属性序列化到 DataOutput 中
readFields:数据反序列化到对象的属性
封装 KpiWritable
封装用户的四个属性字段
package WriTable;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Created by hubo on 2017/12/30
 * <p>
 * Value type holding the four traffic figures of a log record (or the sum of
 * several records): upstream/downstream packet counts and upstream/downstream
 * payload sizes.
 * <p>
 * {@link #write(DataOutput)} and {@link #readFields(DataInput)} must process
 * the fields in exactly the same order; otherwise values end up in the wrong
 * field after a serialize/deserialize round trip.
 */
public class KpiWritable implements Writable {
    // Fields are intentionally package-visible: other classes in this package
    // read them directly when aggregating.

    /** Number of upstream packets. */
    long upPackNum;
    /** Number of downstream packets. */
    long downPackNum;
    /** Upstream payload size. */
    long upPayLoad;
    /** Downstream payload size. */
    long downpayLoad;

    /**
     * No-argument constructor; all counters start at zero. The fields are
     * then populated through {@link #readFields(DataInput)}.
     */
    public KpiWritable() {
    }

    /**
     * Builds an instance from the textual form of the four counters.
     *
     * @param upPackNum   upstream packet count, as a decimal string
     * @param downPackNum downstream packet count, as a decimal string
     * @param upPayLoad   upstream payload size, as a decimal string
     * @param downpayLoad downstream payload size, as a decimal string
     * @throws NumberFormatException if any argument is not a parsable long
     */
    public KpiWritable(String upPackNum, String downPackNum, String upPayLoad, String downpayLoad) {
        this.upPackNum = Long.parseLong(upPackNum);
        this.downPackNum = Long.parseLong(downPackNum);
        this.upPayLoad = Long.parseLong(upPayLoad);
        this.downpayLoad = Long.parseLong(downpayLoad);
    }

    /**
     * Returns the textual form of the four counters.
     */
    @Override
    public String toString() {
        return "KpiWritable{" +
                "upPackNum=" + upPackNum +
                ", downPackNum=" + downPackNum +
                ", upPayLoad=" + upPayLoad +
                ", downpayLoad=" + downpayLoad +
                '}';
    }

    /**
     * Restores the four counters from {@code dataInput}.
     *
     * @param dataInput source positioned at data produced by {@link #write(DataOutput)}
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Order must mirror write(): upPackNum, upPayLoad, downPackNum, downpayLoad.
        // Reading upPayLoad first (as the code previously did) swapped
        // upPackNum and upPayLoad on every round trip.
        upPackNum = dataInput.readLong();
        upPayLoad = dataInput.readLong();
        downPackNum = dataInput.readLong();
        downpayLoad = dataInput.readLong();
    }

    /**
     * Writes the four counters to {@code dataOutput} as four consecutive longs
     * in the order upPackNum, upPayLoad, downPackNum, downpayLoad.
     *
     * @param dataOutput sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upPackNum);
        dataOutput.writeLong(upPayLoad);
        dataOutput.writeLong(downPackNum);
        dataOutput.writeLong(downpayLoad);
    }
}
实现 Mapper 类
package WriTable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Created by hubo on 2017/12/30
 * <p>
 * Maps one tab-separated log line to a (phone number, traffic counters) pair.
 * The phone number is field index 1 and the four traffic figures are field
 * indices 6 to 9 (zero-based).
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, KpiWritable> {
    /** Minimum number of tab-separated fields a line needs so that index 9 exists. */
    private static final int MIN_FIELDS = 10;

    /**
     * Emits the phone number and its traffic counters for a single log line.
     * Lines that have too few fields, or whose traffic fields are not numeric,
     * are counted under the "Malformed Records:" counter group and skipped
     * instead of failing the whole task.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of the log
     * @param context used to emit output and to update counters
     * @throws IOException          if emitting the pair fails
     * @throws InterruptedException if the task is interrupted while emitting
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each marker string gets its own counter. Both used to be registered
        // as "Phone1", which made the two branches indistinguishable.
        Counter sensitiveCounter = context.getCounter("Sensitive Words:", "Phone1");
        Counter sensitiveCounter1 = context.getCounter("Sensitive Words:", "Phone2");

        String line = value.toString();
        if (line.contains("1363154400022")) {
            sensitiveCounter.increment(1L);
        } else if (line.contains("1363157993044")) {
            sensitiveCounter1.increment(1L);
        }

        String[] splits = line.split("\t");
        if (splits.length < MIN_FIELDS) {
            // e.g. a blank or truncated line: indexing splits[9] would throw.
            context.getCounter("Malformed Records:", "TooFewFields").increment(1L);
            return;
        }

        KpiWritable v;
        try {
            v = new KpiWritable(splits[6], splits[7], splits[8], splits[9]);
        } catch (NumberFormatException e) {
            context.getCounter("Malformed Records:", "NonNumericTraffic").increment(1L);
            return;
        }

        Text k = new Text(splits[1]);
        context.write(k, v);
    }
}
实现 Reducer 类
package WriTable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Created by hubo on 2017/12/30
 * <p>
 * Adds up the traffic counters of all records that share the same key and
 * emits a single summed {@link KpiWritable} for that key.
 */
public class MyReducer extends Reducer<Text, KpiWritable, Text, KpiWritable> {
    /**
     * Sums the four counters over {@code values} and writes the totals under {@code key}.
     *
     * @param key     the grouping key
     * @param values  all counter records grouped under {@code key}
     * @param context used to emit the summed record
     * @throws IOException          if emitting the result fails
     * @throws InterruptedException if the task is interrupted while emitting
     */
    @Override
    protected void reduce(Text key, Iterable<KpiWritable> values, Context context)
            throws IOException, InterruptedException {
        long totalUpPackets = 0L;
        long totalDownPackets = 0L;
        long totalUpBytes = 0L;
        long totalDownBytes = 0L;

        for (KpiWritable record : values) {
            totalUpPackets += record.upPackNum;
            totalDownPackets += record.downPackNum;
            totalUpBytes += record.upPayLoad;
            totalDownBytes += record.downpayLoad;
        }

        // The String-based constructor is kept; each total is rendered as decimal text.
        KpiWritable summed = new KpiWritable(
                String.valueOf(totalUpPackets),
                String.valueOf(totalDownPackets),
                String.valueOf(totalUpBytes),
                String.valueOf(totalDownBytes));
        context.write(key, summed);
    }
}
实现 run
package WriTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Created by hubo on 2017/12/30
 * <p>
 * Driver that configures and submits the job: reads from {@link #In_PATH},
 * runs {@link MyMapper} and {@link MyReducer}, and writes to {@link #Out_PATH}.
 */
public class MyJob extends Configured implements Tool {
    /** Input directory of the job. */
    public static final String In_PATH = "/in";
    /** Output directory of the job; deleted before each run if it already exists. */
    public static final String Out_PATH = "/out";

    /**
     * Configures the job, submits it and waits for it to finish.
     *
     * @param strings command-line arguments passed on by ToolRunner (not used)
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if file-system access or job submission fails
     */
    @Override
    public int run(String[] strings) throws Exception {
        Path outPath = new Path(Out_PATH);

        // Remove the output of a previous run before submitting again.
        FileSystem fs = FileSystem.get(getConf());
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(getConf(), "MyJob");
        // Identify the jar containing the job classes for cluster submission.
        job.setJarByClass(MyJob.class);

        FileInputFormat.setInputPaths(job, new Path(In_PATH));

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(KpiWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(KpiWritable.class);

        FileOutputFormat.setOutputPath(job, outPath);

        // Propagate job failure to the caller instead of always reporting success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Entry point: runs the job through ToolRunner and exits with its status.
     * The process exits with status 1 if an exception is thrown.
     *
     * @param args command-line arguments, forwarded to ToolRunner
     */
    public static void main(String[] args) {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node:9000");

        int res;
        try {
            res = ToolRunner.run(conf, new MyJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
            // Without this the process would exit with status 0 after a failure.
            res = 1;
        }
        System.exit(res);
    }
}
调试结果
13480253104 KpiWritable{upPackNum=180, downPackNum=3, upPayLoad=3, downpayLoad=180}
13502468823 KpiWritable{upPackNum=7335, downPackNum=102, upPayLoad=57, downpayLoad=110349}
13560439658 KpiWritable{upPackNum=2034, downPackNum=24, upPayLoad=33, downpayLoad=5892}
13600217502 KpiWritable{upPackNum=1080, downPackNum=138, upPayLoad=18, downpayLoad=186852}
13602846565 KpiWritable{upPackNum=1938, downPackNum=12, upPayLoad=15, downpayLoad=2910}
13660577991 KpiWritable{upPackNum=6960, downPackNum=9, upPayLoad=24, downpayLoad=690}
13719199419 KpiWritable{upPackNum=240, downPackNum=0, upPayLoad=4, downpayLoad=0}
13726230503 KpiWritable{upPackNum=2481, downPackNum=27, upPayLoad=24, downpayLoad=24681}
13760778710 KpiWritable{upPackNum=120, downPackNum=2, upPayLoad=2, downpayLoad=120}
13823070001 KpiWritable{upPackNum=360, downPackNum=3, upPayLoad=6, downpayLoad=180}
13826544101 KpiWritable{upPackNum=264, downPackNum=0, upPayLoad=4, downpayLoad=0}
13922314466 KpiWritable{upPackNum=3008, downPackNum=12, upPayLoad=12, downpayLoad=3720}
13925057413 KpiWritable{upPackNum=11058, downPackNum=63, upPayLoad=69, downpayLoad=48243}
13926251106 KpiWritable{upPackNum=240, downPackNum=0, upPayLoad=4, downpayLoad=0}
13926435656 KpiWritable{upPackNum=132, downPackNum=4, upPayLoad=2, downpayLoad=1512}
15013685858 KpiWritable{upPackNum=3659, downPackNum=27, upPayLoad=28, downpayLoad=3538}
15920133257 KpiWritable{upPackNum=3156, downPackNum=20, upPayLoad=20, downpayLoad=2936}
15989002119 KpiWritable{upPackNum=1938, downPackNum=3, upPayLoad=3, downpayLoad=180}
18211575961 KpiWritable{upPackNum=1527, downPackNum=12, upPayLoad=15, downpayLoad=2106}
18320173382 KpiWritable{upPackNum=9531, downPackNum=18, upPayLoad=21, downpayLoad=2412}
84138413 KpiWritable{upPackNum=4116, downPackNum=16, upPayLoad=20, downpayLoad=1432}
主要学习如何自定义类型(实现 write 和 readFields 方法,并重写 toString 以控制默认的文本输出格式)
ToolRunner 如何运行(实现 Tool 接口,重写 run 方法,通过 getConf() 获取 Configuration 对象)
提高工作效率