主要介绍的是自定义OutputFormat的使用,给出的需求很简单:对现有的日志文件内容进行增强。
1、从原始日志文件中读取数据
2、根据日志中的一个URL字段到外部知识库中获取信息增强到原始日志
3、如果成功增强,则输出到增强结果目录;如果增强失败,则抽取原始数据中URL字段输出到待爬清单目录
流程图
程序实现
/**
 * Loads enrichment rules from the MySQL knowledge base into an in-memory map.
 *
 * <p>Each row of table {@code url_rule} is assumed to be {@code (url, content)};
 * the first column becomes the map key and the second the value — TODO confirm
 * the table schema against the database.
 * @author 12706
 */
public class MapLoaderUtils {
    /**
     * Populates {@code map} with {@code url -> content} pairs read from the
     * knowledge base. Connection failures are logged and swallowed (best-effort:
     * the caller proceeds with whatever was loaded, possibly nothing).
     *
     * @param map destination map; existing entries are kept, matching keys overwritten
     */
    public static void mapInit(Map<String, String> map) {
        String sql = "SELECT * FROM url_rule";
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // try-with-resources closes rs, state, conn in the correct (reverse)
            // order; the original closed the Connection before the Statement/ResultSet.
            // NOTE(review): credentials are hardcoded — move to configuration.
            try (Connection conn = DriverManager.getConnection(
                         "jdbc:mysql://192.168.191.2:3306/test", "root", "123456");
                 Statement state = conn.createStatement();
                 ResultSet rs = state.executeQuery(sql)) {
                System.out.println("数据库连接成功");
                while (rs.next()) {
                    // Row layout: column 1 = url (key), column 2 = content (value).
                    map.put(rs.getString(1), rs.getString(2));
                }
            }
        } catch (Exception e) {
            // Deliberate best-effort: log and continue with a (possibly empty) map.
            e.printStackTrace();
        }
    }

    /** Manual smoke test: load the knowledge base and dump it to stdout. */
    public static void main(String[] args) {
        Map<String, String> map = new HashMap<String, String>();
        mapInit(map);
        System.out.println(map);
    }
}
/**
 * Custom OutputFormat that routes records to one of two HDFS files.
 *
 * <p>The map/reduce task calls {@link #getRecordWriter} once to obtain a
 * {@link RecordWriter}, then calls {@code write(k, v)} per record. Records whose
 * key contains the marker {@code "tocraw"} go to the to-crawl list; everything
 * else goes to the enhanced-log file.
 *
 * <p>(The class name keeps the original "Outout" spelling — it is referenced by
 * the driver and renaming would break callers.)
 * @author 12706
 */
public class LogEnhanceOutoutFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        // NOTE(review): absolute hardcoded paths mean every task attempt creates
        // (and overwrites) the same two files — safe only while the job runs a
        // single map task; confirm before scaling out.
        // Stream for successfully enhanced log lines.
        FSDataOutputStream enOs = fs.create(new Path("/logenhance/enhance.data"));
        FSDataOutputStream toCrawOs;
        try {
            // Stream for URLs that still need crawling.
            toCrawOs = fs.create(new Path("/logenhance/tocraw.data"));
        } catch (IOException e) {
            // Don't leak the first stream if the second create fails.
            enOs.close();
            throw e;
        }
        return new LogEnhanceRecordWriter(enOs, toCrawOs);
    }

    /** Writes each key to one of the two streams based on the "tocraw" marker. */
    static class LogEnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        private FSDataOutputStream enOs = null;
        private FSDataOutputStream toCrawOs = null;

        public LogEnhanceRecordWriter(FSDataOutputStream enOs, FSDataOutputStream toCrawOs) {
            this.enOs = enOs;
            this.toCrawOs = toCrawOs;
        }

        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            // Key is either "www.abc.com\tabcdefg" (enhanced) or "www.abc.com\ttocraw".
            String info = key.toString();
            if (info.contains("tocraw")) {
                // URL still needs crawling — route to the to-crawl list.
                toCrawOs.write(info.getBytes());
            } else {
                // Enhanced line — route to the enhanced-log file.
                enOs.write(info.getBytes());
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // try/finally guarantees the second stream is closed even when
            // closing the first one throws (the original would skip it).
            try {
                if (enOs != null) {
                    enOs.close();
                }
            } finally {
                if (toCrawOs != null) {
                    toCrawOs.close();
                }
            }
        }
    }
}
/**
 * Driver + mapper for the log-enhancement job.
 *
 * <p>For each input line, field 27 (index 26, tab-separated) is a URL. If the
 * knowledge-base map has content for that URL, the original line is emitted with
 * the content appended (enhanced). Otherwise only the URL plus a "tocraw" marker
 * is emitted. The custom OutputFormat routes the two kinds of records to
 * different files.
 * @author 12706
 */
public class MyLogEnhance {

    static class MyLogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        // url -> content cache, loaded once per task from the knowledge base.
        Map<String, String> logMap = new HashMap<String, String>();
        // Reused output key object (standard Hadoop object-reuse pattern).
        Text k = new Text();

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            // Load the knowledge base into the in-memory cache once per mapper.
            MapLoaderUtils.mapInit(logMap);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            // Explicit guard instead of catching ArrayIndexOutOfBoundsException:
            // the original's broad catch also swallowed IOExceptions thrown by
            // context.write and miscounted them as illegal lines.
            if (fields.length < 27) {
                // Counter records lines with fewer than 27 fields.
                context.getCounter("mylog", "illegal_line").increment(1);
                return;
            }
            String url = fields[26];
            // Look up enrichment content for this URL in the cache.
            String content = logMap.get(url);
            if (content == null) {
                // Unknown URL: emit only the URL, flagged for the to-crawl list.
                k.set(url + "\t" + "tocraw" + "\n");
            } else {
                // Known URL: append the knowledge-base content to the original line.
                // (Bug fix: the original appended `context` — the Mapper Context
                // object — instead of `content`, producing a garbage suffix.)
                k.set(line + "\t" + content + "\n");
            }
            context.write(k, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MyLogEnhance.class);
        job.setMapperClass(MyLogEnhanceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Route output through the custom OutputFormat.
        job.setOutputFormatClass(LogEnhanceOutoutFormat.class);
        // Map-only job: no reduce phase.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Even with a custom OutputFormat, FileOutputFormat still needs an output
        // path (it writes the _SUCCESS marker there).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean exit = job.waitForCompletion(true);
        System.exit(exit ? 0 : 1);
    }
}
测试程序
工程打包上传到linux
将日志文件上传到linux
创建文件夹/logenhance/input
将日志文件上传到/logenhance/input
执行程序,查看
[root@mini1 ~]# hadoop fs -mkdir -p /logenhance/input
[root@mini1 ~]# hadoop fs -put 2013072404-http-combinedBy-1373892200521-log-1 /logenhance/input
[root@mini1 ~]# hadoop jar logen.jar com.scu.hadoop.t.logenhanceoutput.MyLogEnhance /logenhance/input /logenhance/output
...
mylog
illegal_line=1
File Input Format Counters
Bytes Read=61826249
File Output Format Counters
Bytes Written=68329573(这些是比以前多出来的)
[root@mini1 ~]# hadoop fs -ls /logenhance
Found 4 items
-rw-r--r-- 2 root supergroup 68329573 2017-10-17 07:32 /logenhance/enhance.data
drwxr-xr-x - root supergroup 0 2017-10-17 07:00 /logenhance/input
drwxr-xr-x - root supergroup 0 2017-10-17 07:32 /logenhance/output
-rw-r--r-- 2 root supergroup 0 2017-10-17 07:32 /logenhance/tocraw.data