主要介绍的是自定义OutputFormat的使用,给出的需求很简单:对现有的日志文件内容进行增强。
1、从原始日志文件中读取数据
2、根据日志中的一个URL字段到外部知识库中获取信息增强到原始日志
3、如果成功增强,则输出到增强结果目录;如果增强失败,则抽取原始数据中URL字段输出到待爬清单目录
流程图
程序实现
/**
 * Loads enrichment rules from the MySQL knowledge base into an in-memory map.
 *
 * <p>Each row of table {@code url_rule} is assumed to be {@code (url, content)};
 * the first column becomes the map key and the second the value — TODO confirm
 * the table schema against the database.
 * @author 12706
 */
public class MapLoaderUtils {
    /**
     * Populates {@code map} with {@code url -> content} pairs read from the
     * knowledge base. Connection failures are logged and swallowed (best-effort:
     * the caller proceeds with whatever was loaded, possibly nothing).
     *
     * @param map destination map; existing entries are kept, matching keys overwritten
     */
    public static void mapInit(Map<String, String> map) {
        String sql = "SELECT * FROM url_rule";
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // try-with-resources closes rs, state, conn in the correct (reverse)
            // order; the original closed the Connection before the Statement/ResultSet.
            // NOTE(review): credentials are hardcoded — move to configuration.
            try (Connection conn = DriverManager.getConnection(
                         "jdbc:mysql://192.168.191.2:3306/test", "root", "123456");
                 Statement state = conn.createStatement();
                 ResultSet rs = state.executeQuery(sql)) {
                System.out.println("数据库连接成功");
                while (rs.next()) {
                    // Row layout: column 1 = url (key), column 2 = content (value).
                    map.put(rs.getString(1), rs.getString(2));
                }
            }
        } catch (Exception e) {
            // Deliberate best-effort: log and continue with a (possibly empty) map.
            e.printStackTrace();
        }
    }

    /** Manual smoke test: load the knowledge base and dump it to stdout. */
    public static void main(String[] args) {
        Map<String, String> map = new HashMap<String, String>();
        mapInit(map);
        System.out.println(map);
    }
}
/**
 * Custom OutputFormat that routes records to one of two HDFS files.
 *
 * <p>The map/reduce task calls {@link #getRecordWriter} once to obtain a
 * {@link RecordWriter}, then calls {@code write(k, v)} per record. Records whose
 * key contains the marker {@code "tocraw"} go to the to-crawl list; everything
 * else goes to the enhanced-log file.
 *
 * <p>(The class name keeps the original "Outout" spelling — it is referenced by
 * the driver and renaming would break callers.)
 * @author 12706
 */
public class LogEnhanceOutoutFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        // NOTE(review): absolute hardcoded paths mean every task attempt creates
        // (and overwrites) the same two files — safe only while the job runs a
        // single map task; confirm before scaling out.
        // Stream for successfully enhanced log lines.
        FSDataOutputStream enOs = fs.create(new Path("/logenhance/enhance.data"));
        FSDataOutputStream toCrawOs;
        try {
            // Stream for URLs that still need crawling.
            toCrawOs = fs.create(new Path("/logenhance/tocraw.data"));
        } catch (IOException e) {
            // Don't leak the first stream if the second create fails.
            enOs.close();
            throw e;
        }
        return new LogEnhanceRecordWriter(enOs, toCrawOs);
    }

    /** Writes each key to one of the two streams based on the "tocraw" marker. */
    static class LogEnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        private FSDataOutputStream enOs = null;
        private FSDataOutputStream toCrawOs = null;

        public LogEnhanceRecordWriter(FSDataOutputStream enOs, FSDataOutputStream toCrawOs) {
            this.enOs = enOs;
            this.toCrawOs = toCrawOs;
        }

        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            // Key is either "www.abc.com\tabcdefg" (enhanced) or "www.abc.com\ttocraw".
            String info = key.toString();
            if (info.contains("tocraw")) {
                // URL still needs crawling — route to the to-crawl list.
                toCrawOs.write(info.getBytes());
            } else {
                // Enhanced line — route to the enhanced-log file.
                enOs.write(info.getBytes());
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // try/finally guarantees the second stream is closed even when
            // closing the first one throws (the original would skip it).
            try {
                if (enOs != null) {
                    enOs.close();
                }
            } finally {
                if (toCrawOs != null) {
                    toCrawOs.close();
                }
            }
        }
    }
}
/**
 * Driver + mapper for the log-enhancement job.
 *
 * <p>For each input line, field 27 (index 26, tab-separated) is a URL. If the
 * knowledge-base map has content for that URL, the original line is emitted with
 * the content appended (enhanced). Otherwise only the URL plus a "tocraw" marker
 * is emitted. The custom OutputFormat routes the two kinds of records to
 * different files.
 * @author 12706
 */
public class MyLogEnhance {

    static class MyLogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        // url -> content cache, loaded once per task from the knowledge base.
        Map<String, String> logMap = new HashMap<String, String>();
        // Reused output key object (standard Hadoop object-reuse pattern).
        Text k = new Text();

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            // Load the knowledge base into the in-memory cache once per mapper.
            MapLoaderUtils.mapInit(logMap);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            // Explicit guard instead of catching ArrayIndexOutOfBoundsException:
            // the original's broad catch also swallowed IOExceptions thrown by
            // context.write and miscounted them as illegal lines.
            if (fields.length < 27) {
                // Counter records lines with fewer than 27 fields.
                context.getCounter("mylog", "illegal_line").increment(1);
                return;
            }
            String url = fields[26];
            // Look up enrichment content for this URL in the cache.
            String content = logMap.get(url);
            if (content == null) {
                // Unknown URL: emit only the URL, flagged for the to-crawl list.
                k.set(url + "\t" + "tocraw" + "\n");
            } else {
                // Known URL: append the knowledge-base content to the original line.
                // (Bug fix: the original appended `context` — the Mapper Context
                // object — instead of `content`, producing a garbage suffix.)
                k.set(line + "\t" + content + "\n");
            }
            context.write(k, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MyLogEnhance.class);
        job.setMapperClass(MyLogEnhanceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Route output through the custom OutputFormat.
        job.setOutputFormatClass(LogEnhanceOutoutFormat.class);
        // Map-only job: no reduce phase.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Even with a custom OutputFormat, FileOutputFormat still needs an output
        // path (it writes the _SUCCESS marker there).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean exit = job.waitForCompletion(true);
        System.exit(exit ? 0 : 1);
    }
}
测试程序
工程打包上传到linux
将日志文件上传到linux
创建文件夹/logenhance/input
将日志文件上传到/logenhance/input
执行程序,查看
[root@mini1 ~]# hadoop fs -mkdir -p /logenhance/input
[root@mini1 ~]# hadoop fs -put 2013072404-http-combinedBy-1373892200521-log-1 /logenhance/input
[root@mini1 ~]# hadoop jar logen.jar com.scu.hadoop.t.logenhanceoutput.MyLogEnhance /logenhance/input /logenhance/output
...
mylog
illegal_line=1
File Input Format Counters
Bytes Read=61826249
File Output Format Counters
Bytes Written=68329573(这些是比以前多出来的)
[root@mini1 ~]# hadoop fs -ls /logenhance
Found 4 items
-rw-r--r-- 2 root supergroup 68329573 2017-10-17 07:32 /logenhance/enhance.data
drwxr-xr-x - root supergroup 0 2017-10-17 07:00 /logenhance/input
drwxr-xr-x - root supergroup 0 2017-10-17 07:32 /logenhance/output
-rw-r--r-- 2 root supergroup 0 2017-10-17 07:32 /logenhance/tocraw.data