There is no reduce stage here: the map output is written out directly, so the data never goes through shuffle and records come out in the same order they went in.
The default output format is TextOutputFormat; to customize the output we have to extend FileOutputFormat.
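Roughly, such a subclass only has to supply getRecordWriter(), which hands the framework a RecordWriter whose write() and close() we control. A bare-bones sketch for orientation (class and method names here are placeholders; the actual outputform class used in this example appears at the end):

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        // the framework calls this once per task to obtain the writer it will feed records to
        return new RecordWriter<Text, NullWritable>() {
            @Override
            public void write(Text key, NullWritable value) throws IOException {
                // decide, per record, which file or stream it should go to
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException {
                // close whatever streams were opened
            }
        };
    }
}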
Scenario: while analyzing a log, look up each URL in a database. If content is found for the URL, append that content to the record and write it to one HDFS directory; if nothing is found, append a "toCrawl" (to-be-crawled) tag after the URL and write it to a different HDFS directory. For example, a line whose URL exists in the rule table might come out as "<original line>\t<content>", while an unknown URL comes out as "<url>\ttoCrawl".
So we need a custom OutputFormat that routes different records to different locations.
First, when the mapper initializes (in setup()), run the database query and cache all of the results in a HashMap:
package com.ll.bd;

import java.sql.*;
import java.util.Map;

public class Db {
    public static void load(Map<String, String> hashMap) throws ClassNotFoundException, SQLException {
        Connection conn = null;
        Statement st = null;
        ResultSet res = null;
        Class.forName("com.mysql.jdbc.Driver");
        // establish the database connection
        String url = "jdbc:mysql://192.168.211.5:3306/jdbcdemo";
        String uid = "root";
        String pw = "111111";
        conn = DriverManager.getConnection(url, uid, pw);
        st = conn.createStatement();
        // cache every url -> content rule in the map
        res = st.executeQuery("select url,content from url_rule");
        while (res.next()) {
            hashMap.put(res.getString(1), res.getString(2));
        }
        res.close();
        st.close();
        conn.close();
    }
}

The mapper's setup() then calls Db.load() and passes in the HashMap.
No reducer is needed. In the map phase, extract the URL from the record, look it up with hashMap.get(url), and, depending on whether a value comes back, append either the content or the toCrawl tag before writing the record out.
In the driver, register the custom format with job.setOutputFormatClass(outputform.class) and set job.setNumReduceTasks(0), since there is no reducer.
package com.ll.bd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

public class logEnhance {

    static class logEnhanceMap extends Mapper<LongWritable, Text, Text, NullWritable> {

        Map<String, String> ruleMap = new HashMap<String, String>();
        Text text = new Text();
        NullWritable v = NullWritable.get();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // load the rule table from the database into the HashMap
            try {
                Db.load(ruleMap);
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // global counter, visible in the job's counter output
            Counter contextCounter = context.getCounter("counter", "count");
            String line = value.toString();
            String[] fields = line.split("\t");
            String url = fields[26];
            String content_tag = ruleMap.get(url);
            // if no content is found, tag the url with "toCrawl" so it ends up in the to-crawl output;
            // otherwise append the content to the original line
            if (content_tag == null) {
                text.set(url + "\t" + "toCrawl" + "\n");
                context.write(text, v);
            } else {
                text.set(line + "\t" + content_tag + "\n");
                context.write(text, v);
            }
            contextCounter.increment(1);
        }
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // set the mapper class; this job has no reducer
        job.setMapperClass(logEnhanceMap.class);

        // set the jar that contains this program
        job.setJarByClass(logEnhance.class);
        // job.setJar("path");

        // the mapper's output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // input path on hdfs (use the .lib.input FileInputFormat)
        FileInputFormat.setInputPaths(job, new Path("/home/lzq/input/test.txt"));
        // output path (still required even though the custom format creates its own files)
        FileOutputFormat.setOutputPath(job, new Path("/home/lzq/output3"));

        // map-only job with the custom output format
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(outputform.class);

        // job.submit() alone would not report progress; waitForCompletion(true)
        // prints the cluster's running information
        boolean res = job.waitForCompletion(true);
        // exit code 0 means success, 1 means failure; a shell script can check it via $?
        System.exit(res ? 0 : 1);
    }
}

The most important part is the class that extends FileOutputFormat:
package com.ll.bd;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class outputform extends FileOutputFormat<Text, NullWritable> {

    // When a map task (or reduce task) writes its final output, it first calls the
    // OutputFormat's getRecordWriter() to obtain a RecordWriter.
    // RecordWriter is an abstract class that cannot be instantiated directly,
    // so we return an instance of our own subclass.
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Build the hdfs output streams here and hand them to EnhanceRecordWriter through its constructor.
        // FileSystem.get(new Configuration()) would be slow; the configuration already lives
        // in the task context, so use taskAttemptContext.getConfiguration().
        FileSystem fs = FileSystem.get(taskAttemptContext.getConfiguration());
        Path enhancePath = new Path("hdfs://mini01:9000/enhance.log");
        Path toCrawlPath = new Path("hdfs://mini01:9000/toCrawl.log");
        FSDataOutputStream enOs = fs.create(enhancePath);
        FSDataOutputStream toCrawlOs = fs.create(toCrawlPath);
        return new EnhanceRecordWriter(enOs, toCrawlOs);
    }

    // The RecordWriter returned above: it routes each record to one of the two streams.
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {

        FSDataOutputStream enOs = null;
        FSDataOutputStream toCrawlOs = null;

        public EnhanceRecordWriter(FSDataOutputStream enOs, FSDataOutputStream toCrawlOs) {
            this.enOs = enOs;
            this.toCrawlOs = toCrawlOs;
        }

        @Override
        public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
            // Write to hdfs. If the streams were opened here instead of in the constructor,
            // every write() call would create a new stream, which would be very inefficient.
            // Text.getBytes() may return a buffer longer than the content, so write only getLength() bytes.
            if (text.toString().contains("toCrawl")) {
                toCrawlOs.write(text.getBytes(), 0, text.getLength());
            } else {
                enOs.write(text.getBytes(), 0, text.getLength());
            }
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            if (toCrawlOs != null) {
                toCrawlOs.close();
            }
            if (enOs != null) {
                enOs.close();
            }
        }
    }
}
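One caveat with the hard-coded hdfs://mini01:9000/enhance.log and toCrawl.log paths: if the job runs more than one map task, every task calls getRecordWriter() and creates the same two files, so the tasks would overwrite each other. A minimal sketch of one way around that, deriving per-task file names under the directory passed to FileOutputFormat.setOutputPath(); this variant of getRecordWriter() is my own assumption, not part of the original code, and would drop in place of the method above:

    // hypothetical variant of getRecordWriter() giving each map task its own pair of files
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(ctx.getConfiguration());
        int taskId = ctx.getTaskAttemptID().getTaskID().getId();   // 0, 1, 2, ... per map task
        Path outDir = FileOutputFormat.getOutputPath(ctx);         // the dir from setOutputPath()
        Path enhancePath = new Path(outDir, "enhance-" + taskId + ".log");
        Path toCrawlPath = new Path(outDir, "toCrawl-" + taskId + ".log");
        return new EnhanceRecordWriter(fs.create(enhancePath), fs.create(toCrawlPath));
    }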