MapReduce: custom OutputFormat

There is no reduce phase here: the map output is written out directly, so it never goes through shuffle, and records are emitted in the same order they arrive.

The default output format is TextOutputFormat. To customize the output, extend FileOutputFormat.

Scenario: while analyzing logs, each URL is looked up in a database. If content is found for the URL, the content is appended after the URL and the record is written to a designated HDFS directory.

If no content is found, a toCrawl (to-be-crawled) flag is appended after the URL and the record is written to a different HDFS directory.

So a custom OutputFormat is needed to route different records to different locations.
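Purely for illustration (the URL below is made up, not from the original post), a matched record and an unmatched record would end up looking roughly like this in the two output files:

enhance.log:   <original log line>\t<content found in url_rule>
toCrawl.log:   http://example.com/somepage\ttoCrawl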

First, in the mapper's initialization (setup()), run the database query and cache all of the results in a HashMap.

package com.ll.bd;

import java.sql.*;
import java.util.Map;

public class Db {
    // Load every url -> content pair from the url_rule table into the given map
    public static void load(Map<String, String> hashMap) throws ClassNotFoundException, SQLException {
        Class.forName("com.mysql.jdbc.Driver");
        // Open the database connection
        String url = "jdbc:mysql://192.168.211.5:3306/jdbcdemo";
        String uid = "root";
        String pw = "111111";
        try (Connection conn = DriverManager.getConnection(url, uid, pw);
             Statement st = conn.createStatement();
             ResultSet res = st.executeQuery("select url,content from url_rule")) {
            while (res.next()) {
                hashMap.put(res.getString(1), res.getString(2));
            }
        }
    }
}
Then, in the mapper's setup(), call Db.load() and pass in the HashMap.

No reducer is needed. In the map phase, extract the URL from each line and look it up with ruleMap.get(url).

Depending on whether a value is found, append either the content or the toCrawl flag.

Finally, register the custom format on the job with job.setOutputFormatClass().

package com.ll.bd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

public class logEnhance {
    static class logEnhanceMap extends Mapper<LongWritable, Text, Text, NullWritable> {
        Map<String, String> ruleMap = new HashMap<String, String>();
        Text text = new Text();
        NullWritable v = NullWritable.get();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Load the rules from the database into the HashMap
            try {
                Db.load(ruleMap);
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Global counter
            Counter contextCounter = context.getCounter("counter", "count");

            String line = value.toString();
            String[] fields = line.split("\t");
            String url = fields[26];
            String content_tag = ruleMap.get(url);
            // If nothing is found, mark the URL as toCrawl for further processing;
            // otherwise append the looked-up content to the line
            if (content_tag == null) {
                text.set(url + "\t" + "toCrawl" + "\n");
                context.write(text, v);
            } else {
                text.set(line + "\t" + content_tag + "\n");
                context.write(text, v);
            }
            contextCounter.increment(1);
        }
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Set the mapper class; no reducer is used
        job.setMapperClass(logEnhanceMap.class);
        // Locate the jar containing this driver class
        job.setJarByClass(logEnhance.class);
        // job.setJar("path");
        // Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input path on HDFS
        FileInputFormat.setInputPaths(job, new Path("/home/lzq/input/test.txt"));
        // Output path (still required by FileOutputFormat)
        FileOutputFormat.setOutputPath(job, new Path("/home/lzq/output3"));
        // Map-only job: zero reduce tasks, so no shuffle
        job.setNumReduceTasks(0);
        // Use the custom OutputFormat
        job.setOutputFormatClass(outputform.class);
        // job.submit();
        // job.submit() returns immediately and gives no progress information;
        // waitForCompletion(true) prints the cluster's progress to the console
        boolean res = job.waitForCompletion(true);
        // Exit with 0 on success, 1 on failure; a shell script can then check $?
        System.exit(res ? 0 : 1);
    }
}

The most important part is the class that extends FileOutputFormat:
package com.ll.bd;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class outputform extends FileOutputFormat<Text, NullWritable> {
    // When a map task (or reduce task) writes its final output, the framework first calls
    // the OutputFormat's getRecordWriter() to obtain a RecordWriter.
    // RecordWriter is an abstract class and cannot be instantiated directly,
    // so a subclass (EnhanceRecordWriter) is defined below.
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Build the HDFS output streams here and hand them to EnhanceRecordWriter via its constructor.
        // FileSystem.get(new Configuration()) is slow, but a Configuration already exists in the
        // task context, so use taskAttemptContext.getConfiguration() instead.
        FileSystem fs = FileSystem.get(taskAttemptContext.getConfiguration());
        Path enhancePath = new Path("hdfs://mini01:9000/enhance.log");
        Path toCrawlPath = new Path("hdfs://mini01:9000/toCrawl.log");

        FSDataOutputStream enOs = fs.create(enhancePath);
        FSDataOutputStream toCrawlOs = fs.create(toCrawlPath);

        return new EnhanceRecordWriter(enOs, toCrawlOs);
    }

    // The RecordWriter returned by the method above
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        FSDataOutputStream enOs = null;
        FSDataOutputStream toCrawlOs = null;

        public EnhanceRecordWriter(FSDataOutputStream enOs, FSDataOutputStream toCrawlOs) {
            this.enOs = enOs;
            this.toCrawlOs = toCrawlOs;
        }

        @Override
        public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
            // Write to HDFS. If the streams were created here instead, a new stream would be
            // opened on every write() call, which would be very inefficient.
            // Note: Text.getBytes() returns the internal buffer, so only the first
            // getLength() bytes are valid and should be written.
            if (text.toString().contains("toCrawl")) {
                toCrawlOs.write(text.getBytes(), 0, text.getLength());
            } else {
                enOs.write(text.getBytes(), 0, text.getLength());
            }
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            if (toCrawlOs != null) {
                toCrawlOs.close();
            }
            if (enOs != null) {
                enOs.close();
            }
        }
    }
}
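One caveat: every map task calls getRecordWriter(), so with more than one map task all tasks would try to create the same two fixed files. A minimal sketch of one way around that (not from the original post; the directory layout below is an assumption) is to fold the task ID into each file name:

// Hypothetical variant for getRecordWriter(): give every task its own pair of files
// by appending the task ID, e.g. /enhance/enhance-0.log, /enhance/enhance-1.log, ...
int taskId = taskAttemptContext.getTaskAttemptID().getTaskID().getId();
Path enhancePath = new Path("hdfs://mini01:9000/enhance/enhance-" + taskId + ".log");
Path toCrawlPath = new Path("hdfs://mini01:9000/toCrawl/toCrawl-" + taskId + ".log");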


