There is no reduce stage here: the map output is written out directly, so the data never goes through shuffle and records come out in the same order they went in.
The default output format is TextOutputFormat; to customize the output we have to extend FileOutputFormat.
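Roughly, such a subclass only has to supply getRecordWriter(), which hands the framework a RecordWriter whose write() and close() we control. A bare-bones sketch for orientation (class and method names here are placeholders; the actual outputform class used in this example appears at the end):

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        // the framework calls this once per task to obtain the writer it will feed records to
        return new RecordWriter<Text, NullWritable>() {
            @Override
            public void write(Text key, NullWritable value) throws IOException {
                // decide, per record, which file or stream it should go to
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException {
                // close whatever streams were opened
            }
        };
    }
}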
Scenario: while analyzing a log, look up each URL in a database. If content is found for the URL, append that content to the record and write it to one HDFS directory; if nothing is found, append a "toCrawl" (to-be-crawled) tag after the URL and write it to a different HDFS directory. For example, a line whose URL exists in the rule table might come out as "<original line>\t<content>", while an unknown URL comes out as "<url>\ttoCrawl".
So we need a custom OutputFormat that routes different records to different locations.
First, when the mapper initializes (in setup()), run the database query and cache all of the results in a HashMap:
package com.ll.bd;

import java.sql.*;
import java.util.Map;

public class Db {
    public static void load(Map<String, String> hashMap) throws ClassNotFoundException, SQLException {
        Connection conn = null;
        Statement st = null;
        ResultSet res = null;
        Class.forName("com.mysql.jdbc.Driver");
        // establish the database connection
        String url = "jdbc:mysql://192.168.211.5:3306/jdbcdemo";
        String uid = "root";
        String pw = "111111";
        conn = DriverManager.getConnection(url, uid, pw);
        st = conn.createStatement();
        // cache every url -> content rule in the map
        res = st.executeQuery("select url,content from url_rule");
        while (res.next()) {
            hashMap.put(res.getString(1), res.getString(2));
        }
        res.close();
        st.close();
        conn.close();
    }
}

The mapper's setup() then calls Db.load() and passes in the HashMap.
No reducer is needed. In the map phase, extract the URL from the record, look it up with hashMap.get(url), and, depending on whether a value comes back, append either the content or the toCrawl tag before writing the record out.
In the driver, register the custom format with job.setOutputFormatClass(outputform.class) and set job.setNumReduceTasks(0), since there is no reducer.
package com.ll.bd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

public class logEnhance {

    static class logEnhanceMap extends Mapper<LongWritable, Text, Text, NullWritable> {

        Map<String, String> ruleMap = new HashMap<String, String>();
        Text text = new Text();
        NullWritable v = NullWritable.get();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // load the rule table from the database into the HashMap
            try {
                Db.load(ruleMap);
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // global counter, visible in the job's counter output
            Counter contextCounter = context.getCounter("counter", "count");
            String line = value.toString();
            String[] fields = line.split("\t");
            String url = fields[26];
            String content_tag = ruleMap.get(url);
            // if no content is found, tag the url with "toCrawl" so it ends up in the to-crawl output;
            // otherwise append the content to the original line
            if (content_tag == null) {
                text.set(url + "\t" + "toCrawl" + "\n");
                context.write(text, v);
            } else {
                text.set(line + "\t" + content_tag + "\n");
                context.write(text, v);
            }
            contextCounter.increment(1);
        }
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // set the mapper class; this job has no reducer
        job.setMapperClass(logEnhanceMap.class);

        // set the jar that contains this program
        job.setJarByClass(logEnhance.class);
        // job.setJar("path");

        // the mapper's output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // input path on hdfs (use the .lib.input FileInputFormat)
        FileInputFormat.setInputPaths(job, new Path("/home/lzq/input/test.txt"));
        // output path (still required even though the custom format creates its own files)
        FileOutputFormat.setOutputPath(job, new Path("/home/lzq/output3"));

        // map-only job with the custom output format
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(outputform.class);

        // job.submit() alone would not report progress; waitForCompletion(true)
        // prints the cluster's running information
        boolean res = job.waitForCompletion(true);
        // exit code 0 means success, 1 means failure; a shell script can check it via $?
        System.exit(res ? 0 : 1);
    }
}

The most important part is the class that extends FileOutputFormat:
package com.ll.bd;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class outputform extends FileOutputFormat<Text, NullWritable> {

    // When a map task (or reduce task) writes its final output, it first calls the
    // OutputFormat's getRecordWriter() to obtain a RecordWriter.
    // RecordWriter is an abstract class that cannot be instantiated directly,
    // so we return an instance of our own subclass.
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Build the hdfs output streams here and hand them to EnhanceRecordWriter through its constructor.
        // FileSystem.get(new Configuration()) would be slow; the configuration already lives
        // in the task context, so use taskAttemptContext.getConfiguration().
        FileSystem fs = FileSystem.get(taskAttemptContext.getConfiguration());
        Path enhancePath = new Path("hdfs://mini01:9000/enhance.log");
        Path toCrawlPath = new Path("hdfs://mini01:9000/toCrawl.log");
        FSDataOutputStream enOs = fs.create(enhancePath);
        FSDataOutputStream toCrawlOs = fs.create(toCrawlPath);
        return new EnhanceRecordWriter(enOs, toCrawlOs);
    }

    // The RecordWriter returned above: it routes each record to one of the two streams.
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {

        FSDataOutputStream enOs = null;
        FSDataOutputStream toCrawlOs = null;

        public EnhanceRecordWriter(FSDataOutputStream enOs, FSDataOutputStream toCrawlOs) {
            this.enOs = enOs;
            this.toCrawlOs = toCrawlOs;
        }

        @Override
        public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
            // Write to hdfs. If the streams were opened here instead of in the constructor,
            // every write() call would create a new stream, which would be very inefficient.
            // Text.getBytes() may return a buffer longer than the content, so write only getLength() bytes.
            if (text.toString().contains("toCrawl")) {
                toCrawlOs.write(text.getBytes(), 0, text.getLength());
            } else {
                enOs.write(text.getBytes(), 0, text.getLength());
            }
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            if (toCrawlOs != null) {
                toCrawlOs.close();
            }
            if (enOs != null) {
                enOs.close();
            }
        }
    }
}
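One caveat with the hard-coded hdfs://mini01:9000/enhance.log and toCrawl.log paths: if the job runs more than one map task, every task calls getRecordWriter() and creates the same two files, so the tasks would overwrite each other. A minimal sketch of one way around that, deriving per-task file names under the directory passed to FileOutputFormat.setOutputPath(); this variant of getRecordWriter() is my own assumption, not part of the original code, and would drop in place of the method above:

    // hypothetical variant of getRecordWriter() giving each map task its own pair of files
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(ctx.getConfiguration());
        int taskId = ctx.getTaskAttemptID().getTaskID().getId();   // 0, 1, 2, ... per map task
        Path outDir = FileOutputFormat.getOutputPath(ctx);         // the dir from setOutputPath()
        Path enhancePath = new Path(outDir, "enhance-" + taskId + ".log");
        Path toCrawlPath = new Path(outDir, "toCrawl-" + taskId + ".log");
        return new EnhanceRecordWriter(fs.create(enhancePath), fs.create(toCrawlPath));
    }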