HIVE 处理日志，自定义inputformat 完整版

最新推荐文章于 2022-12-08 11:03:06 发布

wisgood

最新推荐文章于 2022-12-08 11:03:06 发布

阅读量3.2k

点赞数

分类专栏： Hive

Hive 专栏收录该内容

93 篇文章 7 订阅

订阅专栏

网上找了很多资料，但都只给出了部分代码；今天在峰哥的帮助下实现了完整的功能。

之所以需要此功能，是因为 Hive 的 fields terminated by 不支持 '||||' 这样的多字符字符串作为分隔符。

将你的inputformat类打成jar包，如MyInputFormat.jar
将MyInputFormat.jar放到 hive/lib里，然后就可以建表了
假设你的inputFormat类路径是com.hive.myinput
则建表语句为：create table tbname(name string,id int, ...) stored as INPUTFORMAT 'com.hive.myinput' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

HiveIgnoreKeyTextOutputFormat是系统自带的outputformat类，你也可以自定义

由于 Hive 是基于 Hadoop 集群运行的，所以 hadoop/lib 里面也必须放入 MyInputFormat.jar。

此功能需要两个类：ClickstreamInputFormat 和 ClickstreamRecordReader

[java]view plaincopyprint? 
   
 package com.jd.cloud.clickstore;  
   
 import java.io.IOException;    
   
 import org.apache.hadoop.io.LongWritable;    
 import org.apache.hadoop.io.Text;    
 import org.apache.hadoop.mapred.FileSplit;    
 import org.apache.hadoop.mapred.InputSplit;    
 import org.apache.hadoop.mapred.JobConf;    
 import org.apache.hadoop.mapred.JobConfigurable;    
 import org.apache.hadoop.mapred.RecordReader;    
 import org.apache.hadoop.mapred.Reporter;    
 import org.apache.hadoop.mapred.TextInputFormat;  
   
 /**  
  * 自定义hadoop的 org.apache.hadoop.mapred.InputFormat  
  *   
  * @author winston  
  *   
  */    
 public class ClickstreamInputFormat extends TextInputFormat implements    
         JobConfigurable {    
     
     public RecordReader<LongWritable, Text> getRecordReader(    
             InputSplit genericSplit, JobConf job, Reporter reporter)    
             throws IOException {    
     
         reporter.setStatus(genericSplit.toString());    
         return new ClickstreamRecordReader((FileSplit) genericSplit,job);    
     }    
 }    

[java]view plaincopyprint? 
   
 package com.jd.cloud.clickstore;  
   
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.util.LineReader;
   
   
 public class ClickstreamRecordReader implements  
         RecordReader<LongWritable, Text> {  
   
   
     private CompressionCodecFactory compressionCodecs = null;  
     private long start;  
     private long pos;  
     private long end;  
     private LineReader lineReader;  
     int maxLineLength;  
   
     public ClickstreamRecordReader(FileSplit inputSplit, Configuration job)  
             throws IOException {  
         maxLineLength = job.getInt("mapred.ClickstreamRecordReader.maxlength",  
                 Integer.MAX_VALUE);  
         start = inputSplit.getStart();  
         end = start + inputSplit.getLength();  
         final Path file = inputSplit.getPath();  
         compressionCodecs = new CompressionCodecFactory(job);  
         final CompressionCodec codec = compressionCodecs.getCodec(file);  
   
         // Open file and seek to the start of the split  
         FileSystem fs = file.getFileSystem(job);  
         FSDataInputStream fileIn = fs.open(file);  
         boolean skipFirstLine = false;  
         if (codec != null) {  
             lineReader = new LineReader(codec.createInputStream(fileIn), job);  
             end = Long.MAX_VALUE;  
         } else {  
             if (start != 0) {  
                 skipFirstLine = true;  
                 --start;  
                 fileIn.seek(start);  
             }  
             lineReader = new LineReader(fileIn, job);  
         }  
         if (skipFirstLine) {  
             start += lineReader.readLine(new Text(), 0,  
                     (int) Math.min((long) Integer.MAX_VALUE, end - start));  
         }  
         this.pos = start;  
     }  
   
     public ClickstreamRecordReader(InputStream in, long offset, long endOffset,  
             int maxLineLength) {  
         this.maxLineLength = maxLineLength;  
         this.lineReader = new LineReader(in);  
         this.start = offset;  
         this.pos = offset;  
         this.end = endOffset;  
     }  
   
     public ClickstreamRecordReader(InputStream in, long offset, long endOffset,  
             Configuration job) throws IOException {  
         this.maxLineLength = job.getInt(  
                 "mapred.ClickstreamRecordReader.maxlength", Integer.MAX_VALUE);  
         this.lineReader = new LineReader(in, job);  
         this.start = offset;  
         this.pos = offset;  
         this.end = endOffset;  
     }  
   
     public LongWritable createKey() {  
         return new LongWritable();  
     }  
   
     public Text createValue() {  
         return new Text();  
     }  
   
     /** 
      * Reads the next record in the split. get usefull fields from the raw nginx 
      * log. 
      *  
      * @param key 
      *            key of the record which will map to the byte offset of the 
      *            record's line 
      * @param value 
      *            the record in text format 
      * @return true if a record existed, false otherwise 
      * @throws IOException 
      */  
     public synchronized boolean next(LongWritable key, Text value)  
             throws IOException {  
         // Stay within the split  
         while (pos < end) {  
             key.set(pos);  
             int newSize = lineReader.readLine(value, maxLineLength,  
                     Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),  
                             maxLineLength));  
   
             if (newSize == 0)  
                 return false;  
   
             String str = value.toString().toLowerCase()  
                     .replaceAll("\\@\\_\\@", "\001");  
             value.set(str);  
             pos += newSize;  
   
             if (newSize < maxLineLength)  
                 return true;  
         }  
   
         return false;  
     }  
   
     public float getProgress() {  
         if (start == end) {  
             return 0.0f;  
         } else {  
             return Math.min(1.0f, (pos - start) / (float) (end - start));  
         }  
     }  
   
     public synchronized long getPos() throws IOException {  
         return pos;  
     }  
   
     public synchronized void close() throws IOException {  
         if (lineReader != null)  
             lineReader.close();  
     }  
       
     // 测试 输出  
     //public static void main(String ags[]){  
     //    String str1 ="123@_@abcd@_@fk".replaceAll("\\@\\_\\@", "\001");  
     //    System.out.println(str1);  
     //}  
 }  

1. 将源码上传到 Hive 服务器，用 javac 编译

[plain]view plaincopyprint? 
   
 # Compile the sources with the current directory and the listed Hadoop / commons-logging jars on the classpath.
 # NOTE(review): the trailing glob depends on the local directory layout — confirm it matches only the .java files.
 javac -cp ./:/usr/lib/hadoop/hadoop-common.jar:/home/op1/hadoop/hadoop-core-1.0.3.jar:/usr/lib/hadoop/lib/commons-logging-1.1.1.jar */**/*/*/*  

2.JAR 打包类文件

[java]view plaincopyprint? 
   
 # Package relative to the source root (-C) so that jar entries start at com/jd/cloud/clickstore/
 # and match the package name; an absolute directory argument would store them under home/op1/uerdwdb/src/.
 # Assumes the compiled .class files sit under /home/op1/uerdwdb/src/ in their package directories.
 jar -cf ClickstreamInputFormat.jar -C /home/op1/uerdwdb/src/ .

3. 将打好的 jar 包复制到 Hive/lib 和 Hadoop/lib 文件夹内

4.Hive 创建表命令

[sql]view plaincopyprint? 
   
 -- Creates a table whose files are read through the custom ClickstreamInputFormat
 -- and written with Hive's built-in HiveIgnoreKeyTextOutputFormat.
 -- NOTE(review): LOCATION here points at a single file; Hive normally expects a directory — confirm.
 create table hive_text(num int,name string,`add` string)  
 stored as INPUTFORMAT 'com.jd.cloud.clickstore.ClickstreamInputFormat'   
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'   
 location '/home/op1/uerdwdb/text.txt';  

wisgood

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
HIVE 处理日志，自定义inputformat 完整版

网上找了很多材料都是写了部份代码的，今天在峰哥的帮助下实现了此功能。为何要设置此功能是由于 hive fields terminated by '||||' 不支持字符串导致将你的inputformat类打成jar包，如MyInputFormat.jar将MyInputFormat.jar放到 hive/lib里，然后就可以建表了假设你的input
复制链接

扫一扫

专栏目录