When using Hive to analyze logs with SQL statements, you have to configure how the fields are delimited. For example, a table created in Hive can use a format like the following:
CREATE TABLE (...)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT 'NginxLogInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
To let the raw logs be dropped into Hadoop as soon as they are collected and analyzed by Hive without any preprocessing, we can customize the InputFormat class that Hadoop uses when running MapReduce. The code below makes exactly that kind of InputFormat customization, so the key fields can be extracted directly from the raw log.
The custom NginxLogInputFormat class is as follows:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
/**
* A custom input format for dealing with nginx logs
* @author johnson
*/
public class NginxLogInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
// Hand each split to the custom record reader, which rewrites every raw
// nginx log line into the tab-separated fields Hive expects.
return new NginxLogRecordReader((FileSplit)inputSplit, job);
}
}
Besides NginxLogInputFormat, we also need a class that implements RecordReader to read the key/value pairs. The NginxLogRecordReader class is as follows:
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.LineReader;
/**
* A record reader for splits generated from the nginx daily access log.
* @author johnson
*
*/
public class NginxLogRecordReader implements RecordReader<LongWritable, Text> {
private static final Log LOG = LogFactory.getLog(NginxLogRecordReader.class.getName());
private CompressionCodecFactory compressionCodecs = null;
private long start;
private long pos;
private long end;
private LineReader lineReader;
int maxLineLength;
public NginxLogRecordReader(FileSplit inputSplit, Configuration job) throws IOException {
maxLineLength = job.getInt("mapred.nginxlogrecordreader.maxlength", Integer.MAX_VALUE);
start = inputSplit.getStart();
end = start + inputSplit.getLength();
final Path file = inputSplit.getPath();
compressionCodecs = new CompressionCodecFactory(job);
final CompressionCodec codec = compressionCodecs.getCodec(file);
// Open file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(file);
boolean skipFirstLine = false;
if (codec != null) {
lineReader = new LineReader(codec.createInputStream(fileIn), job);
end = Long.MAX_VALUE;
} else {
if (start != 0) {
skipFirstLine = true;
--start;
fileIn.seek(start);
}
lineReader = new LineReader(fileIn, job);
}
if (skipFirstLine) {
start += lineReader.readLine(new Text(), 0, (int)Math.min((long)Integer.MAX_VALUE, end - start));
}
this.pos = start;
}
public NginxLogRecordReader(InputStream in, long offset, long endOffset, int maxLineLength) {
this.maxLineLength = maxLineLength;
this.lineReader = new LineReader(in);
this.start = offset;
this.pos = offset;
this.end = endOffset;
}
public NginxLogRecordReader(InputStream in, long offset, long endOffset, Configuration job)
throws IOException {
this.maxLineLength = job.getInt("mapred.nginxlogrecordreader.maxlength", Integer.MAX_VALUE);
this.lineReader = new LineReader(in, job);
this.start = offset;
this.pos = offset;
this.end = endOffset;
}
public LongWritable createKey() {
return new LongWritable();
}
public Text createValue() {
return new Text();
}
/**
* Reads the next record in the split and extracts the useful fields from the raw nginx log line.
* @param key key of the record which will map to the byte offset of the record's line
* @param value the record in text format
* @return true if a record existed, false otherwise
* @throws IOException
*/
public synchronized boolean next(LongWritable key, Text value) throws IOException {
// Stay within the split
while (pos < end) {
key.set(pos);
int newSize = lineReader.readLine(value, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
if (newSize == 0)
return false;
NginxLogLineParser parser = new NginxLogLineParser(value.toString(), "\"");
String str = parser.getClientIP() + "\t" + parser.getFinishTime() + "\t" +
parser.getRequestTime() + "\t" + parser.getRequest() + "\t" + parser.getCode() + "\t" +
parser.getTraffc() + "\t" + parser.getReffer() + "\t" + parser.getUserAgent() + "\t" +
parser.getForward();
value.set(str);
pos += newSize;
if (newSize < maxLineLength)
return true;
}
return false;
}
public float getProgress() {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float)(end - start));
}
}
public synchronized long getPos() throws IOException {
return pos;
}
public synchronized void close() throws IOException {
if (lineReader != null)
lineReader.close();
}
}
The NginxLogLineParser class extracts the key fields from a raw log line. Its implementation is as follows:
public class NginxLogLineParser {
private String[] data;
public NginxLogLineParser(String str, String delimter){
String[] splitStr = str.split(delimter);
data = splitStr;
}
public String getClientIP() {
String clientIP = "--";
try{
String ipContext = data[0];
String[] splitIpContext = ipContext.split(" ");
clientIP = splitIpContext[0].trim();
}catch (Exception e){
// return clientIP;
}
return clientIP;
}
public String getFinishTime() {
String finishTime = "--";
try{
String timeContext = data[0].trim();
String pattern = "([^\\[]*)\\[([^\\+]*)(.*)";
finishTime = timeContext.replaceAll(pattern, "$2").trim();
}catch(Exception e){
}
return finishTime;
}
public String getRequestTime() {
String requestTime = "--";
try{
String timeContext = data[0].trim();
String[] timeSplit = timeContext.split(" ");
requestTime = timeSplit[timeSplit.length - 1].trim();
}catch(Exception e){ }
return requestTime;
}
public String getRequest() {
String request = "--";
try{
request = data[1].trim();
}catch(Exception e) {}
return request;
}
public String getCode() {
String code = "--";
try{
String statusContext = data[2].trim();
String[] codeSplit = statusContext.split(" ");
code = codeSplit[0].trim();
}catch(Exception e){}
return code;
}
public String getTraffc() {
String traffic = "--";
try{
String trafficContext = data[2].trim();
String[] trafficSplit = trafficContext.split(" ");
traffic = trafficSplit[1].trim();
}catch(Exception e){}
return traffic;
}
public String getReffer() {
String reffer = "--";
try{
reffer = data[3].trim();
}catch(Exception e){}
return reffer;
}
public String getUserAgent() {
String agent = "--";
try{
agent = data[5].trim();
}catch(Exception e){}
return agent;
}
public String getForward() {
String forward = "--";
try{
forward = data[7].trim();
}catch(Exception e){
}
return forward;
}
}
Here is an example of how it is used.
The following is a sample nginx log line:
192.168.194.28 - - [13/Sep/2012:00:00:01 +0800]0.044 "GET /robots.txt HTTP/1.1"200 243 646 195 "http://www.baidu.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SMT Player 3.9.1.430; .NET CLR 1.1.4322; .NET CLR 2.0.50727)" "192.168.194.92" "bytes=528384-1052671"
The fields we need from this line are the client IP, finish time, request time, request, status code, traffic, referer, user agent, and forward address. When Hive runs its MapReduce analysis with NginxLogInputFormat specified as the input format, these fields are parsed out of each line automatically.
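To see what the parser produces, here is a minimal, stand-alone sketch (the NginxLogLineParserDemo class is only an illustration, not part of the code above) that feeds the sample line into NginxLogLineParser and rebuilds the tab-separated record the same way NginxLogRecordReader.next() does:

public class NginxLogLineParserDemo {
    public static void main(String[] args) {
        // The sample access log line shown above
        String line = "192.168.194.28 - - [13/Sep/2012:00:00:01 +0800]0.044 "
                + "\"GET /robots.txt HTTP/1.1\"200 243 646 195 "
                + "\"http://www.baidu.com\" "
                + "\"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; "
                + "SMT Player 3.9.1.430; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\" "
                + "\"192.168.194.92\" \"bytes=528384-1052671\"";
        // The record reader splits each line on double quotes
        NginxLogLineParser parser = new NginxLogLineParser(line, "\"");
        // Rebuild the tab-separated value exactly as NginxLogRecordReader.next() does
        String record = parser.getClientIP() + "\t" + parser.getFinishTime() + "\t"
                + parser.getRequestTime() + "\t" + parser.getRequest() + "\t"
                + parser.getCode() + "\t" + parser.getTraffc() + "\t"
                + parser.getReffer() + "\t" + parser.getUserAgent() + "\t"
                + parser.getForward();
        System.out.println(record);
    }
}

Compiled together with NginxLogLineParser, this prints the fields separated by tabs, which is exactly the row Hive will see.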
Create a table for the nginx log in Hive as follows:
create table nginx_log(
host STRING,
finishTime STRING,
requestTime STRING,
request STRING,
status STRING,
traffic STRING,
reffer STRING,
userAgent STRING,
forward STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS INPUTFORMAT 'NginxLogInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
Package the compiled NginxLogInputFormat.class, NginxLogRecordReader.class, and NginxLogLineParser.class into nginx.jar, then add it to Hive's CLASSPATH from the Hive command line with add jar nginx.jar; otherwise Hive will report that the classes cannot be found when the query runs.
Finally, put the nginx access log into the corresponding directory with hadoop fs -put nginx_access_log /user/hive/warehouse/nginx_log.db/access_log. After that, the fields of the nginx log can be queried directly in Hive.
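Once the data is in place, the parsed fields behave like ordinary columns. For example, a query along the following lines (just an illustration against the nginx_log table defined above) counts requests per HTTP status code:

select status, count(*) as request_count
from nginx_log
group by status;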