使用 Hive 分析数据:
Hive 会对存放在 HDFS 中的日志进行分析:你只需编写相应的分析 SQL 语句,Hive 会调用 MapReduce 来完成分析任务。我测试用的是 Squid 日志,日志条目(log entry)如下:
1356867313.430 109167 10.10.10.1 TCP_MISS/200 51498 CONNECT securepics.example.com:443 - HIER_DIRECT/securepics.example.com -
[ ]*([0-9]*)[^ ]*[ ]*([^ ]*) ([^ ]*) ([^ |^ /]*)/([0-9]*) ([0-9]*) ([^ ]*) ((?:([^:]*)://)?([^/:]+):?([0-9]*)?(/?[^ ]*)) ([^ ]*) ([^/]+)/([^ ]+) (.*)
hive>
-- External Hive table over raw Squid access-log files stored under
-- /user/hive/warehouse/squid. Each text line is split into columns by
-- RegexSerDe: the N-th capture group of "input.regex" populates the N-th
-- column (16 capture groups <-> 16 columns, all STRING).
-- NOTE(review): the contrib RegexSerDe class presumably requires the
-- hive-contrib jar to be on the session classpath (ADD JAR ...) - confirm
-- for your installation.
-- All string literals must use plain ASCII quotes; typographic ("smart")
-- quotes are not valid HiveQL string delimiters.
CREATE EXTERNAL TABLE IF NOT EXISTS squidtable (
    ttamp        STRING,
    duration     STRING,
    clientip     STRING,
    action       STRING,
    http_status  STRING,
    bytes        STRING,
    method       STRING,
    uri          STRING,
    proto        STRING,
    uri_host     STRING,
    uri_port     STRING,
    uri_path     STRING,
    username     STRING,
    hierarchy    STRING,
    server_ip    STRING,
    content_type STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "[ ]*([0-9]*)[^ ]*[ ]*([^ ]*) ([^ ]*) ([^ |^ /]*)/([0-9]*) ([0-9]*) ([^ ]*) ((?:([^:]*)://)?([^/:]+):?([0-9]*)?(/?[^ ]*)) ([^ ]*) ([^/]+)/([^ ]+) (.*)",
    "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s"
)
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/squid';
一些分析SQL例子:
-- Total number of log entries in the table.
SELECT COUNT(*)
FROM squidtable;

-- Number of log entries issued by client IP 10.10.10.1.
SELECT COUNT(*)
FROM squidtable
WHERE clientip = '10.10.10.1';

-- Top 10 client IPs by number of requests.
-- ORDER BY is used (rather than SORT BY) because SORT BY is only documented
-- to order rows within each reducer, whereas ORDER BY guarantees a total order.
SELECT
    clientip,
    COUNT(1) AS numrequest
FROM squidtable
GROUP BY clientip
ORDER BY numrequest DESC
LIMIT 10;