需求:简单处理一个web日志,该日志共有11个字段,5794条,现要进行简单处理
展示日志中的一条数据:
"27.38.5.159" "-" "31/Aug/2015:00:04:37 +0800" "GET /course/view.php?id=27 HTTP/1.1" "303" "440" - "http://www.ibeifeng.com/user.php?act=mycourse" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36" "-" "learn.ibeifeng.com"
1、在hive中创建表并加载数据
//创建数据原表
create table if not exists default.web_log(
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_refer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
//加载数据
load data local inpath '/usr/datas/moodle.ibeifeng.access.log' into table default.bf_log;
2、创建一个新表,用来存储 ip、时间、请求、父链接
create table if not exists default.web_log_common(
remote_addr string,
time_local string,
request string,
http_refer string
)
row format delimited fields terminated by '\t'
stored as orc tblproperties("orc.compress"="SNAPPY");
3、UDF
//去除引号
public class RemoveQuotes extends UDF {
public Text evaluate(Text a) {
if(a == null) {
return new Text();
}
if(a.toString() == null) {
return new Text();
}
return new Text(a.toString().replaceAll("\"", ""));
}
}
//规范时间
public class FormatDate extends UDF {
private final SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
private final SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MMM-dd: HH:mm:ss");
@SuppressWarnings("unused")
public Text evaluate(Text input) {
Text output = new Text();
String inputdate = input.toString().trim();
if(input == null) {
return null;
}
if(inputdate == null) {
return null;
}
try {
Date oldDate = inputFormat.parse(inputdate);
String newDate = outputFormat.format(oldDate);
output.set(newDate);
} catch (ParseException e) {
e.printStackTrace();
return output;
}
return output;
}
}
//获取IP前两段
public class IpAddress extends UDF {
public Text evaluate(Text ip) {
Text output = new Text();
if(ip == null) {
return new Text();
}
if(ip.toString() == null) {
return new Text();
}
if(ip.toString() .equals("")) {
return new Text();
}
System.out.println(ip);
String[] split = (ip.toString()).split("\\.");
String ipAddress = split[0] + "." + split[1];
output.set(ipAddress);
return output;
}
}
add jar /usr/datas/myUDF.jar;
1、create temporary function remove_quotes as "com.bpf.hive.udf.RemoveQuotes";
2、create temporary function formatDate as "com.bpf.hive.udf.FormatDate";
3、create temporary function ipAddress as "com.bpf.hive.udf.IpAddress";
4、新表中插入数据
insert overwrite table default.web_log_common select remove_quotes(remote_addr), formatDate(remove_quotes(time_local)), remove_quotes(request),remove_quotes(http_refer) from default.web_log_src;
5、哪个时间段请求网页的次数多
select t.hour, count(*) cnt from
(select substring(time_local,13,2) hour from web_log_common) t
group by t.hour order by cnt desc;
6、IP所在地域多
select t.ip_prex, count(*) cnt from
(select ipAddress(remote_addr) ip_prex from web_log_common) t
group by t.ip_prex order by cnt desc;