1. 对access.log进行数据的预处理并进行清洗,将静态请求去除,
//过滤所有静态的资源请求
if(url.startsWith("GET /static")){
return;
}
if(url.startsWith("GET")){
url = url.substring("GET ".length()+1, url.length()-" HTTP/1.1".length());
}
if(url.startsWith("POST")){
url = url.substring("POST ".length()+1, url.length()-" HTTP/1.1".length());
}
if(url.endsWith(".js")||url.endsWith(".css")||url.endsWith(".js")||url.endsWith(".jpg")||url.endsWith(".png")||url.endsWith(".gif"))
return;
v.set(ip+"\t"+logtime +"\t"+url);
hadoop jar /home/hadoop/cleaner.jar /flume/$CURRENT /cleaned/$CURRENT
2 hive外部表创建
create EXTERNAL table accesslog(ip string, time string ,url string) partitioned by (cleantime string) row format delimited fields terminated by '\t'
3. 加载数据
LOAD DATA INPATH '/cleaned/20160623' INTO TABLE accesslog partition (20160623) ;
4. 创建ip地址范围区域表,并加载到hive中
create table ip_address (start_ip string,end_ip string ,province_city string,area string) row format delimited fields terminated by '\t';
./sqoop import --connect jdbc:mysql://10.28.0.132:3306/test --username root --password admin --query 'SELECT start_ip,end_ip,province_city,area FROM ip_address where $CONDITIONS' --target-dir '/user/hive/warehouse/ip_address' --hive-table ip_address -hive-import -m 1
5.对accesslog进行初步分析
create table iptimes(ip string ,times int) row format delimited fields terminated by '\t';
insert into iptimes select ip,count(ip) from accesslog where cleantime='20160623' group by ip;
6. 编写hiveudf inet_aton增加到hive中
add jar /home/hadoop/inet_aton.jar;
create temporary function inet_aton as 'com.qht.core.inet2n';
7,ip地址段预处理,加速运行速度
create table ipadress_temp (start_ip bigint,end_ip bigint ,province_city string) row format delimited fields terminated by '\t';
insert into ipadress_temp select inet_aton(start_ip), inet_aton(end_ip), province_city from ip_address;
8. 区域分析
create table iparea row format delimited fields terminated by '\t' as select a.ip, b.province_city , a.times from iptimes a left outer join ip_address b where inet_aton(a.ip)<inet_aton(b.end_ip) and inet_aton(a.ip)>inet_aton(b.start_ip);
9. 分析统计
<pre name="code" class="sql">create table arestimes row format delimited fields terminated by '\t' as select province_city,sum(times) as time from iparea group by province_city order by time;;