,日志各地区访问量统计分析

1. 对access.log进行数据的预处理并进行清洗,将静态请求去除,

//过滤所有静态的资源请求
			if(url.startsWith("GET /static")){
				return;
			}
			
			if(url.startsWith("GET")){
				url = url.substring("GET ".length()+1, url.length()-" HTTP/1.1".length());
			}
			if(url.startsWith("POST")){
				url = url.substring("POST ".length()+1, url.length()-" HTTP/1.1".length());
			}
			
			if(url.endsWith(".js")||url.endsWith(".css")||url.endsWith(".js")||url.endsWith(".jpg")||url.endsWith(".png")||url.endsWith(".gif"))
				return;
			v.set(ip+"\t"+logtime +"\t"+url);

hadoop jar /home/hadoop/cleaner.jar /flume/$CURRENT /cleaned/$CURRENT

2  hive外部表创建

create EXTERNAL  table accesslog(ip string, time string ,url string) partitioned by (cleantime string) row format delimited fields terminated by '\t' 

3. 加载数据

LOAD DATA INPATH '/cleaned/20160623' INTO TABLE accesslog partition (20160623) ;

4. 创建ip地址范围区域表,并加载到hive中

create table ip_address (start_ip string,end_ip string ,province_city string,area string) row format delimited fields terminated by '\t';

./sqoop import --connect jdbc:mysql://10.28.0.132:3306/test --username root --password admin  --query 'SELECT start_ip,end_ip,province_city,area FROM ip_address where $CONDITIONS' --target-dir '/user/hive/warehouse/ip_address'   --hive-table ip_address -hive-import  -m 1


5.对accesslog进行初步分析

create table iptimes(ip string ,times int) row format delimited fields terminated by '\t';

insert into iptimes  select ip,count(ip) from accesslog where cleantime='20160623' group by ip;

6. 编写hiveudf     inet_aton增加到hive中

add jar /home/hadoop/inet_aton.jar;
create temporary function inet_aton as 'com.qht.core.inet2n';


7,ip地址段预处理,加速运行速度

create table ipadress_temp (start_ip bigint,end_ip bigint ,province_city string) row format delimited fields terminated by '\t';
insert into ipadress_temp select inet_aton(start_ip), inet_aton(end_ip), province_city from ip_address;

8.  区域分析

create table iparea row format delimited fields terminated by '\t' as select a.ip, b.province_city , a.times from iptimes a left outer join ip_address b where inet_aton(a.ip)<inet_aton(b.end_ip) and inet_aton(a.ip)>inet_aton(b.start_ip);

9. 分析统计

<pre name="code" class="sql">create table arestimes row format delimited fields terminated by '\t' as select province_city,sum(times) as time from iparea group by province_city order by time;;


 

 










评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值