要实现的需求,跟【
Python根据AccessLog统计对应Url的点击量-->
http://blog.csdn.net/babyfish13/article/details/70304308】一文中所描述的需求相同,只是方法各异。
前文中是纯shell(cat、grep、wc等)+Python总控调度,采用多线程汇总统计Hdsf中的数据文件;本文是通过先将Hdfs中的文件按照规则拷贝至新目录,然后建Hive外部表的,最后在Hive里进行查询汇总。
两种方式,面对如此大的数据量和查询统计,性能都不具有明显优势;本文中,之所以没采用多线程,集群的内存限制是其一方面,一个线程就将内存吃满了。
1、处理日期小时的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/DateHourCalc.py
2、拷贝数据及Hive建表添加分区的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/cpOssFileAndCreateExtTable_byHour.py
3、统计链接点击量的查询脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/hitsCalc.py
前文中是纯shell(cat、grep、wc等)+Python总控调度,采用多线程汇总统计Hdsf中的数据文件;本文是通过先将Hdfs中的文件按照规则拷贝至新目录,然后建Hive外部表的,最后在Hive里进行查询汇总。
两种方式,面对如此大的数据量和查询统计,性能都不具有明显优势;本文中,之所以没采用多线程,集群的内存限制是其一方面,一个线程就将内存吃满了。
1、处理日期小时的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/DateHourCalc.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import threading
warnings.filterwarnings("ignore")
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
date = beginDate[:]
while date <= endDate:
dates.append(date)
dt = dt + datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
c = threading.RLock()
def dateHourRange(beginDateHour, endDateHour):
with c:
dateHours = []
dt = datetime.datetime.strptime(beginDateHour, "%Y-%m-%d.%H")
dateHour = beginDateHour[:]
while dateHour <= endDateHour:
dateHours.append(dateHour)
dt = dt + datetime.timedelta(hours=1)
dateHour = dt.strftime("%Y-%m-%d.%H")
return dateHours
# print dateHourRange(beginDateHour='2017-04-14.03', endDateHour='2017-04-17.11')
2、拷贝数据及Hive建表添加分区的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/cpOssFileAndCreateExtTable_byHour.py
# -*- coding=utf-8 -*-
import os
from DateHourCalc import *
warnings.filterwarnings("ignore")
def cpOssFileAndCreateExtTable_byHour(beginDateHour, endDateHour):
os.system("""hadoop dfs -rm -r -skipTrash /tmp/oss_access_test""");
os.system(
"""hive -e "drop table if exists xx_ext_oss_access; \
CREATE EXTERNAL TABLE xx_ext_oss_access( \
log_text string) \
PARTITIONED BY ( \
pt_day string, \
pt_hour string) \
ROW FORMAT SERDE \
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' \
WITH SERDEPROPERTIES ( \
'field.delim'=',', \
'serialization.format'=',') \
STORED AS INPUTFORMAT \
'org.apache.hadoop.mapred.TextInputFormat' \
OUTPUTFORMAT \
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' \
LOCATION \
'hdfs://emr-cluster/tmp/oss_access_test' \
;" """);
for dateHour in dateHourRange(beginDateHour=beginDateHour, endDateHour=endDateHour):
date = dateHour[0:10]
hour = dateHour[11:13]
# print date,hour,dateHour
os.system("""hadoop dfs -rm -r -skipTrash /tmp/oss_access_test/{date}/{hour}""".format(date=date, hour=hour));
os.system("""hadoop dfs -mkdir -p /tmp/oss_access_test/{date}/{hour}""".format(date=date, hour=hour));
os.system("""hadoop dfs -cp /tmp/oss_access/{date}/*_localhost_access_log.{dateHour}.txt /tmp/oss_access_test/{date}/{hour}/""".format(date=date, dateHour=dateHour, hour=hour));
os.system("""hive -e "alter table xx_ext_oss_access add partition (pt_day='{date}', pt_hour='{hour}') location '{date}/{hour}';" """.format(date=date, hour=hour));
os.system("""echo "{dateHour} data copy complete!" """.format(dateHour=dateHour));
os.system("""echo "{beginDateHour} ~ {endDateHour} data proc complete;Now ,you can start Hive Select!" """.format(beginDateHour=beginDateHour, endDateHour=endDateHour));
# cpOssFileAndCreateExtTable_byHour(beginDateHour='2017-04-17.10', endDateHour='2017-04-24.17')
3、统计链接点击量的查询脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/hitsCalc.py
# -*- coding=utf-8 -*-
import time
from cpOssFileAndCreateExtTable_byHour import *
warnings.filterwarnings("ignore")
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def hitsCalc(bigClass, url, beginDateHour, endDateHour):
os.system("""hive -e "select '{bigClass}:{url_rp}-{beginDateHour}~{endDateHour} -->',count(*) \
from xx_ext_oss_access a1
where concat(a1.pt_day,'.',a1.pt_hour) between '{beginDateHour}' and '{endDateHour}'
and log_text like '%{url}%';" > {bigClass}:{url_rp}-{beginDateHour}~{endDateHour}.txt"""
.format(bigClass=bigClass, url=url, url_rp=str(url).replace('/','#'), beginDateHour=beginDateHour, endDateHour=endDateHour))
batch_Parameter_list=[(['ZhanNeiXinLianJie','/event/april/captain/list.htm?_s=znx','2017-04-21.18','2017-04-23.00'],None),
(['KplZhanBao','/information/734.htm','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/m/information/734.htm','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1599710.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1600791.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1601607.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1602357.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplShanZhouZhongJie','/information/735.htm','2017-04-17.11','2017-04-22.11'],None),
(['KplShanZhouZhongJie','/m/information/735.htm','2017-04-17.11','2017-04-22.11'],None),
(['WangZheZhengMongZhou3','/information/736.htm','2017-04-17.14','2017-04-22.14'],None),
(['WangZheZhengMongZhou3','/m/information/736.htm','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/3310097.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/16367098.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/35688.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/1261052.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/2527213.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/2419313.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/14571.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/3881137.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/33290.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/5823473.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/32468513.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/18618.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/32777.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/22278549.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/1261052.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/6148385.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/14717142.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/29227558.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/30627511.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/20541.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/14294.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/14294.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/4522560.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['SheiShiYuGao','/information/737.htm','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/m/information/737.htm','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1010.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1077.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1070.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1011.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1061.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1079.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1074.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['ChuShouShiKe','/information/738.htm','2017-04-17.18','2017-04-22.18'],None),
(['ChuShouShiKe','/m/information/738.htm','2017-04-17.18','2017-04-22.18'],None),
(['ChuShouShiKe','/u/1239132928.htm?_s=zx','2017-04-17.18','2017-04-22.18'],None),
(['MengZhuZhengBaShai','/information/743.htm','2017-04-20.13','2017-04-24.13'],None),
(['MengZhuZhengBaShai','/m/information/743.htm','2017-04-20.13','2017-04-24.13'],None),
(['BaoBaoXiaZhuanFang','/information/744.htm','2017-04-20.15','2017-04-23.15'],None),
(['BaoBaoXiaZhuanFang','/m/information/744.htm','2017-04-20.15','2017-04-23.15'],None),
(['PaiWeiLaoSiJi','/information/745.htm','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLaoSiJi','/m/information/745.htm','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLaoSiJi','/gamezone/pvp.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/26031018.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/1948541.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/1382675.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/29962075.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/15963830.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/12612445.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/21143935.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/29962075.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/8061320.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/34733.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/3016002.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/2815598.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/18814265.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/6932195.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/33290.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/2861735.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/35344.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['KplZhanBao2','/information/748.htm','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/m/information/748.htm','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1623346.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1624558.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1625740.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1626037.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao3','/information/749.htm','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/m/information/749.htm','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1633213.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1634176.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1635420.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1636706.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None)]
for batch_Parameter in batch_Parameter_list:
bigClass = batch_Parameter[0][0]
url = batch_Parameter[0][1]
beginDateHour = batch_Parameter[0][2]
endDateHour = batch_Parameter[0][3]
hitsCalc(bigClass=bigClass, url=url, beginDateHour=beginDateHour, endDateHour=endDateHour)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time