Python根据AccessLog统计对应Url的点击量2

85 篇文章 1 订阅
36 篇文章 0 订阅
要实现的需求,跟【 Python根据AccessLog统计对应Url的点击量--> http://blog.csdn.net/babyfish13/article/details/70304308】一文中所描述的需求相同,只是方法各异。
前文中是纯shell(cat、grep、wc等)+Python总控调度,采用多线程汇总统计Hdsf中的数据文件;本文是通过先将Hdfs中的文件按照规则拷贝至新目录,然后建Hive外部表的,最后在Hive里进行查询汇总。
两种方式,面对如此大的数据量和查询统计,性能都不具有明显优势;本文中,之所以没采用多线程,集群的内存限制是其一方面,一个线程就将内存吃满了。
1、处理日期小时的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/DateHourCalc.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import threading

warnings.filterwarnings("ignore")

def dateRange(beginDate, endDate):
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(1)
        date = dt.strftime("%Y-%m-%d")
    return dates

c = threading.RLock()
def dateHourRange(beginDateHour, endDateHour):
    with c:
        dateHours = []
        dt = datetime.datetime.strptime(beginDateHour, "%Y-%m-%d.%H")
        dateHour = beginDateHour[:]
        while dateHour <= endDateHour:
            dateHours.append(dateHour)
            dt = dt + datetime.timedelta(hours=1)
            dateHour = dt.strftime("%Y-%m-%d.%H")
        return dateHours


# print dateHourRange(beginDateHour='2017-04-14.03', endDateHour='2017-04-17.11')

2、拷贝数据及Hive建表添加分区的脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/cpOssFileAndCreateExtTable_byHour.py
# -*- coding=utf-8 -*-
import os
from DateHourCalc import *

warnings.filterwarnings("ignore")

def cpOssFileAndCreateExtTable_byHour(beginDateHour, endDateHour):
    os.system("""hadoop dfs -rm -r -skipTrash /tmp/oss_access_test""");
    os.system(
        """hive -e "drop table if exists xx_ext_oss_access; \
                    CREATE EXTERNAL TABLE xx_ext_oss_access( \
                      log_text string) \
                    PARTITIONED BY ( \
                      pt_day string, \
                      pt_hour string) \
                    ROW FORMAT SERDE \
                      'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' \
                    WITH SERDEPROPERTIES ( \
                      'field.delim'=',', \
                      'serialization.format'=',') \
                    STORED AS INPUTFORMAT \
                      'org.apache.hadoop.mapred.TextInputFormat' \
                    OUTPUTFORMAT \
                      'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' \
                    LOCATION \
                      'hdfs://emr-cluster/tmp/oss_access_test' \
                      ;" """);
    for dateHour in dateHourRange(beginDateHour=beginDateHour, endDateHour=endDateHour):
        date = dateHour[0:10]
        hour = dateHour[11:13]
        # print date,hour,dateHour
        os.system("""hadoop dfs -rm -r -skipTrash /tmp/oss_access_test/{date}/{hour}""".format(date=date, hour=hour));
        os.system("""hadoop dfs -mkdir -p /tmp/oss_access_test/{date}/{hour}""".format(date=date, hour=hour));
        os.system("""hadoop dfs -cp /tmp/oss_access/{date}/*_localhost_access_log.{dateHour}.txt /tmp/oss_access_test/{date}/{hour}/""".format(date=date, dateHour=dateHour, hour=hour));
        os.system("""hive -e "alter table xx_ext_oss_access add partition (pt_day='{date}', pt_hour='{hour}') location '{date}/{hour}';" """.format(date=date, hour=hour));

        os.system("""echo "{dateHour} data copy complete!" """.format(dateHour=dateHour));

    os.system("""echo "{beginDateHour} ~ {endDateHour} data proc complete;Now ,you can start Hive Select!" """.format(beginDateHour=beginDateHour, endDateHour=endDateHour));


# cpOssFileAndCreateExtTable_byHour(beginDateHour='2017-04-17.10', endDateHour='2017-04-24.17')

3、统计链接点击量的查询脚本
/Users/nisj/PycharmProjects/BiDataProc/hitsCalc2/hitsCalc.py
# -*- coding=utf-8 -*-
import time
from cpOssFileAndCreateExtTable_byHour import *

warnings.filterwarnings("ignore")

now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time

def hitsCalc(bigClass, url, beginDateHour, endDateHour):
    os.system("""hive -e "select '{bigClass}:{url_rp}-{beginDateHour}~{endDateHour} -->',count(*) \
                            from xx_ext_oss_access a1
                            where concat(a1.pt_day,'.',a1.pt_hour) between '{beginDateHour}' and '{endDateHour}'
                            and log_text like '%{url}%';" > {bigClass}:{url_rp}-{beginDateHour}~{endDateHour}.txt"""
              .format(bigClass=bigClass, url=url, url_rp=str(url).replace('/','#'), beginDateHour=beginDateHour, endDateHour=endDateHour))

batch_Parameter_list=[(['ZhanNeiXinLianJie','/event/april/captain/list.htm?_s=znx','2017-04-21.18','2017-04-23.00'],None),
(['KplZhanBao','/information/734.htm','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/m/information/734.htm','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1599710.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1600791.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1601607.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplZhanBao','/gamezone/video/play/1602357.htm?_s=zx','2017-04-17.10','2017-04-22.10'],None),
(['KplShanZhouZhongJie','/information/735.htm','2017-04-17.11','2017-04-22.11'],None),
(['KplShanZhouZhongJie','/m/information/735.htm','2017-04-17.11','2017-04-22.11'],None),
(['WangZheZhengMongZhou3','/information/736.htm','2017-04-17.14','2017-04-22.14'],None),
(['WangZheZhengMongZhou3','/m/information/736.htm','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/3310097.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['XianMianYingXiongTaoLuWang','/room/16367098.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/35688.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/1261052.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/2527213.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/2419313.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/14571.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['GuoFuZhuiQianJiaoXueYing','/room/3881137.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/33290.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/5823473.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/32468513.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/18618.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/32777.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/22278549.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['PaiWeiJiJieHao','/room/1261052.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/6148385.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/14717142.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/29227558.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/30627511.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['MeiNvYuLe','/room/34733.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/20541.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/14294.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/14294.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['ShuiYouBiPing','/room/4522560.htm?_s=zx','2017-04-17.14','2017-04-22.14'],None),
(['SheiShiYuGao','/information/737.htm','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/m/information/737.htm','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1010.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1077.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1070.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1011.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1061.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1079.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['SheiShiYuGao','/room/1074.htm?_s=zx','2017-04-17.15','2017-04-22.15'],None),
(['ChuShouShiKe','/information/738.htm','2017-04-17.18','2017-04-22.18'],None),
(['ChuShouShiKe','/m/information/738.htm','2017-04-17.18','2017-04-22.18'],None),
(['ChuShouShiKe','/u/1239132928.htm?_s=zx','2017-04-17.18','2017-04-22.18'],None),
(['MengZhuZhengBaShai','/information/743.htm','2017-04-20.13','2017-04-24.13'],None),
(['MengZhuZhengBaShai','/m/information/743.htm','2017-04-20.13','2017-04-24.13'],None),
(['BaoBaoXiaZhuanFang','/information/744.htm','2017-04-20.15','2017-04-23.15'],None),
(['BaoBaoXiaZhuanFang','/m/information/744.htm','2017-04-20.15','2017-04-23.15'],None),
(['PaiWeiLaoSiJi','/information/745.htm','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLaoSiJi','/m/information/745.htm','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLaoSiJi','/gamezone/pvp.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/26031018.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/1948541.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/1382675.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/29962075.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/15963830.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/12612445.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/21143935.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/29962075.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/8061320.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/34733.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/3016002.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/2815598.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/18814265.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/6932195.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/33290.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/2861735.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['PaiWeiLiaoSiJiHuoDongYi','/room/35344.htm?_s=zx','2017-04-20.17','2017-04-24.17'],None),
(['KplZhanBao2','/information/748.htm','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/m/information/748.htm','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1623346.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1624558.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1625740.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao2','/gamezone/video/play/1626037.htm?_s=zx','2017-04-22.11','2017-04-24.11'],None),
(['KplZhanBao3','/information/749.htm','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/m/information/749.htm','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1633213.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1634176.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1635420.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None),
(['KplZhanBao3','/gamezone/video/play/1636706.htm?_s=zx','2017-04-23.12','2017-04-24.12'],None)]

for batch_Parameter in batch_Parameter_list:
    bigClass = batch_Parameter[0][0]
    url = batch_Parameter[0][1]
    beginDateHour = batch_Parameter[0][2]
    endDateHour = batch_Parameter[0][3]
    hitsCalc(bigClass=bigClass, url=url, beginDateHour=beginDateHour, endDateHour=endDateHour)

now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值