三个版本,最老的版本将数据从Hive上取出来,然后用Python进行处理,现在看来这显然是不合适的,放着强大的Hadoop不用,而使用单机多线程;较老版本用手工生成的dict代码清单进行周的控制,且是从前往后计算,主要计算过程在Hadoop上;新版本充分利用了Python的datetime函数等,根据时段自动生成日期的列表,然后进行周的控制,从后往前计算,便于代码的复用,数据运算主要在Hadoop上运行。总体来看,
新版本比老版本更容易理解、更清晰、更充分利用了大数据平台的计算能力、复用性更强。
1、最早的版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread.py
2、老版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread_0103.py
3、新版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_pay_byWeek_thread.py
结果数据的查看
一些想法
对于用Python进行大数据平台下的跑批,Python的作用最好只限制在参数的生成、调用及总体流程的控制;而耗时的、具体的运算还是让Hive调用MapReduce去完成。Python的多线程最好也是用在调用Hive的MapReduce上;而不是数据的具体运算。
1、最早的版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
from itertools import groupby
from operator import itemgetter
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def getWeekFristday(weekflag):
yearnum = weekflag[0:4] # 取到年份
weeknum = weekflag[5:7] # 取到周
stryearstart = yearnum + '0101' # 当年第一天
yearstart = datetime.datetime.strptime(stryearstart, '%Y%m%d') # 格式化为日期格式
yearstartcalendarmsg = yearstart.isocalendar() # 当年第一天的周信息
yearstartweekday = yearstartcalendarmsg[2]
yearstartyear = yearstartcalendarmsg[0]
if yearstartyear < int(yearnum):
daydelat = (8 - int(yearstartweekday)) + (int(weeknum) - 1) * 7
else:
daydelat = (8 - int(yearstartweekday)) + (int(weeknum) - 2) * 7
week1day = (yearstart + datetime.timedelta(days=daydelat)).date()
return week1day
def user_remain_proc(batch_week):
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_byweek_static where data_week='%s'; \
" """ % (batch_week))
week1day = getWeekFristday(batch_week)
newuser_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
select a1.appsource,a1.appkey,a1.identifier from ( \
select appsource,appkey,identifier \
from bi_all_access_log \
where concat(year(pt_day),'#',weekofyear(pt_day)) = '%s' \
group by appsource,appkey,identifier) a1 \
left join \
(select appsource,appkey,identifier \
from bi_all_access_log \
where pt_day < '%s' ) a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
where a2.identifier is null \
;" \
"""
% (batch_week, week1day)).readlines();
week_dict = {"1": "2015#27", "2": "2015#28", "3": "2015#29", "4": "2015#30", "5": "2015#31", "6": "2015#32",
"7": "2015#33", "8": "2015#34", "9": "2015#35", "10": "2015#36", "11": "2015#37", "12": "2015#38",
"13": "2015#39", "14": "2015#40", "15": "2015#41", "16": "2015#42", "17": "2015#43", "18": "2015#44",
"19": "2015#45", "20": "2015#46", "21": "2015#47", "22": "2015#48", "23": "2015#49", "24": "2015#50",
"25": "2015#51", "26": "2015#52", "27": "2015#53", "28": "2016#1", "29": "2016#2", "30": "2016#3",
"31": "2016#4", "32": "2016#5", "33": "2016#6", "34": "2016#7", "35": "2016#8", "36": "2016#9",
"37": "2016#10", "38": "2016#11", "39": "2016#12", "40": "2016#13", "41": "2016#14", "42": "2016#15",
"43": "2016#16", "44": "2016#17", "45": "2016#18", "46": "2016#19", "47": "2016#20", "48": "2016#21",
"49": "2016#22", "50": "2016#23", "51": "2016#24", "52": "2016#25", "53": "2016#26", "54": "2016#27",
"55": "2016#28", "56": "2016#29", "57": "2016#30", "58": "2016#31", "59": "2016#32", "60": "2016#33",
"61": "2016#34", "62": "2016#35", "63": "2016#36", "64": "2016#37", "65": "2016#38", "66": "2016#39",
"67": "2016#40", "68": "2016#41", "69": "2016#42", "70": "2016#43", "71": "2016#44", "72": "2016#45",
"73": "2016#46", "74": "2016#47", "75": "2016#48", "76": "2016#49", "77": "2016#50", "78": "2016#51",
"79": "2016#52"}
for (k, week_value) in week_dict.items():
if week_value == batch_week:
for (kx, week_valuex) in week_dict.items():
if int(kx) >= int(k):
print k,kx,week_valuex
sql_text = "select appsource,appkey,identifier from bi_all_access_log where concat(year(pt_day),'#',weekofyear(pt_day)) = '%s' group by appsource,appkey,identifier;"%(week_valuex)
# print sql_text
week2user_data = os.popen("""/usr/lib/hive-current/bin/hive -e "%s" """%(sql_text)).readlines();
week2remain_data = set(newuser_data) & set(week2user_data)
week2remain_data = list(week2remain_data)
week2remain_list = []
for w2rm_list in week2remain_data:
w2l = re.split('\t',w2rm_list.replace('\n',''))
week2remain_list.append(w2l)
week2remain_data_sorted = sorted(week2remain_list, key=itemgetter(0, 1))
groupby_week2remain_data = groupby(week2remain_data_sorted, key=itemgetter(0, 1))
rl = []
for key, item in groupby_week2remain_data:
item_cnt = 0
for jtem in item:
item_cnt += 1
groupby_week2remain_list = (key, item_cnt)
rl.append(groupby_week2remain_list)
for x in rl:
# print x[0][0], x[0][1], x[1]
appsource = x[0][0]
appkey = x[0][1]
data_type = "%s-remain_cnt" % (week_valuex)
data_cnt = x[1]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_byweek_static(data_week,appsource,appkey,data_type,data_cnt,etl_time) \
select '%s','%s','%s','%s','%s','%s'; \
" """ % (batch_week, appsource, appkey, data_type, data_cnt, etl_time))
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
date = beginDate[:]
while date <= endDate:
dates.append(date)
dt = dt + datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
batch_week_list = ["2015#27","2015#28","2015#29","2015#30","2015#31","2015#32","2015#33","2015#34","2015#35","2015#36","2015#37","2015#38","2015#39","2015#40","2015#41","2015#42","2015#43","2015#44","2015#45","2015#46","2015#47","2015#48","2015#49","2015#50","2015#51","2015#52","2015#53","2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
# batch_week_list = ["2016#49","2016#50"]
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(40)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
2、老版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread_0103.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def user_remain_proc(batch_week):
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_pay_byweek_static where data_week='%s'; \
" """ % (batch_week))
week_dict = {"1": "2015#27", "2": "2015#28", "3": "2015#29", "4": "2015#30", "5": "2015#31", "6": "2015#32",
"7": "2015#33", "8": "2015#34", "9": "2015#35", "10": "2015#36", "11": "2015#37", "12": "2015#38",
"13": "2015#39", "14": "2015#40", "15": "2015#41", "16": "2015#42", "17": "2015#43", "18": "2015#44",
"19": "2015#45", "20": "2015#46", "21": "2015#47", "22": "2015#48", "23": "2015#49", "24": "2015#50",
"25": "2015#51", "26": "2015#52", "27": "2015#53", "28": "2016#1", "29": "2016#2", "30": "2016#3",
"31": "2016#4", "32": "2016#5", "33": "2016#6", "34": "2016#7", "35": "2016#8", "36": "2016#9",
"37": "2016#10", "38": "2016#11", "39": "2016#12", "40": "2016#13", "41": "2016#14", "42": "2016#15",
"43": "2016#16", "44": "2016#17", "45": "2016#18", "46": "2016#19", "47": "2016#20", "48": "2016#21",
"49": "2016#22", "50": "2016#23", "51": "2016#24", "52": "2016#25", "53": "2016#26", "54": "2016#27",
"55": "2016#28", "56": "2016#29", "57": "2016#30", "58": "2016#31", "59": "2016#32", "60": "2016#33",
"61": "2016#34", "62": "2016#35", "63": "2016#36", "64": "2016#37", "65": "2016#38", "66": "2016#39",
"67": "2016#40", "68": "2016#41", "69": "2016#42", "70": "2016#43", "71": "2016#44", "72": "2016#45",
"73": "2016#46", "74": "2016#47", "75": "2016#48", "76": "2016#49", "77": "2016#50", "78": "2016#51",
"79": "2016#52"}
for (k, week_value) in week_dict.items():
if week_value == batch_week:
for (kx, compare_week) in week_dict.items():
if int(kx) >= int(k):
print batch_week,compare_week
newuser_remain_pay_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
with new_user as (select a1.appsource,a1.appkey,a1.identifier,a1.uid from ( \
select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)) a1 \
left join \
(select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where year(pt_day)<int(substring('%s',1,4)) or (year(pt_day)=int(substring('%s',1,4)) and weekofyear(pt_day)<int(substring('%s',6)))) a2 \
on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource and a1.uid=a2.uid \
where a2.identifier is null \
), \
x_day_data as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
x_pay as (select uid,sum(amount) amount \
from data_chushou_pay_info \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by uid) \
select a1.appkey,a1.appsource,count(a1.identifier) remain_cnt,sum(a3.amount) pay_amount \
from new_user a1 \
inner join x_day_data a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
left join x_pay a3 on a1.uid=a3.uid \
group by a1.appkey,a1.appsource \
;" \
""" % (batch_week, batch_week, batch_week, batch_week, compare_week, compare_week)).readlines();
nrpd_list = []
for nrp_list in newuser_remain_pay_data:
nrp = re.split('\t', nrp_list.replace('\n', ''))
nrpd_list.append(nrp)
for nrpd in nrpd_list:
# print nrpd[0],nrpd[1],nrpd[2],nrpd[3]
remain_week = compare_week
appkey = nrpd[0]
appsource = nrpd[1]
remain_cnt = nrpd[2]
pay_amount = nrpd[3]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_pay_byweek_static(data_week,appsource,appkey,remain_week,remain_cnt,pay_amount,etl_time) \
select '%s','%s','%s','%s','%s','%s','%s'; \
" """ % (batch_week, appsource, appkey, remain_week, remain_cnt, pay_amount, etl_time))
# batch_week_list = ["2015#27","2015#28","2015#29","2015#30","2015#31","2015#32","2015#33","2015#34","2015#35","2015#36","2015#37","2015#38","2015#39","2015#40","2015#41","2015#42","2015#43","2015#44","2015#45","2015#46","2015#47","2015#48","2015#49","2015#50","2015#51","2015#52","2015#53","2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
batch_week_list = ["2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(8)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
3、新版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_pay_byWeek_thread.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
date = beginDate[:]
while date <= endDate:
dates.append(date)
dt = dt + datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
def weekRang(beginDate, endDate):
week = set()
for date in dateRange(beginDate, endDate):
week.add(datetime.date(int(date[0:4]), int(date[5:7]), int(date[8:10])).isocalendar()[0:2])
wk_l = []
for wl in sorted(list(week)):
wk_l.append(str(wl[0])+'#'+str(wl[1]))
return wk_l
def hisWeekList(curr_week):
last_wk = datetime.datetime.now() - datetime.timedelta(days=7)
end_day = str(last_wk)[0:10]
his_week_list = []
for week in weekRang('2015-07-01', end_day):
if (int(week[0:4]) == int(curr_week[0:4]) and int(week[5:]) <= int(curr_week[5:])) or (int(week[0:4]) < int(curr_week[0:4])):
his_week_list.append(week)
return his_week_list
def user_remain_proc(curr_week):
for his_week in hisWeekList(curr_week):
if (int(his_week[0:4]) == int(curr_week[0:4]) and int(his_week[5:]) <= int(curr_week[5:])) or (int(his_week[0:4]) < int(curr_week[0:4])):
# print his_week,curr_week
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_pay_byweek where data_week='%s' and remain_week='%s'; \
" """ % (his_week, curr_week))
newuser_remain_pay_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
with his_new_user as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log_of_new \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
curr_week_data as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
curr_week_pay as (select uid,sum(amount) amount \
from data_chushou_pay_info \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by uid) \
select a1.appkey,a1.appsource,count(distinct a2.identifier) remain_cnt,sum(a3.amount) pay_amount \
from his_new_user a1 \
inner join curr_week_data a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
left join curr_week_pay a3 on a1.uid=a3.uid \
group by a1.appkey,a1.appsource \
;" \
""" % (his_week, curr_week, curr_week)).readlines();
nrpd_list = []
for nrp_list in newuser_remain_pay_data:
nrp = re.split('\t', nrp_list.replace('\n', ''))
nrpd_list.append(nrp)
for nrpd in nrpd_list:
remain_week = curr_week
appkey = nrpd[0]
appsource = nrpd[1]
remain_cnt = nrpd[2]
pay_amount = nrpd[3]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_pay_byweek(data_week,appsource,appkey,remain_week,remain_cnt,pay_amount,etl_time) \
select '%s','%s','%s','%s','%s','%s','%s'; \
" """ % (his_week, appsource, appkey, remain_week, remain_cnt, pay_amount, etl_time))
batch_week_list = weekRang('2015-07-01', '2016-01-03')
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(12)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
结果数据的查看
select data_week,remain_week,count(*)
from funnyai_data.bi_user_remain_pay_byweek
group by data_week,remain_week
order by substring(remain_week,1,4)+0,substring(remain_week,6)+0,substring(data_week,1,4)+0,substring(data_week,6)+0
;
一些想法
对于用Python进行大数据平台下的跑批,Python的作用最好只限制在参数的生成、调用及总体流程的控制;而耗时的、具体的运算还是让Hive调用MapReduce去完成。Python的多线程最好也是用在调用Hive的MapReduce上;而不是数据的具体运算。