数据的流向是:先从Oss拉取原始日志文件数据,根据Hive分区表建表规范放在HDFS上;然后将数据根据不同的过滤条件分别插入到对应的其他中间表;由于空间有限,原始数据转移处理好之后,立马删除从Oss上拉取下来的日志数据;之后,再分别对各中间表数据利用Hive正则进行切分和提取,并根据需求进行汇总统计及tableau展现。
1、Hive目标库建表
2、python处理数据日期时间处理
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/DateCalc.py
3、从Oss获取数据的主Python脚本
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/getFileFromOss.py
4、Oss数据拉取并行调度
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchThread.py
5、数据处理的总控
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchCtl.py
6、数据的切割提取
1、Hive目标库建表
-- Raw landing table for the OSS access logs: each row is one raw log line
-- (text_connect), partitioned by day.  An explicit LOCATION is set so the
-- OSS -> HDFS distcp jobs can write files directly under
-- /tmp/nisj/oss_access_log/pt_day=YYYY-MM-DD/ before partitions are added.
drop table if exists xx_oss_access_log;
create table xx_oss_access_log
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile
location '/tmp/nisj/oss_access_log';

-- The intermediate tables below share the same single-column, day-partitioned
-- layout; each holds the subset of raw lines matching one payment URL filter
-- (the filters are applied by the driver script further down).

-- Rows whose request line contains /m/pay.htm.
drop table if exists xx_oss_access_log_pay;
create table xx_oss_access_log_pay
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /m/pay-ios.htm.
drop table if exists xx_oss_access_log_pay_ios;
create table xx_oss_access_log_pay_ios
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /m/pay-appstore.htm.
drop table if exists xx_oss_access_log_pay_appstore;
create table xx_oss_access_log_pay_appstore
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /pay.htm (PC payments).
drop table if exists xx_oss_access_log_pay_pc;
create table xx_oss_access_log_pay_pc
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;
2、python处理数据日期时间处理
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/DateCalc.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import calendar
warnings.filterwarnings("ignore")
def getNowDay():
    """Return today's local date formatted as a 'YYYY-MM-DD' string."""
    return datetime.datetime.today().strftime('%Y-%m-%d')
def getYesterDay():
    """Return yesterday's local date formatted as a 'YYYY-MM-DD' string."""
    one_day = datetime.timedelta(days=1)
    return (datetime.datetime.today() - one_day).strftime('%Y-%m-%d')
def getMonthOfYear():
    """Return the twelve month numbers as zero-padded strings '01'..'12'."""
    return [str(m).zfill(2) for m in range(1, 13)]
def getDayOfMonth(year, month):
    """Return zero-padded day strings ('01'..'NN') for the given month.

    Generalized over the original: `year` and `month` may be ints or numeric
    strings.  Callers in this file slice 'YYYY-MM-DD' strings, so the original
    int-only signature would raise inside calendar.monthrange; coercing with
    int() keeps existing int callers working and fixes the string case.
    """
    # calendar.monthrange returns (weekday_of_first_day, days_in_month).
    last_day = calendar.monthrange(int(year), int(month))[1]
    return [str(day).zfill(2) for day in range(1, last_day + 1)]
def dateRange(beginDate, endDate):
    """List every 'YYYY-MM-DD' date from beginDate through endDate, inclusive.

    Returns [] when endDate sorts before beginDate.  The loop condition
    compares the formatted strings (ISO dates sort lexicographically), matching
    the original implementation exactly.
    """
    dates = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    stamp = beginDate
    while stamp <= endDate:
        dates.append(stamp)
        cursor += datetime.timedelta(days=1)
        stamp = cursor.strftime("%Y-%m-%d")
    return dates
# Leftover driver/test scaffolding: walks the date range and slices each date
# into year/month/day strings.  The pieces are never consumed (the print that
# used them is commented out), so this loop only rebinds the loop variables.
for date in dateRange('2017-02-01', '2017-03-14'):
    year = date[0:4]
    month = date[5:7]
    day = date[8:10]
    # print getDayOfMonth(year=2017, month=int(month))
3、从Oss获取数据的主Python脚本
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/getFileFromOss.py
# -*- coding=utf-8 -*-
import os
from DateCalc import *
warnings.filterwarnings("ignore")
def getParameterInfo(date_start, date_end):
    """Enumerate distcp work items for every (server, day, hour) combination.

    Returns a list of 9-tuples
    (AccessKeyId, AccessKeySecret, bucket, endpoint, server_name,
     year, month, day, hour)
    — one per server, per day in [date_start, date_end], per hour '00'..'23'.
    """
    AccessKeyId = "AccessKeyId-string"
    AccessKeySecret = "AccessKeySecret-string"
    bucket = "chushou-hz"
    endpoint = "oss-cn-hangzhou-internal.aliyuncs.com"
    server_list = ['sz-1-1','sz-113-14','sz-121-210','sz-128-140','sz-128-50']
    # Hours of the day as zero-padded strings (same values the original listed
    # out literally); constant, so built once outside the loops.
    hours = [str(h).zfill(2) for h in range(24)]
    ParameterList = []
    for server_name in server_list:
        for date in dateRange(date_start, date_end):
            year, month, day = date[0:4], date[5:7], date[8:10]
            for hour in hours:
                ParameterList.append((AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour))
    return ParameterList
def getOssFile(AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour):
    """Copy one hourly access-log file from OSS into its HDFS day-partition dir.

    Builds the OSS object key and HDFS target path for the given
    server/date/hour, then shells out to `hadoop distcp` for the copy.
    Side effects only; returns None.
    """
    date = year+"-"+month+"-"+day
    dateTime = date+"."+hour
    # print AccessKeyId,AccessKeySecret,bucket,endpoint,server_name,year,month,day,hour
    ossFileName = "JellyFishLogs/"+server_name+"/"+year+"/"+month+"/"+day+"/localhost_access_log."+dateTime+".txt"
    # NOTE(review): ossPath is never used below and appears to be missing a "/"
    # between the domain and ossFileName — harmless here, but fix before reuse.
    ossPath = "http://chushou-hz.oss-cn-hangzhou-internal.aliyuncs.com"+ossFileName
    # print ossPath
    # Target file is prefixed with the server name so hourly files from the
    # five servers don't collide inside the shared pt_day directory.
    hdfsFilePath = "/tmp/nisj/oss_access_log/pt_day="+date+"/"
    hdfsFileName = hdfsFilePath+server_name+"#localhost_access_log."+dateTime+".txt"
    # NOTE(review): credentials are embedded in the distcp URI and will be
    # visible in the process list; consider a credential provider instead.
    os.system("""hadoop distcp oss://%s:%s@%s.%s/%s hdfs:%s""" % (AccessKeyId, AccessKeySecret, bucket, endpoint, ossFileName, hdfsFileName))
    # distcp_command = """hadoop distcp oss://%s:%s@%s.%s/%s hdfs:%s""" % (AccessKeyId, AccessKeySecret, bucket, endpoint, ossFileName, hdfsFileName)
    # print distcp_command
# test code
# date_start='2017-02-06'
# date_end='2017-02-06'
# for Parameter in getParameterInfo(date_start, date_end):
# AccessKeyId = Parameter[0]
# AccessKeySecret = Parameter[1]
# bucket = Parameter[2]
# endpoint = Parameter[3]
# server_name = Parameter[4]
# year = Parameter[5]
# month = Parameter[6]
# day = Parameter[7]
# hour = Parameter[8]
#
# getOssFile(AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour)
4、Oss数据拉取并行调度
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchThread.py
# -*- coding=utf-8 -*-
import threadpool
import time
from getFileFromOss import *
warnings.filterwarnings("ignore")
# Date anchors computed once at import time.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# NOTE(review): yesterday/tomorrow are not referenced in the visible code of
# this module — possibly kept for interactive use; confirm before removing.
tomorrow = today + datetime.timedelta(days=1)
def batchThread(run_day):
    """Pull all of run_day's OSS log files into HDFS via a 28-worker thread pool.

    Builds one work item per (server, hour) combination for the single day,
    submits them all to a `threadpool` pool, then polls every 30 seconds until
    every request has finished (signalled by threadpool.NoResultsPending) or
    the user interrupts.  Side effects only (hadoop distcp via getOssFile);
    returns None.  Python 2 code (print statements).
    """
    now_time = time.strftime('%Y-%m-%d %X', time.localtime())
    # Message means "the current time is:".
    print "当前时间是:",now_time
    # Single-day pull: range start and end are both run_day.
    date_start = run_day
    date_end = run_day
    batch_Parameter_list = []
    for Parameter in getParameterInfo(date_start, date_end):
        AccessKeyId = Parameter[0]
        AccessKeySecret = Parameter[1]
        bucket = Parameter[2]
        endpoint = Parameter[3]
        server_name = Parameter[4]
        year = Parameter[5]
        month = Parameter[6]
        day = Parameter[7]
        hour = Parameter[8]
        # threadpool.makeRequests expects ([positional_args], kwargs) pairs.
        batch_Parameter_list.append(([AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour], None))
    requests = []
    request_getOssFile = threadpool.makeRequests(getOssFile, batch_Parameter_list)
    requests.extend(request_getOssFile)
    main_pool = threadpool.ThreadPool(28)
    [main_pool.putRequest(req) for req in requests]
    # Poll loop: sleep 30s, then poll(); threadpool raises NoResultsPending
    # once every request has completed, which ends the loop.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break
    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()
    now_time = time.strftime('%Y-%m-%d %X', time.localtime())
    print "当前时间是:",now_time
5、数据处理的总控
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchCtl.py
# -*- coding=utf-8 -*-
from BatchThread import *
warnings.filterwarnings("ignore")
# Date anchors computed once at import time (duplicates the same trio defined
# in BatchThread, which this module star-imports).
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# NOTE(review): none of these three are referenced in the visible code of this
# module — the backfill range below is hard-coded; confirm before removing.
tomorrow = today + datetime.timedelta(days=1)
# Backfill driver: for each day in the hard-coded range, (re)create the raw
# partition, pull that day's OSS files in parallel, fan the rows out into the
# four pay filter tables, then drop the raw partition to reclaim disk space.
# NOTE(review): relies on `os` reaching this scope through the
# `from BatchThread import *` -> `from getFileFromOss import *` -> `import os`
# chain rather than an explicit import here — confirm before refactoring.
for run_day in dateRange(beginDate='2017-02-01', endDate='2017-03-15'):
    # 1) Register a clean pt_day partition on the raw landing table so the
    #    distcp output directory is visible to Hive.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log drop if exists partition (pt_day='%s'); \
alter table xx_oss_access_log add partition (pt_day='%s');" \
""" % (run_day, run_day))
    # 2) Parallel OSS -> HDFS pull for this day (28-thread pool).
    batchThread(run_day)
    # 3) Rows containing /m/pay.htm -> generic pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 4) Rows containing /m/pay-ios.htm -> iOS pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_ios drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_ios add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_ios partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay-ios.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 5) Rows containing /m/pay-appstore.htm -> App Store pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_appstore drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_appstore add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_appstore partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay-appstore.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 6) Rows containing /pay.htm -> PC pay table.  NOTE(review): this pattern
    #    also matches '/m/pay.htm' rows, so the PC table overlaps the generic
    #    pay table — confirm that overlap is intended.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_pc drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_pc add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_pc partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/pay.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 7) Drop the day's raw partition: the raw copy is deleted as soon as the
    #    filtered tables hold the data, because disk space is the constraint.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log drop if exists partition (pt_day='%s');" \
""" % (run_day))
6、数据的切割提取
-- 新的Sql写法
-- Field-extraction preview: pull token / identifier / appSource out of each
-- raw request line.  regexp_extract(..., 0) returns the whole match
-- (e.g. 'token=abc123'), so substr strips the key prefix: 'token=' is 6 chars
-- (start 7), '_identifier=' is 12 (start 13), '_appSource=' is 11 (start 12).
-- NOTE(review): regexp_extract(..., 1) would return the capture group
-- directly and make the substr offsets unnecessary — confirm and simplify.
with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select *
from tab_access_log_pay limit 100;
-- 统计分析
-- Daily request counts per appSource, using the same prefix-stripping
-- extraction as the preview query above (substr offsets skip the literal
-- 'token=' / '_identifier=' / '_appSource=' key prefixes).
with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select pt_day,appSource,count(*)
from tab_access_log_pay group by pt_day,appSource;
-- 结果数据导出
# Export the full extracted result set (tab-separated by hive's CLI default)
# to a local text file for downstream tableau reporting.
hive -e "with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select *
from tab_access_log_pay ;" > xx_oss_access_log_pay.txt