数据的流向是:先从Oss拉取原始日志文件数据,根据Hive分区表建表规范放在HDFS上;然后将数据根据不同的过滤条件分别插入到对应的其他中间表;由于空间有限,原始数据转移处理好之后,立马删除从Oss上拉取下来的日志数据;之后,再分别对各中间表数据利用Hive正则进行切分和提取,并根据需求进行汇总统计及tableau展现。
1、Hive目标库建表
2、python处理数据日期时间处理
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/DateCalc.py
3、从Oss获取数据的主Python脚本
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/getFileFromOss.py
4、Oss数据拉取并行调度
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchThread.py
5、数据处理的总控
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchCtl.py
6、数据的切割提取
1、Hive目标库建表
-- Raw landing table for the OSS access logs: each row is one raw log line
-- (text_connect), partitioned by day.  An explicit LOCATION is set so the
-- OSS -> HDFS distcp jobs can write files directly under
-- /tmp/nisj/oss_access_log/pt_day=YYYY-MM-DD/ before partitions are added.
drop table if exists xx_oss_access_log;
create table xx_oss_access_log
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile
location '/tmp/nisj/oss_access_log';

-- The intermediate tables below share the same single-column, day-partitioned
-- layout; each holds the subset of raw lines matching one payment URL filter
-- (the filters are applied by the driver script further down).

-- Rows whose request line contains /m/pay.htm.
drop table if exists xx_oss_access_log_pay;
create table xx_oss_access_log_pay
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /m/pay-ios.htm.
drop table if exists xx_oss_access_log_pay_ios;
create table xx_oss_access_log_pay_ios
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /m/pay-appstore.htm.
drop table if exists xx_oss_access_log_pay_appstore;
create table xx_oss_access_log_pay_appstore
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;

-- Rows whose request line contains /pay.htm (PC payments).
drop table if exists xx_oss_access_log_pay_pc;
create table xx_oss_access_log_pay_pc
(
text_connect string
)
partitioned by (pt_day string)
row format delimited
stored as textfile;
2、python处理数据日期时间处理
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/DateCalc.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import calendar
warnings.filterwarnings("ignore")
def getNowDay():
    """Return today's local date formatted as a 'YYYY-MM-DD' string."""
    return datetime.datetime.today().strftime('%Y-%m-%d')
def getYesterDay():
    """Return yesterday's local date formatted as a 'YYYY-MM-DD' string."""
    one_day = datetime.timedelta(days=1)
    return (datetime.datetime.today() - one_day).strftime('%Y-%m-%d')
def getMonthOfYear():
    """Return the twelve month numbers as zero-padded strings '01'..'12'."""
    return [str(m).zfill(2) for m in range(1, 13)]
def getDayOfMonth(year, month):
    """Return zero-padded day strings ('01'..'NN') for the given month.

    Generalized over the original: `year` and `month` may be ints or numeric
    strings.  Callers in this file slice 'YYYY-MM-DD' strings, so the original
    int-only signature would raise inside calendar.monthrange; coercing with
    int() keeps existing int callers working and fixes the string case.
    """
    # calendar.monthrange returns (weekday_of_first_day, days_in_month).
    last_day = calendar.monthrange(int(year), int(month))[1]
    return [str(day).zfill(2) for day in range(1, last_day + 1)]
def dateRange(beginDate, endDate):
    """List every 'YYYY-MM-DD' date from beginDate through endDate, inclusive.

    Returns [] when endDate sorts before beginDate.  The loop condition
    compares the formatted strings (ISO dates sort lexicographically), matching
    the original implementation exactly.
    """
    dates = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    stamp = beginDate
    while stamp <= endDate:
        dates.append(stamp)
        cursor += datetime.timedelta(days=1)
        stamp = cursor.strftime("%Y-%m-%d")
    return dates
# Leftover driver/test scaffolding: walks the date range and slices each date
# into year/month/day strings.  The pieces are never consumed (the print that
# used them is commented out), so this loop only rebinds the loop variables.
for date in dateRange('2017-02-01', '2017-03-14'):
    year = date[0:4]
    month = date[5:7]
    day = date[8:10]
    # print getDayOfMonth(year=2017, month=int(month))
3、从Oss获取数据的主Python脚本
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/getFileFromOss.py
# -*- coding=utf-8 -*-
import os
from DateCalc import *
warnings.filterwarnings("ignore")
def getParameterInfo(date_start, date_end):
    """Enumerate distcp work items for every (server, day, hour) combination.

    Returns a list of 9-tuples
    (AccessKeyId, AccessKeySecret, bucket, endpoint, server_name,
     year, month, day, hour)
    — one per server, per day in [date_start, date_end], per hour '00'..'23'.
    """
    AccessKeyId = "AccessKeyId-string"
    AccessKeySecret = "AccessKeySecret-string"
    bucket = "chushou-hz"
    endpoint = "oss-cn-hangzhou-internal.aliyuncs.com"
    server_list = ['sz-1-1','sz-113-14','sz-121-210','sz-128-140','sz-128-50']
    # Hours of the day as zero-padded strings (same values the original listed
    # out literally); constant, so built once outside the loops.
    hours = [str(h).zfill(2) for h in range(24)]
    ParameterList = []
    for server_name in server_list:
        for date in dateRange(date_start, date_end):
            year, month, day = date[0:4], date[5:7], date[8:10]
            for hour in hours:
                ParameterList.append((AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour))
    return ParameterList
def getOssFile(AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour):
    """Copy one hourly access-log file from OSS into its HDFS day-partition dir.

    Builds the OSS object key and HDFS target path for the given
    server/date/hour, then shells out to `hadoop distcp` for the copy.
    Side effects only; returns None.
    """
    date = year+"-"+month+"-"+day
    dateTime = date+"."+hour
    # print AccessKeyId,AccessKeySecret,bucket,endpoint,server_name,year,month,day,hour
    ossFileName = "JellyFishLogs/"+server_name+"/"+year+"/"+month+"/"+day+"/localhost_access_log."+dateTime+".txt"
    # NOTE(review): ossPath is never used below and appears to be missing a "/"
    # between the domain and ossFileName — harmless here, but fix before reuse.
    ossPath = "http://chushou-hz.oss-cn-hangzhou-internal.aliyuncs.com"+ossFileName
    # print ossPath
    # Target file is prefixed with the server name so hourly files from the
    # five servers don't collide inside the shared pt_day directory.
    hdfsFilePath = "/tmp/nisj/oss_access_log/pt_day="+date+"/"
    hdfsFileName = hdfsFilePath+server_name+"#localhost_access_log."+dateTime+".txt"
    # NOTE(review): credentials are embedded in the distcp URI and will be
    # visible in the process list; consider a credential provider instead.
    os.system("""hadoop distcp oss://%s:%s@%s.%s/%s hdfs:%s""" % (AccessKeyId, AccessKeySecret, bucket, endpoint, ossFileName, hdfsFileName))
    # distcp_command = """hadoop distcp oss://%s:%s@%s.%s/%s hdfs:%s""" % (AccessKeyId, AccessKeySecret, bucket, endpoint, ossFileName, hdfsFileName)
    # print distcp_command
# test code
# date_start='2017-02-06'
# date_end='2017-02-06'
# for Parameter in getParameterInfo(date_start, date_end):
# AccessKeyId = Parameter[0]
# AccessKeySecret = Parameter[1]
# bucket = Parameter[2]
# endpoint = Parameter[3]
# server_name = Parameter[4]
# year = Parameter[5]
# month = Parameter[6]
# day = Parameter[7]
# hour = Parameter[8]
#
# getOssFile(AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour)
4、Oss数据拉取并行调度
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchThread.py
# -*- coding=utf-8 -*-
import threadpool
import time
from getFileFromOss import *
warnings.filterwarnings("ignore")
# Date anchors computed once at import time.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# NOTE(review): yesterday/tomorrow are not referenced in the visible code of
# this module — possibly kept for interactive use; confirm before removing.
tomorrow = today + datetime.timedelta(days=1)
def batchThread(run_day):
    """Pull all of run_day's OSS log files into HDFS via a 28-worker thread pool.

    Builds one work item per (server, hour) combination for the single day,
    submits them all to a `threadpool` pool, then polls every 30 seconds until
    every request has finished (signalled by threadpool.NoResultsPending) or
    the user interrupts.  Side effects only (hadoop distcp via getOssFile);
    returns None.  Python 2 code (print statements).
    """
    now_time = time.strftime('%Y-%m-%d %X', time.localtime())
    # Message means "the current time is:".
    print "当前时间是:",now_time
    # Single-day pull: range start and end are both run_day.
    date_start = run_day
    date_end = run_day
    batch_Parameter_list = []
    for Parameter in getParameterInfo(date_start, date_end):
        AccessKeyId = Parameter[0]
        AccessKeySecret = Parameter[1]
        bucket = Parameter[2]
        endpoint = Parameter[3]
        server_name = Parameter[4]
        year = Parameter[5]
        month = Parameter[6]
        day = Parameter[7]
        hour = Parameter[8]
        # threadpool.makeRequests expects ([positional_args], kwargs) pairs.
        batch_Parameter_list.append(([AccessKeyId, AccessKeySecret, bucket, endpoint, server_name, year, month, day, hour], None))
    requests = []
    request_getOssFile = threadpool.makeRequests(getOssFile, batch_Parameter_list)
    requests.extend(request_getOssFile)
    main_pool = threadpool.ThreadPool(28)
    [main_pool.putRequest(req) for req in requests]
    # Poll loop: sleep 30s, then poll(); threadpool raises NoResultsPending
    # once every request has completed, which ends the loop.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break
    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()
    now_time = time.strftime('%Y-%m-%d %X', time.localtime())
    print "当前时间是:",now_time
5、数据处理的总控
/Users/nisj/PycharmProjects/BiDataProc/getOssFile/BatchCtl.py
# -*- coding=utf-8 -*-
from BatchThread import *
warnings.filterwarnings("ignore")
# Date anchors computed once at import time (duplicates the same trio defined
# in BatchThread, which this module star-imports).
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# NOTE(review): none of these three are referenced in the visible code of this
# module — the backfill range below is hard-coded; confirm before removing.
tomorrow = today + datetime.timedelta(days=1)
# Backfill driver: for each day in the hard-coded range, (re)create the raw
# partition, pull that day's OSS files in parallel, fan the rows out into the
# four pay filter tables, then drop the raw partition to reclaim disk space.
# NOTE(review): relies on `os` reaching this scope through the
# `from BatchThread import *` -> `from getFileFromOss import *` -> `import os`
# chain rather than an explicit import here — confirm before refactoring.
for run_day in dateRange(beginDate='2017-02-01', endDate='2017-03-15'):
    # 1) Register a clean pt_day partition on the raw landing table so the
    #    distcp output directory is visible to Hive.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log drop if exists partition (pt_day='%s'); \
alter table xx_oss_access_log add partition (pt_day='%s');" \
""" % (run_day, run_day))
    # 2) Parallel OSS -> HDFS pull for this day (28-thread pool).
    batchThread(run_day)
    # 3) Rows containing /m/pay.htm -> generic pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 4) Rows containing /m/pay-ios.htm -> iOS pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_ios drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_ios add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_ios partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay-ios.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 5) Rows containing /m/pay-appstore.htm -> App Store pay table.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_appstore drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_appstore add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_appstore partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/m/pay-appstore.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 6) Rows containing /pay.htm -> PC pay table.  NOTE(review): this pattern
    #    also matches '/m/pay.htm' rows, so the PC table overlaps the generic
    #    pay table — confirm that overlap is intended.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log_pay_pc drop if exists partition (pt_day='{0}'); \
alter table xx_oss_access_log_pay_pc add partition (pt_day='{0}'); \
insert into xx_oss_access_log_pay_pc partition(pt_day='{0}') \
select text_connect from xx_oss_access_log \
where text_connect like '%/pay.htm%' \
and pt_day='{0}'; " """.format(run_day))
    # 7) Drop the day's raw partition: the raw copy is deleted as soon as the
    #    filtered tables hold the data, because disk space is the constraint.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
alter table xx_oss_access_log drop if exists partition (pt_day='%s');" \
""" % (run_day))
6、数据的切割提取
-- 新的Sql写法
-- Field-extraction preview: pull token / identifier / appSource out of each
-- raw request line.  regexp_extract(..., 0) returns the whole match
-- (e.g. 'token=abc123'), so substr strips the key prefix: 'token=' is 6 chars
-- (start 7), '_identifier=' is 12 (start 13), '_appSource=' is 11 (start 12).
-- NOTE(review): regexp_extract(..., 1) would return the capture group
-- directly and make the substr offsets unnecessary — confirm and simplify.
with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select *
from tab_access_log_pay limit 100;
-- 统计分析
-- Daily request counts per appSource, using the same prefix-stripping
-- extraction as the preview query above (substr offsets skip the literal
-- 'token=' / '_identifier=' / '_appSource=' key prefixes).
with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select pt_day,appSource,count(*)
from tab_access_log_pay group by pt_day,appSource;
-- 结果数据导出
# Export the full extracted result set (tab-separated by hive's CLI default)
# to a local text file for downstream tableau reporting.
hive -e "with tab_access_log_pay as (
select pt_day,
substr(regexp_extract(text_connect,'token=([0-9a-fA-F]+)',0),7) token,
substr(regexp_extract(text_connect,'_identifier=([0-9]+)',0),13) identifier,
substr(regexp_extract(text_connect,'_appSource=([0-9]+)',0),12) appSource
from xx_oss_access_log_pay)
select *
from tab_access_log_pay ;" > xx_oss_access_log_pay.txt