Mysql分表数据,一般情况下我们是把他导入到数据仓库中整合到一张表中;由于环境受限,所以考虑使用Python脚本取分表数据在Python中进行计算的形式。
1、取Mysql分表进行汇总计算的Python脚本
/Users/nisj/PycharmProjects/BiDataProc/love/staticFromMysqlSubmeter.py
/Users/nisj/PycharmProjects/BiDataProc/love/listGroup.py
1、取Mysql分表进行汇总计算的Python脚本
/Users/nisj/PycharmProjects/BiDataProc/love/staticFromMysqlSubmeter.py
# -*- coding=utf-8 -*-
import os
import re
from itertools import groupby
from operator import itemgetter
def getMysqlSubmeterData(submeter_cnt, src_tabName):
allDataList=[]
for submeterPlus in range(0, submeter_cnt, 1):
submeterData=os.popen("""source /etc/profile; \
/usr/local/mysql/bin/mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -Djellyfish_quest \
-N -e"set names utf8; \
select uid,\`date\` \
from {src_tabName}_{submeterPlus} \
where \`date\` between '2017-11-24' and '2017-11-27' and task_id in(14,15) and state=1;" \
""".format(src_tabName=src_tabName, submeterPlus=submeterPlus)).readlines();
submeterDataList = []
for submeterDataRow in submeterData:
submeterD = re.split('\t', submeterDataRow.replace('\n', '').replace('`', '').replace('\'', '').replace('"', ''))
submeterDataList.append(submeterD)
allDataList.extend(submeterDataList)
duplicateRemovalList=[]
for allD in allDataList:
if allD not in duplicateRemovalList:
duplicateRemovalList.append(allD)
groupbyAllData = groupby(sorted(duplicateRemovalList, key=itemgetter(1)), itemgetter(1))
for key, item in groupbyAllData:
i = 0
for jtem in item:
i += 1
print key, i
# Batch Test
getMysqlSubmeterData(submeter_cnt=256, src_tabName='user_task_done_list')
2、在Hive进行Sql查询的脚本select `date`,count(distinct uid)
from oss_user_task_done_list
where `date` between '2017-11-24' and '2017-11-27' and task_id in(14,15) and state=1
group by `date`;
3、另一个数据分组汇总的简单示例/Users/nisj/PycharmProjects/BiDataProc/love/listGroup.py
# -*- coding=utf-8 -*-
from itertools import groupby
lst = [1, 2, 8, 11, 25, 43, 6, 9, 29, 51, 66, 100]
def dataClassification(num):
if num <= 10:
return 'less'
elif num >= 30:
return 'great'
else:
return 'middle'
# Batch Test
print [(k, list(g))for k, g in groupby(sorted(lst), key=dataClassification)]
for key, item in groupby(sorted(lst), key=dataClassification):
i = 0
for jtem in item:
i += 1
print key, i