1、从分表之一预计算
room_id 每个房间最高在线人数据除以天数据的均值
10001185 107
10012429 503.2
10023373 334.5
10027296 2762.8333
10028457 191.8333
1003009 45
10030228 7839.2857
10033081 406.25
10033270 1451.5714
1004148 196
10088630 2175.4
10098915 913
最终的查询结果数据一致
select A.room_id,SUBSTRING(A.created_time,1,10) days,max(A.online_count)
from room_online_stat_15 A
where SUBSTRING(A.created_time,1,10) between '2016-11-15' and '2016-11-21' and category_id=90
and room_id in
(21983 ,
12232623 ,
6201660 ,
3915297 ,
20249052)
GROUP BY A.room_Id,SUBSTRING(A.created_time,1,10);
2、将分表数据汇总一起后的查询语句
进入总表(room_online_stat_1521)之前,数据根据具体需要进行了过滤等处理,确保room_online_stat_1521表里的数据,都是所需要的,以致于数据量不会太大,保证计算时的效率。
-- create table room_online_stat_1521(room_id VARCHAR(100),created_time VARCHAR(100),online_count BIGINT);
select room_id,sum(ONLINE_count_max) /count(*) from
(
select room_id,CREATEd_time,max(ONLINE_count) ONLINE_count_max,count(*) days
from room_online_stat_1521
group by room_id,CREATEd_time order by room_id,CREATEd_time) x
group by room_id
3、使用Python将数据取到内存中后进行计算
/Users/nisj/PycharmProjects/EsDataProc/xx-demand.py
# -*- coding=utf-8 -*-
import MySQLdb
import warnings
import datetime
from itertools import groupby
from operator import itemgetter
warnings.filterwarnings("ignore")
db_config = {
'host': 'MysqlIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 3306,
'db': 'jellyfish_stat'
}
def getDB():
try:
conn = MySQLdb.connect(host=db_config['host'], user=db_config['user'], passwd=db_config['passwd'],
port=db_config['port'])
conn.autocommit(True)
curr = conn.cursor()
curr.execute("SET NAMES utf8");
curr.execute("USE %s" % db_config['db']);
return conn, curr
except MySQLdb.Error, e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])
return None, None
conn, curr = getDB()
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
sql_text = "select table_name from information_schema.tables where table_schema='jellyfish_stat' and table_name like 'room_online_stat_%';"
curr.execute(sql_text)
tab_list = curr.fetchall()
col_datas_union = []
for tab_name in tab_list:
tab_name = tab_name[0]
sql_text = "select room_id,SUBSTRING(created_time,1,10) days,online_count from jellyfish_stat.%s where SUBSTRING(created_time,1,10) between '2016-11-15' and '2016-11-21' and category_id=90 and room_id in (21983,12232623,6201660,20249052);" % (tab_name)
curr.execute(sql_text)
col_datas = curr.fetchall()
col_datas_union.extend(col_datas)
group_col_datas = groupby(sorted(col_datas_union, key=itemgetter(0,1)), itemgetter(0,1))
result_list = []
rl = []
for key, item in group_col_datas:
i = 0
j = 0
online_count_max = []
for jtem in item:
i += float(jtem[2])
j += 1
online_count_max.append(jtem[2])
# print key, max(online_count_max)
result_list = (key, max(online_count_max))
result_list = (result_list[0][0],result_list[0][1],result_list[1])
rl.append(result_list)
# rl.sort(key=lambda x:x[1],reverse=True)
# print rl
rl_group = groupby(sorted(rl, key=itemgetter(0)), itemgetter(0))
rl_send = []
for key,item in rl_group:
i = 0
j = 0
for jtem in item:
i += float(jtem[2])
j += 1
print key,i/j,i,j
curr.close()
conn.close()
Python脚本中,主要解决了SQL中的分表数据汇总、取最大值、Sum求和、Count以及子计算等功能。
room_id 每个房间最高在线人数据除以天数据的均值
10001185 107
10012429 503.2
10023373 334.5
10027296 2762.8333
10028457 191.8333
1003009 45
10030228 7839.2857
10033081 406.25
10033270 1451.5714
1004148 196
10088630 2175.4
10098915 913
最终的查询结果数据一致