1. 表设计
由于我这边数据量比较大对数据进行了按用户分了100个区
CREATE TABLE `l_live_room` (
`sso_id` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '用户ID',
`erp_id` varchar(64) NOT NULL DEFAULT '0',
`cid` varchar(64) NOT NULL DEFAULT '0',
`package_id` int(10) unsigned NOT NULL DEFAULT '0',
`unit_id` int(10) unsigned NOT NULL DEFAULT '0',
`course_id` int(10) unsigned DEFAULT '0',
`room_id` varchar(30) NOT NULL,
`start_time` int(10) unsigned NOT NULL,
`end_time` int(10) unsigned NOT NULL,
`live_type` smallint(5) unsigned NOT NULL DEFAULT '0',
`watch_time` double unsigned NOT NULL DEFAULT '0' COMMENT '直播观看时长',
`current_position` double unsigned NOT NULL DEFAULT '0' COMMENT '回放观看时长',
`total_length` double unsigned NOT NULL DEFAULT '0',
`live_id` int(10) unsigned NOT NULL DEFAULT '0',
`live_name` varchar(255) DEFAULT NULL,
`interaction_type` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '1:一对多 2:一对一 3 其他',
`status` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '1 观看直播, 2观看回放, 3观看了直播和回放',
`get_watch_time` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '同步watch_time的时间',
`action_time` varchar(19) NOT NULL COMMENT '用户最近一次操作时间',
`error` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '1 表示未获取到room_id, 2 表示未查询到',
`update_time` datetime NOT NULL DEFAULT '2000-01-01 00:00:00' COMMENT '最后一次写入阿里云日志的时间',
PRIMARY KEY (`sso_id`,`erp_id`,`cid`,`package_id`,`unit_id`) USING BTREE,
KEY `room_id` (`room_id`) USING BTREE,
KEY `status` (`status`,`live_type`,`error`,`get_watch_time`,`update_time`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC
PARTITION BY HASH (sso_id)
(PARTITION p0 ENGINE = InnoDB,
PARTITION p1 ENGINE = InnoDB,
# 省略部分按自己需要的分区数自行补全
PARTITION p98 ENGINE = InnoDB,
PARTITION p99 ENGINE = InnoDB);
2. 数据示例
3. 需求及数据说明
- 获取近20周的周报
- 周报结果包含直播观看时长live_watch_time, 回放观看时长playback_duration, 有效学习的课程(直播或回放观看超过50%)course_finish_total, 本周学习的课程总数course_total, 直播总数total, 观看直播的数量live_attendance, 直播有效观看的数量live_effective_attendance及直播总时长live_total_time
- 若观看直播则按直播开始时间start_time 进行分周, 若观看的是回放则按最后一次观看时间action_time进行分周
- 若当前周观看了其它周的直播, 不计在当前周的直播总数中, 但记在本周的学习课程数中
- 原始数据中若同一节课即观看了直播又观看了回放数据存放在一条记录中, 如何区分请查看status字段的说明(这样设计是为了便于统计学习的课程数, 因为在同一周中若即看了直播又看了回放只会算作学习了一节课)
4. sql实现
SELECT
start_time, SUBDATE(start_time, -6) AS end_time,
SUM(LEAST(watch_time, total_time)) AS live_watch_time,
SUM(current_position) AS playback_duration,
SUM(IF(status != 0 AND (watch_time / total_time > 0.5 OR current_position / total_length > 0.5), 1, 0)) AS course_finish_total,
SUM(IF(status != 0, 1, 0)) AS course_total,
SUM(IF(status != 2, 1, 0)) AS total,
SUM(IF(status IN (1, 3), 1, 0)) AS live_attendance,
SUM(IF(watch_time / total_time > 0.5, 1, 0)) AS live_effective_attendance,
SUM(total_time) AS live_total_time
FROM(
SELECT
IF(status IN (0, 1) OR type=2, start_time, start_time2) AS start_time,
IF(status=2 OR (status=3 AND type=1 AND start_time != start_time2), 0, watch_time) AS watch_time,
IF(status=2 OR (status=3 AND type=1 AND start_time != start_time2), 0, total_time) AS total_time,
IF(status=1 OR type=2, 0, current_position) AS current_position,
IF(status=1 OR type=2, 0, total_length) AS total_length,
IF(status=2 AND type=2, 0, status) AS status,
type
FROM(
SELECT
LEFT(SUBDATE(FROM_UNIXTIME(start_time),IF(FROM_UNIXTIME(start_time,'%w')=0, 7, FROM_UNIXTIME(start_time,'%w'))-1), 10) AS start_time,
LEFT(SUBDATE(action_time,IF(DATE_FORMAT(action_time,'%w')=0, 7, DATE_FORMAT(action_time,'%w'))-1), 10) AS start_time2,
watch_time, end_time - start_time AS total_time, current_position, total_length, status, type
FROM
`l_live_room`
LEFT JOIN
(SELECT 1 AS type UNION SELECT 2 AS type) AS a
ON (status IN (0, 1) AND type = 1) OR status IN (2, 3)
WHERE
sso_id = 123123
AND error != 1
AND GREATEST(FROM_UNIXTIME(start_time), action_time) >= SUBDATE(CURDATE(),IF(DATE_FORMAT(CURDATE(),'%w')=0, 7, DATE_FORMAT(CURDATE(),'%w'))-1+133)
) AS t
WHERE NOT (start_time = start_time2 AND type = 2)
) AS t
WHERE start_time >= SUBDATE(CURDATE(),IF(DATE_FORMAT(CURDATE(),'%w')=0, 7, DATE_FORMAT(CURDATE(),'%w'))-1+133)
AND start_time <= NOW()
GROUP BY start_time;
5. sql拆分解析
5.1 原始数据处理(子查询1)
SELECT
# 将直播开始时间转换为其所在周周一的日期
LEFT(SUBDATE(FROM_UNIXTIME(start_time),IF(FROM_UNIXTIME(start_time,'%w')=0, 7, FROM_UNIXTIME(start_time,'%w'))-1), 10) AS start_time,
# 将最后一次观看时间转换为其所在周周一的日期
LEFT(SUBDATE(action_time,IF(DATE_FORMAT(action_time,'%w')=0, 7, DATE_FORMAT(action_time,'%w'))-1), 10) AS start_time2,
# total_time 直播总时长
watch_time, end_time - start_time AS total_time, current_position, total_length, status, type
FROM
`l_live_room`
LEFT JOIN
# 由于直播和回放可能在不同周的统计中, 所以把所有数据复制一份
(SELECT 1 AS type UNION SELECT 2 AS type) AS a
# 对于未观看的课程和只观看了直播的课程只保留一条数据,
ON (status IN (0, 1) AND type = 1) OR status IN (2, 3)
WHERE
sso_id = 123123
AND error != 1
# 只获取最近20周的数据
AND GREATEST(FROM_UNIXTIME(start_time), action_time) >= SUBDATE(CURDATE(),IF(DATE_FORMAT(CURDATE(),'%w')=0, 7, DATE_FORMAT(CURDATE(),'%w'))-1+133)
5.2 过滤多余重复数据(子查询2)
SELECT
# 若未学习或只观看了直播或(直播和回放不在同一周观看的其中一条)保留直播时间, 否则保留观看回放的时间
IF(status IN (0, 1) OR type=2, start_time, start_time2) AS start_time,
# 若直播开始时间不在当前周且在当前周观看了回放, 则将本条记录直播观看时间置为0
IF(status=2 OR (status=3 AND type=1 AND start_time != start_time2), 0, watch_time) AS watch_time,
# 若直播开始时间不在当前周且在当前周观看了回放, 则将本条记录直播总时长置为0, 避免影响计算本周直播总时长的计算
IF(status=2 OR (status=3 AND type=1 AND start_time != start_time2), 0, total_time) AS total_time,
# 若当前周只观看了直播则将回放信息置为0
IF(status=1 OR type=2, 0, current_position) AS current_position,
IF(status=1 OR type=2, 0, total_length) AS total_length,
# 若只观看了回放且与直播时间不在同一周将其中一条记录(保留直播时间的那一条)status置为0, 后续会将此条记录统计到直播所在周的直播总数中, 且不会被统计到已学习课程中
IF(status=2 AND type=2, 0, status) AS status,
type
FROM( 子查询1 ) AS t
# 若观看直播和回放在同一周则只保留一条数据, 不去重会导致学习课程数统计错误
WHERE NOT (start_time = start_time2 AND type = 2)
5.3 生成报告
SELECT
start_time, SUBDATE(start_time, -6) AS end_time, # 周报周期范围
SUM(LEAST(watch_time, total_time)) AS live_watch_time, # 直播观看总时长
SUM(current_position) AS playback_duration, # 回放观看总时长
SUM(IF(status != 0 AND (watch_time / total_time > 0.5 OR current_position / total_length > 0.5), 1, 0)) AS course_finish_total, # 有效学习的课程数
SUM(IF(status != 0, 1, 0)) AS course_total, # 学习的课程数
SUM(IF(status != 2, 1, 0)) AS total, # 直播总数
SUM(IF(status IN (1, 3), 1, 0)) AS live_attendance, # 观看直播的总数
SUM(IF(watch_time / total_time > 0.5, 1, 0)) AS live_effective_attendance, # 直播有效观看数
SUM(total_time) AS live_total_time # 直播总时长
FROM( 子查询2 ) AS t
# 限制最近20周的数据, 虽然子查询中有类似的限制, 但因为子查询是直播开始时间或回放观看时间, 所以实际数据可能超出20周的范围
WHERE start_time >= SUBDATE(CURDATE(),IF(DATE_FORMAT(CURDATE(),'%w')=0, 7, DATE_FORMAT(CURDATE(),'%w'))-1+133)
# 此条件限制未来时间还未观看的直播被统计出来
AND start_time <= NOW()
GROUP BY start_time