1 先上整体代码
import sys
sys.path.append('/data/gtair/tools/DWH')
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://edu_ws:QbvkrBHKXO@chb2gc-pd-edu-dws.mysql.rds.aliyuncs.com:3306/edu_platform')
df_spark =ss.sql("""
select
--行为项
pdate as statistic,
---教学行为-核心行为:备课
case when fngroup ='202002' then '编辑课程标准'
when fngroup in ('209001' , '209002' , '209003' , '209004', '209005' , '209006' , '209007' ,'209008') then '上传课程资源'
when fngroup in ('202003', '203003') then '新建/编排教案'
---教学行为-核心行为:课堂授课
when fncode in ('2070010007','2070010008','2070010009','2070010010') then '浏览前导知识点'
when fncode in ('2070010014','2070010015','2070010016','2070010017','2070010018','2070010019','2070010020','2070010021',
'2070010022','2070010023','2070010024','2070010025','2070010026','2070010027','2070010028') then '执行教案'
when fncode in ('2070010001','2070010002','2070010003','2070010004','2070010005','2070010006','2070010011','2070010012') then '组织课堂互动'
when fncode ='2070010013' then '使用工具软件'
---教学行为-核心行为:课下辅导
when fncode in ('2070010150','2070010151','2070010152','2070010153') then '发布课前预习任务'
when fncode in ('2070010250','2070010251','2070010252','2070010253') then '发布课后作业任务'
when fncode in ('2070010154','2070010254','2070010354') then '查看学习结果'
when fncode in ('2070010900','2070010901','2070010902','2070010903','2070010904','2070010905','2070010906','2070010907','2070010908','2070010909',
'2070030900','2070030901','2070030902','2070030903','2070030904','2070030905','2070030906','2070030907','2070030908','2070030909') then '批阅作业'
---教学行为-核心行为:考评行为
--'浏览班级成绩'
--'导出考评结果'
---管理行为-核心行为:教务管理
when fngroup ='208001' then '组建班级'
when fngroup ='208002' then '导入学生'
---管理行为核心行为:教学质量分析
--'浏览学情分析、查看课堂记录'
---非业务行为-核心行为:登录等
when fncode ='2010010000' then '登录等'
else '未知'
end as action_list,
--租户id,院校id,使用课程id,用户id,子行为集合id
get_json_object(query,'$.tenantId') as tenant_id,
get_json_object(query,'$.schoolId') as colg_idcd,
get_json_object(query,'$.courseId') as course_id,
get_json_object(query,'$.userId') as user_id,
get_json_object(query,'$.opSetId') as opset_id,
--求时间差(单位为秒‘1631557318’)
round((sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss')))[size(sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss'))))-1]-
sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss')))[0])/60,2) as duration_minutes ,
count(*) as pv
from pub.dm_pub_view_userlog_gux_func_add where query <>'' and pdate='20210914' and pcode ='81978' and get_json_object(query,'$.utype') ='teacher'
group by pdate,
---教学行为-核心行为:备课
case when fngroup ='202002' then '编辑课程标准'
when fngroup in ('209001' , '209002' , '209003' , '209004', '209005' , '209006' , '209007' ,'209008') then '上传课程资源'
when fngroup in ('202003', '203003') then '新建/编排教案'
---教学行为-核心行为:课堂授课
when fncode in ('2070010007','2070010008','2070010009','2070010010') then '浏览前导知识点'
when fncode in ('2070010014','2070010015','2070010016','2070010017','2070010018','2070010019','2070010020','2070010021',
'2070010022','2070010023','2070010024','2070010025','2070010026','2070010027','2070010028') then '执行教案'
when fncode in ('2070010001','2070010002','2070010003','2070010004','2070010005','2070010006','2070010011','2070010012') then '组织课堂互动'
when fncode ='2070010013' then '使用工具软件'
---教学行为-核心行为:课下辅导
when fncode in ('2070010150','2070010151','2070010152','2070010153') then '发布课前预习任务'
when fncode in ('2070010250','2070010251','2070010252','2070010253') then '发布课后作业任务'
when fncode in ('2070010154','2070010254','2070010354') then '查看学习结果'
when fncode in ('2070010900','2070010901','2070010902','2070010903','2070010904','2070010905','2070010906','2070010907','2070010908','2070010909',
'2070030900','2070030901','2070030902','2070030903','2070030904','2070030905','2070030906','2070030907','2070030908','2070030909') then '批阅作业'
---教学行为-核心行为:考评行为
--'浏览班级成绩'
--'导出考评结果'
---管理行为-核心行为:教务管理
when fngroup ='208001' then '组建班级'
when fngroup ='208002' then '导入学生'
---管理行为核心行为:教学质量分析
--'浏览学情分析、查看课堂记录'
---非业务行为-核心行为:登录等
when fncode ='2010010000' then '登录等'
else '未知'
end,
get_json_object(query,'$.tenantId'),
get_json_object(query,'$.schoolId'),
get_json_object(query,'$.courseId'),
get_json_object(query,'$.userId') ,
get_json_object(query,'$.opSetId')
--limit 220
""").toPandas()
pd.io.sql.to_sql(df_spark, 'ads_jxpt_teacher_actions', engine,if_exists='append',index=False)
print('done')
2 hql应用数组
2.1 本sql是解决多行数据中最大时间与最小时间的时间差是多少分钟。原数据样本如下
id type version user_id trigertime partition
1 Collector_WebApp 0.3.23 --502c636d3713 2021-09-12 10:10:20.531 20210912
2 Collector_WebApp 0.3.23 --502c636d3713 2021-09-12 10:10:22.531 20210912
3 Collector_WebApp 0.3.23 --502c636d3713 2021-09-12 10:10:23.531 20210912
4 Collector_WebApp 0.3.23 --502c636d3713 2021-09-12 10:10:24.531 20210912
5 Collector_WebApp 0.3.23 --502c636d3713 2021-09-12 10:10:30.531 20210912
2.2 sql思路:
a 转换trigertime字段类型为unix_timestamp,名为trigertime。
b collect_set(trigertime) 行转列,这里也可以用collect_list。此时多个trigertime 变为无序的数组,名为trigertime。
c sort_array(trigertime)使之从小到大有序排列。依然是数组属性。名为trigertime。
d 此时数组中 trigertime[size(trigertime-1]是最大;trigertime[0]是最小。相减即为时间差。/60 转变单位为分钟即可。
round((sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss')))[size(sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss'))))-1]-
sort_array(collect_set(unix_timestamp(trigertime,'yyyy-MM-dd HH:mm:ss')))[0])/60,2) as duration_minutes
3 df写入mysql
pd.io.sql.to_sql(df_spark, 'ads_jxpt_teacher_actions', engine,if_exists='append',index=False)
print('done')