单独使用groupby
select
max(cate_id) as cate_id,count(cate_id) as num
from
表库.表名
where
partition_dt='20180817'
group by cate_id
order by num desc
LIMIT 1000
select后的内容只能是groupby的内容以及该内容的统计量。
SELECT * FROM
(SELECT
call_id,pl_id,role,sentence,sentence_index,version,result ,
ROW_NUMBER() OVER(PARTITION by call_id,opp_id,sentence_index) as r
from
表库.表名
where
partition_dt="20201025" and check_is=1 ) t
where r<101
如果想要对某些字段进行联合排序,且select后面出现更多的内容则要用row_number开窗。
注意r<101和limit含义不同
partition_dt自动找到昨天的数据
#以下是dp自己实现的格式
partition_dt = ${(#date(0,0,-1):yyyyMMdd#)}
#以下是通用格式
#取得今天的时间
SELECT from_unixtime(unix_timestamp(),"yyyy-MM-dd")
#取得前一天的时间
SELECT data_sub(from_unixtime(unix_timestamp(),"yyyy-MM-dd"),1)
文本拼接
把相同用户的不同列内容进行合并。collect_list
select username, collect_list(video_name) from t_visit_video group by username ;
而后对用户名字排序sort_array
华仔给我写的至尊sql
SELECT call_id,opp_id,pl_id,version,
regexp_replace(concat_ws('#',sort_array(collect_list(concat(rnum,'&&',sentence)))),'\\d{3}&&','') as sentence,
concat_ws(',',collect_set(if(result REGEXP '(目标商机|确认联系人|确认负责人|确认行业|确认公司名称/个人|确认城市)',result,''))) as result
FROM
(
SELECT call_id,opp_id,pl_id,version,sentence,result,
if(rnum<10,concat('00',rnum),if(rnum<100,concat('0',rnum),cast(rnum as string))) as rnum
FROM
(
SELECT call_id,opp_id,pl_id,version,sentence,result,
row_number() over (partition by call_id order by sentence_index asc ) as rnum
FROM aaa.bbb
where partition_dt="20201025" and check_is=1
and call_id in (
'00c666e3-cef0-4e80-887c-a158c27c664b',
'00e320c5-657f-41f5-b3f8-964e00a1778d'
)
) t1
) t2
group by call_id,opp_id,pl_id,version
df实现
df.groupby(['call_id',v])['sentence'].apply(lambda x:x.str.cat(sep='')).reset_index()
正则用法
if表达式
全量表和增量表
全量表是截止到当天之前的所有的数据,属于pdf的感觉,因为只有一个分区,所以查数据速度很快。
增量表是每天增量追加,只有当天的新的数据。
判断方法:
查看某个数据的日期,如果在该天只有一条,就是增量,有n条是全量。看数据量不好判断,因为今天不上传,明天多加了产品线,弄不清。
sql
SELECT count(request_id)
FROM `db58_ailab_vqc`.`check_result_202103`
where biz_model = 29
and ai_label_num > 0
and ai_label_list like "%|2612|%"
and abtest_no like '%"quality_label_algo":"1"%'
and partition_dt = "20210303";
SELECT count(request_id)
FROM `db58_ailab_vqc`.`check_result_202103`
where biz_model = 29
and ai_label_list REGEXP '(2602|2603|2604|2605|2606|2607|2608|2609|2610|2611|2612)'
and abtest_no like '%"quality_label_algo":"1"%'
and partition_dt = "20210303";
SELECT count(request_id)
FROM `db58_ailab_vqc`.`check_result_202103`
where biz_model = 29
and abtest_no like '%"quality_label_algo":"2"%'
and partition_dt = "20210303";
json用法
SELECT
ai_label_num,
ai_label_list,
JSON_UNQUOTE(JSON_EXTRACT(t.abtest_no,'$.quality_label_algo')) as value
FROM
`db58_ailab_vqc`.`check_result_202103` t
where
biz_model = 29
and ai_label_list REGEXP '2603'
and ai_label_num>0
and value = 1
and partition_dt = "20210303";