When sorting an extremely large dataset (billions of rows) with Hive or Spark, calling row_number() directly over the whole table causes severe data skew: the entire sort lands on a single reduce task, and the job may take far too long to finish or never finish at all. If a global sequence number is really required, the following SQL is one way to spread the work out:
-- Subquery a: bucket rows by ABS(hash(uuid)) % 10000 and compute a local
--   row_number within each bucket, so the sort runs on many reduce tasks
--   instead of one.
-- Subquery b: count the rows in each bucket, then take a running sum ordered
--   by bucket id; for bucket p this gives the total number of rows in
--   buckets 0..p.
-- Final join: bucket p of a joins bucket p-1 of b, so each local row_number
--   is shifted by the number of rows in all earlier buckets. NVL covers
--   bucket 0, which has no preceding bucket.
SELECT
    a.uuid,
    a.rank + NVL(b.max_num, 0) AS seq
FROM
(
    SELECT
        uuid,
        ABS(hash(uuid)) % 10000 AS part,
        row_number() OVER (PARTITION BY ABS(hash(uuid)) % 10000 ORDER BY uuid) AS rank
    FROM {tab_seq}
    WHERE dt = '2022-10-06'
) a
LEFT JOIN
(
    SELECT
        part,
        SUM(max_num) OVER (ORDER BY part) AS max_num
    FROM
    (
        SELECT
            ABS(hash(uuid)) % 10000 AS part,
            COUNT(1) AS max_num
        FROM {tab_seq}
        WHERE dt = '2022-10-06'
        GROUP BY ABS(hash(uuid)) % 10000
    ) tmp2
) b
ON (a.part - 1) = b.part
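
Note that the resulting order is by (part, uuid), i.e. by hash bucket first and then by uuid within each bucket, not a global sort on uuid itself; what you get is a unique, dense sequence from 1 to the total row count. As a minimal sanity-check sketch, assuming the query above has been materialized as a table or view named seq_result (a name made up here for illustration), a query like the following should show distinct_seq equal to total_rows, min_seq = 1, and max_seq = total_rows:

-- Sanity check on the generated sequence (seq_result is a hypothetical name
-- for the materialized output of the query above).
SELECT
    COUNT(1)            AS total_rows,    -- total number of rows
    COUNT(DISTINCT seq) AS distinct_seq,  -- should equal total_rows (seq is unique)
    MIN(seq)            AS min_seq,       -- should be 1
    MAX(seq)            AS max_seq        -- should equal total_rows (seq is dense)
FROM seq_result;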