1 Accumulation
1. String accumulation (concatenation)
-- Concatenate provinces under the country dimension
select
country,
array_join( collect_list( province ), ' ') as provinces
from
table_name
where
dt='20210329'
group by
country
;
2 Row/column transposition
1. Merging column-stored metrics into one row (rows to columns)
-- Merge audience labels into a single row (labels stored one per row)
with tmp as (
select
id,
max( if( label_name='age', label_value, null) ) as age, -- first label type
max( if( label_name='sex', label_value, null) ) as sex -- second label type
from table_name
where dt='20210329' and label_name in ('age', 'sex')
group by id
)
select count(1) from tmp where age='18' and sex='M';
2. Rows to columns with PIVOT
-- demo01
with tmp as (
select
*
from
values
('数学','张三',88), ('语文','张三',92), ('英语','张三',77),
('数学','王五',65), ('语文','王五',87), ('英语','王五',90),
('数学','李雷',67), ('语文','李雷',33), ('英语','李雷',24),
('数学','宫九',77), ('语文','宫九',87), ('英语','宫九',90)
as tt(subject,name,score)
)
select
*
from
tmp
pivot (
sum(score) for name in ('张三','王五','李雷','宫九')
)
;
-- demo02
with tmp as (
select
*
from
values
(100, 'John', 30, 1, 'Street_1'),
(200, 'Mary', NULL, 1, 'Street_2'),
(300, 'Mike', 80, 3, 'Street_3'),
(400, 'Dan', 50, 4, 'Street_4')
as tt(id, name, age, class, address)
)
select
*
from
tmp
pivot (
sum(age) as age, avg(class) as class
for name in ('John' as john, 'Mike' as mike)
)
;


3 Data filling
1. Take the vid of the latest record and fill it onto every record with the same id
first_value(vid) over( partition by id order by ts desc ) as vid
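A minimal self-contained sketch of the expression above (id/ts/vid and the inline rows are made-up placeholders):
select
id,
ts,
first_value(vid) over( partition by id order by ts desc ) as vid_filled
from
values
('u1', 2, 'v9'),
('u1', 1, null),
('u2', 1, 'v3')
as t(id, ts, vid)
;
-- both rows of u1 get vid_filled = 'v9' (the vid of the latest record); if the latest vid may itself be null, first_value(vid, true) skips nulls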
4 Setting variables
1. Setting variables inside SQL
-- Plain constant
spark-sql> set aa=ssjt;
aa ssjt
spark-sql> select '${aa}' as id;
ssjt
-- Function result
spark-sql> set yestoday=date_format(date_sub(current_date(), 1), 'yyyyMMdd');
yestoday date_format(date_sub(current_date(), 1), 'yyyyMMdd')
spark-sql> select ${yestoday};
20210913
-- Assigning a SQL result to a variable
-- Note: the SQL must be wrapped in parentheses, otherwise it is treated as plain text substitution
-- If the result of a complex SQL is used as a variable several times, it is better to persist it into a table first
spark-sql> set cnt = (select 1);
spark-sql> select ${cnt};
1
2. Passing variables with spark-sql -f
-- Contents of sql.txt
select
*
from
${tab_name}
where
dt = '${date}'
;
-- Invocation
spark-sql -d tab_name='ssjt.shy' -d date='20230310' -f sql.txt
5 Generating IDs
1. Generating globally unique, incrementing IDs
// zipWithIndex()
Orders by partition index first, then by the position of each item within its partition: the first item of the first partition gets index 0, and the second partition continues from where the first one left off. Indexes start at 0 and are globally consecutive. Triggers a Spark job (the partition sizes have to be counted first).
// zipWithUniqueId()
The unique ID of the first element in each partition is the partition index,
and the unique ID of the N-th element is (ID of the previous element) + (total number of partitions of the RDD).
Note: because partitions hold different numbers of elements, the IDs are unique but not consecutive; e.g. with 2 partitions [a,b] and [c,d,e] the IDs are a→0, b→2 and c→1, d→3, e→5. No Spark job is triggered.
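// A pure-SQL alternative (not in the original notes)
Spark SQL also exposes monotonically_increasing_id(), which, like zipWithUniqueId, yields unique but non-consecutive 64-bit IDs; a globally consecutive ID can be produced with row_number(), at the cost of pulling all rows through a single partition. A sketch against the tmp(odid) view built in the DataFrame example below:
-- unique, but not consecutive
select odid, monotonically_increasing_id() as id from tmp;
-- consecutive, but forces a single-partition sort
select odid, row_number() over( order by odid ) as id from tmp;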
// Initializing IDs via a DataFrame
def main(args: Array[String]): Unit = {
import org.apache.spark.sql.SparkSession
// The SparkSession was implicit in the original notes; created here to make the snippet self-contained
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
// Build the data source
val arr = Array(1, 3, 8, 9, 11)
spark.sparkContext.makeRDD(arr, 2).toDF("odid").createOrReplaceTempView("tmp")
val res = spark.sql("select odid from tmp")
// Add consecutive unique IDs
val res2 = res.rdd.zipWithIndex().map(x => (x._1.getInt(0), x._2))
// Output the result
res2.toDF("odid", "id").createOrReplaceTempView("tmp2")
spark.sql("select * from tmp2").show()
}
// Adding IDs incrementally in SQL
set dt_2=20230319;
set dt_1=20230318;
set hive.exec.dynamic.partition.mode=nonstrict;
with active_tab as ( -- data for the current day
select
distinct id_ori
from
tbl_a
where
dt = '${dt_1}'
),
his_tab as ( -- historical data
select
id_ori
,id
from
tbl_b
where
dt = '${dt_2}'
),
init_tab as ( -- newly added id mappings
select
aa.id_ori
,row_number() over(order by 1) + (select max(id) from his_tab) as id
from
active_tab aa
left anti join
his_tab bb
on aa.id_ori = bb.id_ori
)
insert overwrite table tbl_b partition(dt)
select
id_ori
,id
,'${dt_1}' as dt
from
(
select
id_ori
,id
from
init_tab
union all
select
id_ori
,id
from
his_tab
)
;
Reference: spark使用zipWithIndex和zipWithUniqueId为rdd中每条数据添加索引数据 - 码农教程
6 url
1. urldecode
1) Keywords
hive urldecode, spark urldecode
2) Solution
-- Regular case
select reflect('java.net.URLDecoder', 'decode', col_name, 'UTF-8') as url;
-- Handle the 'Incomplete trailing escape (%) pattern' error
select reflect('java.net.URLDecoder', 'decode', regexp_replace(trim(col_name), '%(?![0-9a-fA-F]{2})','%25'), 'UTF-8') as url;
3) Reference links
Hive对字段进行urlDecode - Boblim - 博客园
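A self-contained sketch of the decode-with-cleanup pattern above, run against made-up inline strings:
select
s,
reflect('java.net.URLDecoder', 'decode', regexp_replace(trim(s), '%(?![0-9a-fA-F]{2})', '%25'), 'UTF-8') as decoded
from
values ('a%20b%3Dc'), ('50%25off'), ('broken%2') as t(s)
;
-- 'a%20b%3Dc' -> 'a b=c', '50%25off' -> '50%off', and 'broken%2' no longer throws the trailing-escape error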
7 Using lateral view
1. Chained lateral views
spark-sql> select
> *
> from
> (select 'a:1,b:2' as col) aa
> lateral view explode(split(col,',')) as es
> lateral view explode(split(es,':')) as e
> ;
a:1,b:2 a:1 a
a:1,b:2 a:1 1
a:1,b:2 b:2 b
a:1,b:2 b:2 2
8 Grouped aggregation
1. Reference statement
-- Single-dimension aggregation via grouping sets
select
brand
,model
,count(1)
,grouping_id(brand,model)
from
table_name
where
dt = '20220414'
group by
brand
,model
grouping sets(brand,model)
;
2. Reference links
SparkSQL 中group by、grouping sets、rollup和cube方法详解
grouping__id grouping sets在hive和spark中执行的区别
Spark SQL 的 GROUP BY 子句 | Spark SQL 教程 - 盖若
9 Pivoting data
The PIVOT clause in Spark SQL is used for data pivoting: aggregate values are computed for specific column values and turned into multiple columns of the SELECT clause (see the two PIVOT demos in section 2 above).
Spark SQL 数据透视子句 PIVOT | Spark SQL 教程 - 盖若
Spark_Sql pivot函数 - 超级无敌小剑 - 博客园
sparksql 2.4 PIVOT 使用_章锡平的博客-CSDN博客
10 Sampling data
Note: TABLESAMPLE samples the table first and filters afterwards, which scans the whole table; it is better to restrict partitions and filters into a temporary result first, then sample that temporary table.
-- Sample by percentage
WITH tmp AS (
SELECT
*
FROM
tab_name
WHERE
dt = '20220423'
)
SELECT
*
FROM
tmp TABLESAMPLE (50 PERCENT)
;
-- Sample by row count
WITH tmp AS (
SELECT
*
FROM
tab_name
WHERE
dt = '20220423'
)
SELECT
*
FROM
tmp TABLESAMPLE (5 ROWS)
;
-- Sample by bucket
WITH tmp AS (
SELECT
*
FROM
tab_name
WHERE
dt = '20220423'
)
SELECT
*
FROM
tmp TABLESAMPLE (BUCKET 4 OUT OF 10)
;
11 Selecting columns
regex_column_names
-- Enable regex column names
set spark.sql.parser.quotedRegexColumnNames=true;
-- Read all columns except dt and dh
select
`(dt|dh)?+.+`
from
ssjt.shy
where
dt = '20220420'
limit 1
;
12 Analytic functions
13 Deduplicated counting
1 Deduplication methods
group by, distinct, union, window functions
2 Deduplicating on a subset of fields
row_number() over ( partition by <dedup fields> order by <sort fields> ) as rnk, then keep the rows where rnk = 1 (see the sketch below)
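A minimal runnable sketch with made-up inline data (id is the dedup key, ts the ordering column):
with tmp as (
select
*,
row_number() over( partition by id order by ts desc ) as rnk
from
values ('1', 'a', 10), ('1', 'b', 20), ('2', 'c', 5) as t(id, name, ts)
)
select id, name, ts from tmp where rnk = 1
;
-- keeps one row per id: ('1', 'b', 20) and ('2', 'c', 5)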
3 count(distinct id, name, age) can undercount when fields contain nulls
-- Accurate count, similar to counting rows
select
count(1)
from
(
select
distinct *
from
tbl_name
where
dt='20230626'
and event = 'play'
) aa
;
-- Inaccurate count: like counting the non-null values of a column, a row is skipped whenever any of the listed fields is null
select
count( distinct id, name, age )
from
tbl_name
where
dt='20230626'
and event = 'play'
;
14 Aggregating rows into a map
with init_data as (
select '11' as id, 'app_1' as pkg, 1 as is_init
union all
select '11' as id, 'app_2' as pkg, 0 as is_init
union all
select '22' as id, 'app_3' as pkg, 0 as is_init
union all
select '22' as id, 'app_4' as pkg, 0 as is_init
)
select
id,
map_from_entries( collect_list(struct(pkg, is_init)) ) as info
from
init_data
group by
id
;
15 Processing 0/1 track strings
-- Bitwise OR of 0/1 track strings (the track length must not exceed 63 bits, since each track is converted to a bigint)
select
conv( bit_or( cast( conv(aa,2,10) as bigint) ), 10, 2)
from
values
('1001')
,('0110')
,('00000')
as t(aa)
;
16 Data read paths
> select id,input_file_name() as name from tbl_name where dt='20240507';
id name
87 oss://aa/bb/dt=20240507/part-00000-8aabc1b1-f5e0-4a4b-b209-6a00b0c20856.c000.snappy.parquet
88 oss://aa/bb/dt=20240507/part-00001-8aabc1b1-f5e0-4a4b-b209-6a00b0c20856.c000.snappy.parquet
17 Suppressing the _SUCCESS file
set mapreduce.fileoutputcommitter.marksuccessfuljobs=false;
18 Permutations and combinations
1. Two-element combinations of an array
with tmp as ( -- build the array
select sequence(0,2) as arr
),
tmp2 as ( -- nested transform loops to enumerate pairs
select
transform( arr, x -> transform( arr, y -> if(x < y, array(x, y), null)) ) as all_combinations
from
tmp
)
-- flatten and filter out the nulls
select
filter( flatten( all_combinations ), x -> x is not null ) as combination
from
tmp2
;
19 Leading run of identical elements in two arrays
-- zip_with handles arrays of different lengths, so no alignment is needed first
with tmp as ( -- build the source data and mark whether the elements match
select zip_with(array('a', 'b', 'c'), array('a', 'b', 'd'), (x, y) -> if( x = y, x, '#') ) as arr
)
select -- take the leading run of identical elements
slice(arr, 1, array_position(arr,'#')-1 ) as res
from
tmp
;
res
["a","b"]
20 Miscellaneous
1. Show column headers in the CLI
spark.sql.cli.print.header=true
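The property can be enabled per session with SET or passed when launching the CLI; a small sketch (adjust to your launcher):
-- inside an existing spark-sql session
set spark.sql.cli.print.header=true;
-- or when starting the CLI
spark-sql --conf spark.sql.cli.print.header=true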
2. Vertical (one column per line) display
dataframe.show(numRows, truncate, vertical)
3. File metadata
-- SQL approach
-- Read a parquet file and register it as a temporary view
create temporary view tmp
using parquet
options (path "save_path")
;
-- Show the view's schema
desc tmp;
-- Non-SQL approach
// Read the parquet file
val df = spark.read.parquet("save_path")
// Print the schema (metadata)
df.printSchema
4. All app_ids within a given time window
1. Listing Spark event logs
import sys
import pandas as pd
import subprocess
# Time window to select
start_time = '2024-10-16T00:00'
end_time = '2024-10-16T00:09'
# List the event log files
command = " hadoop fs -ls hdfs://hdfs-cluster/spark-history/ | grep spark-history"
result = subprocess.run(command, capture_output=True, text=True, shell=True)
# Parse the listing
list_var = []
if result.returncode == 0:
    for line in result.stdout.strip().split('\n'):
        parts = line.split()
        element = {
            "owner": parts[2],
            "time": parts[5] + 'T' + parts[6],
            "app_id": parts[7].split('/')[4],
        }
        list_var.append(element)
else:
    # Print stderr
    print("Command failed:")
    print(result.stderr)
    sys.exit(1)
# Filter by the time window
df = pd.DataFrame(list_var)
pd.set_option('display.max_rows', None)
filtered_df = df[(df['time'] >= start_time) & (df['time'] < end_time)]
print(filtered_df[['time', 'app_id']].to_string(index=False))
2. Via the YARN REST API
