对比pyspark处理数据与hive处理数据

过滤:

hive: select * from table where 'age'>1

spark: df=df.filter(df.age>1)

过滤空值:

select * from table where age is not null

df=df.filter(df.age.isNotNull())

选择某字段的最大值:

select max(age) max_age from table 

df=df.agg(F.max(df.age)).withColumnRenamed('max(age)', 'max_age')

计算数据量:

select count(1) count from table

df.agg(F.countDistinct(df.name).alias('count'))

df.count()/df.distinct().count()#返回int

选择某几个字段:

select c1,c2 from table /select * from table

df=df.select('c1','c2')/df=df.select('*')

去掉重复数据:

select c1,c2,max(c3) c3 ... from group by c1,c2

df=df.dropDuplicates(['name', 'height'])

join:

select a.c1,a.c2,a.c3,b.c4 from table1 a left join table2 b on a.c1=b.c1 and a.c2=b.c2 where b.c4 is not null

df=df1.join(df2,[df1.c1==df2.c1,df1.c2==df2.c2],'left').select(df1['*'],df2.c4).filter(df2.c4.isNotNull())

limit:

select * from table limit 10

df=df.limit(10)

排序:

select * from table order by age desc

df=df.sort(df.age.desc())/df=df.orderBy('age',ascending=0)

多字段排序

select * from table order by age desc,score asc

df=df.orderBy(['age','score'],ascending=[0,1])

拼接:

select * from a union all select * from b

df=df1.union(df2)   

分组聚合再排序:

select name,sum(socre) score from table group by name order by score

ddf1=ddf.groupBy('name').agg(F.sum('score')).withColumnRenamed('sum(score)','score') #先分组聚合

dff1=ddf1.orderBy('score',ascend=1) #再排序

ddf1=ddf.groupby('name').count().orderBy('count')#待验证,感觉和和 select name ,count(1) c from table group by name 结果有些不一样

in:

select * from table where name in ('a','ab')

df=df.filter(df.name.isin('a','ab'))

like:

select * from table where name like '%a%'

df=df.filter(df.name.like('%a%'))

if:

select if (score<10,0,100) score from table

df=df.select(F.when(df.score<10,0).otherwise(100)) 

coalesce:

select * ,coalesce(name1,name2,'xx') from table

df=df.select('*',F.coalesce(df.name1,df.name2,F.lit('xx')))

concat:

select concat_ws('-',name1,name2) name from table

df=df.select(F.concat_ws('-',df.name1,df.name2).alias('name'))

length:

select col, length(col) len from table

df=df.select(df.col,F.length(df.col).alias('length'))

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值