sql基础统计,及pandas实现

目录

1 查询前N行

2 查询特定列

3 查询某列的去重数据

4 根据条件筛选数据

5 分组计数

6 按某列分组后,对其它列进行统计

7 横向连接表

8 纵向连接表

9 分组后排序

10 按条件分组

11 按条件更新某个值

12 按条件删除行

13 删除列

14 提取/匹配字符串

15 替换字符串中字符

16 根据某列,对另外一列进行计数


1 查询前N行

# python
order_data.head(10)

# mysql
select * from t_order limit 10

2 查询特定列

# python
# 1
order_data[['orderid', 'amount']]
# 2
order_data.loc[:,['orderid', 'amount']]
# 3
order_data.iloc[:,[3,4]]


# mysql
select orderid,amount from t_order

3 查询某列的去重数据

# python
# 查询所有不重复的值
order_data['uid'].unique()
# 查询所有不重复值的数量
order_data['uid'].nunique()

# mysql
# 查询所有不重复的值
select distinct uid from t_order
# 查询所有不重复值的数量
select count(distinct uid) from t_order

4 根据条件筛选数据

# python
# 且用 &, 或用 |
order_data[(order_data['uid'] == 10003) & (order_data['amount'] > 50)]


# mysql
# 且用 and, 或用 or
select * from t_order
where uid = 10003
and amount > 50

5 分组计数

# python
order_data.groupby('uid').size()

# mysql
select uid,count(*) from t_order
group by uid

6 按某列分组后,对其它列进行统计

# 按uid分组后,分别对amount和orderid列进行统计

# python
# agg支持的fun名字:count、sum、mean、median、min、max、prod(求积)、std(标准差)、var(方差)、
# unique(罗列不重复的项)、first、last(最后一个非NA值)
# 复杂的可以使用apply函数
order_data.groupby('uid').agg({'amount':['mean','sum'],'orderid':['count']})

# mysql
# 取两位小数round(sum(amount),2)
select uid, sum(amount) as amount_mean, count(orderid) as orderid_count 
from t_order
group by uid

7 横向连接表

# python
# how的参数有left/right/outer/inner
pd.merge(order_data, user_data, on='uid', how='left')

# mysql
# left join/right join/outer join/inner join
select *
from t_order
left join t_user
on t_order.uid = t_user.uid

8 纵向连接表

# python
# 不去重
pd.concat([order_data,order2_data])
# 去重
pd.concat([order_data,order2_data]).drop_duplicates()

# mysql
# 不去重
select * from t_order
union all
select * from t_order2
# 去重
select * from t_order
union
select * from t_order2

9 分组后排序

# python
# 降序
order_data.groupby('uid').size().sort_values(ascending=False)
# 多列排序
order_df= order_data.groupby('uid').agg({'orderid':'count','amount':'sum'})
order_df.rename(columns={'orderid':'orderid_count','amount':'amount_sum'},inplace=True)
order_df.sort_values(by=['orderid_count', 'amount_sum'], ascending=[False, True])

# mysql
# 降序
select uid, count(*) as uid_count
from t_order
group by uid
order by uid_count desc
# 多列排序
select uid, count(orderid) as orderid_count, sum(amount) as amount_sum
from t_order
group by uid
order by orderid_count desc, amount_sum

10 按条件分组

# 按uid字段对总金额求和后,再把总金额分为[0-300),[300,600),[600,900)三组
# python
def func(x):
    if x < 300:
        return '[0,300)'
    elif x < 600:
        return '[300,600)'
    elif x < 900:
        return '[600,900)'
    else:
        return 'other'

order_df= order_data.groupby('uid').agg({'orderid':'count','amount':'sum'})
order_df.rename(columns={'orderid':'orderid_count','amount':'amount_sum'},inplace=True)
order_df.sort_values(by=['orderid_count', 'amount_sum'], ascending=[False, True])
order_df['amount_inerval'] = order_df['amount_sum'].map(func)

# mysql
select uid, orderid_count, 
case when amount_sum < 300 then '[0,300)'
		 when amount_sum >= 300 and amount_sum < 600 then '[300,600)'
		 when amount_sum >= 600 and amount_sum < 900 then '[600,900)'
else 'other' end as amount_inerval
from(
select uid, count(distinct orderid) as orderid_count, sum(amount) as amount_sum
from t_order
group by uid
) A

11 按条件更新某个值

# 20岁及以下的年龄替换成20岁
# python
user_data.loc[user_data['age']<=20,'age'] = 20

# mysql
# 先选中表
select * from t_user;
update t_user set age = 20 where age <=20
from t_user

12 按条件删除行

# 删除20岁及以下的
# python
user_data = user_data[user_data['age'] > 20]

# mysql
delete from t_user where age <= 20

13 删除列

# python
user_data.drop(['uid'], inplace=True, axis=1)

# mysql
# alter table 删除列drop,添加列add
alter table t_user drop column uid

14 提取/匹配字符串

# Python
# 提取前8位
order['orderid'].astype(str).str[:8]
# 模糊匹配后提取,返回的是正则表达式中() 对应的结果
order['ts'].astype(str).str.extract('(\d{4}-\d{2}-\d{2}).*')
# 模糊匹配后筛选,返回的是布尔值
order['ts'].astype(str).str.contains('08-01')

# Hive SQL
# 提取前8位
# substr(string A, int start, int len)
select *, substring(orderid, 1, 8) as dt
from t_order;
# 模糊匹配后提取
# regexp_extract(str, regexp[, idx])
# idx默认值是1,表示返回正则表达式中第一个() 对应的结果。以此类推。0表示返回全部结果
select *, regexp_extract(ts, '(\\d{4}-\\d{2}-\\d{2}).*', 1) as dt
from t_order;

15 替换字符串中字符

# 将“-”替换成空
# Python
order[order['ts'].astype(str).str.replace("-","")]

# Hive SQL
select *, regexp_replace(ts, '-', '') as dt
from t_order;

16 根据某列,对另外一列进行计数

# 统计ts字段中含有‘2019-08-01’的订单有多少,ts字段中含有‘2019-08-02’的订单有多少

# Python
# DataFrame.apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds)
# axis {0 or ‘index’: apply function to each column. 
# 1 or ‘columns’: apply function to each row.}, default 0
def func(x):
    if '2019-08-01' in x['ts']:
        return '2019-08-01'
    elif '2019-08-02' in x['ts']:
        return '2019-08-02'
    else:
        return None
order['count_condition'] = order.apply(func, axis=1)
order.groupby('count_condition').agg({'orderid': 'count'})

# Hive SQL
# case when ts like '%08-01%' then orderid end
# 如果ts列包含“08-01”则替换为orderid列的内容,如果不包含则为空
select count(case when ts like '%08-01%' then orderid end) as 0801_count, 
count(case when ts like '%08-02%' then orderid end) as 0802_count
from t_order;

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值