目录
1 查询前N行
# python
order_data.head(10)
# mysql
select * from t_order limit 10
2 查询特定列
# python
# 1
order_data[['orderid', 'amount']]
# 2
order_data.loc[:,['orderid', 'amount']]
# 3
order_data.iloc[:,[3,4]]
# mysql
select orderid,amount from t_order
3 查询某列的去重数据
# python
# 查询所有不重复的值
order_data['uid'].unique()
# 查询所有不重复值的数量
order_data['uid'].nunique()
# mysql
# 查询所有不重复的值
select distinct uid from t_order
# 查询所有不重复值的数量
select count(distinct uid) from t_order
4 根据条件筛选数据
# python
# 且用 &, 或用 |
order_data[(order_data['uid'] == 10003) & (order_data['amount'] > 50)]
# mysql
# 且用 and, 或用 or
select * from t_order
where uid = 10003
and amount > 50
5 分组计数
# python
order_data.groupby('uid').size()
# mysql
select uid,count(*) from t_order
group by uid
6 按某列分组后,对其它列进行统计
# 按uid分组后,分别对amount和orderid列进行统计
# python
# agg支持的fun名字:count、sum、mean、median、min、max、prod(求积)、std(标准差)、var(方差)、
# unique(罗列不重复的项)、first、last(最后一个非NA值)
# 复杂的可以使用apply函数
order_data.groupby('uid').agg({'amount':['mean','sum'],'orderid':['count']})
# mysql
# 取两位小数round(sum(amount),2)
select uid, sum(amount) as amount_mean, count(orderid) as orderid_count
from t_order
group by uid
7 横向连接表
# python
# how的参数有left/right/outer/inner
pd.merge(order_data, user_data, on='uid', how='left')
# mysql
# left join/right join/outer join/inner join
select *
from t_order
left join t_user
on t_order.uid = t_user.uid
8 纵向连接表
# python
# 不去重
pd.concat([order_data,order2_data])
# 去重
pd.concat([order_data,order2_data]).drop_duplicates()
# mysql
# 不去重
select * from t_order
union all
select * from t_order2
# 去重
select * from t_order
union
select * from t_order2
9 分组后排序
# python
# 降序
order_data.groupby('uid').size().sort_values(ascending=False)
# 多列排序
order_df= order_data.groupby('uid').agg({'orderid':'count','amount':'sum'})
order_df.rename(columns={'orderid':'orderid_count','amount':'amount_sum'},inplace=True)
order_df.sort_values(by=['orderid_count', 'amount_sum'], ascending=[False, True])
# mysql
# 降序
select uid, count(*) as uid_count
from t_order
group by uid
order by uid_count desc
# 多列排序
select uid, count(orderid) as orderid_count, sum(amount) as amount_sum
from t_order
group by uid
order by orderid_count desc, amount_sum
10 按条件分组
# 按uid字段对总金额求和后,再把总金额分为[0-300),[300,600),[600,900)三组
# python
def func(x):
if x < 300:
return '[0,300)'
elif x < 600:
return '[300,600)'
elif x < 900:
return '[600,900)'
else:
return 'other'
order_df= order_data.groupby('uid').agg({'orderid':'count','amount':'sum'})
order_df.rename(columns={'orderid':'orderid_count','amount':'amount_sum'},inplace=True)
order_df.sort_values(by=['orderid_count', 'amount_sum'], ascending=[False, True])
order_df['amount_inerval'] = order_df['amount_sum'].map(func)
# mysql
select uid, orderid_count,
case when amount_sum < 300 then '[0,300)'
when amount_sum >= 300 and amount_sum < 600 then '[300,600)'
when amount_sum >= 600 and amount_sum < 900 then '[600,900)'
else 'other' end as amount_inerval
from(
select uid, count(distinct orderid) as orderid_count, sum(amount) as amount_sum
from t_order
group by uid
) A
11 按条件更新某个值
# 20岁及以下的年龄替换成20岁
# python
user_data.loc[user_data['age']<=20,'age'] = 20
# mysql
# 先选中表
select * from t_user;
update t_user set age = 20 where age <=20
from t_user
12 按条件删除行
# 删除20岁及以下的
# python
user_data = user_data[user_data['age'] > 20]
# mysql
delete from t_user where age <= 20
13 删除列
# python
user_data.drop(['uid'], inplace=True, axis=1)
# mysql
# alter table 删除列drop,添加列add
alter table t_user drop column uid
14 提取/匹配字符串
# Python
# 提取前8位
order['orderid'].astype(str).str[:8]
# 模糊匹配后提取,返回的是正则表达式中() 对应的结果
order['ts'].astype(str).str.extract('(\d{4}-\d{2}-\d{2}).*')
# 模糊匹配后筛选,返回的是布尔值
order['ts'].astype(str).str.contains('08-01')
# Hive SQL
# 提取前8位
# substr(string A, int start, int len)
select *, substring(orderid, 1, 8) as dt
from t_order;
# 模糊匹配后提取
# regexp_extract(str, regexp[, idx])
# idx默认值是1,表示返回正则表达式中第一个() 对应的结果。以此类推。0表示返回全部结果
select *, regexp_extract(ts, '(\\d{4}-\\d{2}-\\d{2}).*', 1) as dt
from t_order;
15 替换字符串中字符
# 将“-”替换成空
# Python
order[order['ts'].astype(str).str.replace("-","")]
# Hive SQL
select *, regexp_replace(ts, '-', '') as dt
from t_order;
16 根据某列,对另外一列进行计数
# 统计ts字段中含有‘2019-08-01’的订单有多少,ts字段中含有‘2019-08-02’的订单有多少
# Python
# DataFrame.apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds)
# axis {0 or ‘index’: apply function to each column.
# 1 or ‘columns’: apply function to each row.}, default 0
def func(x):
if '2019-08-01' in x['ts']:
return '2019-08-01'
elif '2019-08-02' in x['ts']:
return '2019-08-02'
else:
return None
order['count_condition'] = order.apply(func, axis=1)
order.groupby('count_condition').agg({'orderid': 'count'})
# Hive SQL
# case when ts like '%08-01%' then orderid end
# 如果ts列包含“08-01”则替换为orderid列的内容,如果不包含则为空
select count(case when ts like '%08-01%' then orderid end) as 0801_count,
count(case when ts like '%08-02%' then orderid end) as 0802_count
from t_order;