import pandas as pd
import numpy as np
#from pandasql import sqldf
url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
tips = pd.read_csv(url)
tips.head()
# SELECT
# 'total_bill','tip','sex','smoker','time'
# FROM TIPS
# LIMIT 5;
tips[['total_bill','tip','sex','smoker','time']].head()
# SELECT
# *
# FROM TIPS
# WHERE TIME='Dinner'
# LIMIT 5;
tips[tips['time']=='Dinner'].head()
# SELECT *
# FROM TIPS
# WHERE TIME ='Dinner' AND TIP >5.00
tips[(tips['time']=='Dinner') & (tips['tip']>5.00)].head()
# SELECT *
# FROM TIPS
# WHERE SIZE >=5 OR TOTAL_BILL >45;
tips[(tips['size']>=5) | (tips['total_bill']>45.00)]
# SELECT SEX,COUNT(*)
# FROM TIPS
# GROUP BY SEX
tips.groupby('sex').size()
tips.groupby('sex').count()
tips.groupby('sex')['total_bill'].count()
# SELECT DAY,AVG(TIP),COUNT(*)
# FROM TIPS
# GROUP BY DAY;
tips.groupby('day').agg({'tip':np.mean,'day':np.size})
# SELECT SMOKER,DAY,COUNT(*),AVG(TIP)
# FROM TIPS
# GROUP BY SMOKER,DAY
# tips.groupby(['smoker','day']).agg({'tip':np.mean,'day':np.size})
tips.groupby(['smoker','day']).agg({'tip':[np.mean,np.size]})
# JOIN
df1 = pd.DataFrame({'key':['A','B','C','D'],
'value':np.random.randn(4)})
df2 = pd.DataFrame({'key':['B','D','D','E'],
'value':np.random.randn(4)})
# INNER JOIN
# SELECT *
# FROM DF1
# INNER JOIN DF2
# ON DF1.KEY = DF2.KEY
pd.merge(df1,df2,on='key')
#LEFT OUTER JOIN
# SELECT *
# FROM DF1
# LEFT OUTER JOIN DF2
# ON DF1.KEY = DF2.KEY
pd.merge(df1,df2,on='key',how='left')
#RIGHT JOIN
# SELECT *
# FROM DF1
# RIGHT OUTER JOIN DF2
# ON DF1.KEY = DF2.KEY
pd.merge(df1,df2,on='key',how='right')
#FULL JOIN
# SELECT *
# FROM DF1
# FULL OUTER JOIN DF2
# ON DF1.KEY = DF2.KEY
pd.merge(df1,df2,on='key',how='outer')
#UNOIN
df1 = pd.DataFrame({'city':['Chicago','San Francisco','New York City'],
'rank':range(1,4)})
df2 = pd.DataFrame({'city':['Chicago','Boston','Los Angeles'],
'rank':[1,4,5]})
# SELECT CITY,RANK
# FROM DF1
# UNION ALL
# SELECT CITY,RANK
# FROM DF2
pd.concat([df1,df2])
# SELECT CITY,RANK
# FROM DF1
# UNION
# SELECT CITY,RANK
# FROM DF2
pd.concat([df1,df2]).drop_duplicates()
#TOP N ROWS WITH OFFSET(MYSQL)
# SELECT *
# FROM TIPS
# ORDER BY TIP DESC
# LIMIT 10 OFFSET 5
tips.nlargest(10+5,columns='tip').tail(10)
#TOP N ROWS PER GROUP(ORACLE`S ROW_NUMBER() ANALYTIC FUNCTION)
# SELECT * FROM(
# SELECT T.*
# ROW_NUM() OVER( PARTITION BY DAY ORDER BY TOTAL_BILL DESC) AS RN
# FROM TIPS T
# )
# WHERE RN<3
# ORDER BY DAY,RN
(tips.assign(rn=tips.sort_values(['total_bill'],ascending=False)
.groupby(['day'])
.cumcount()+1)
.query('rn<3')
.sort_values(['day','rn']))
# the same using rank(method='first')function,
(tips.assign(rnk=tips.groupby(['day'])['total_bill']
.rank(method='first',ascending=False))
.query('rnk<3')
.sort_values(['day','rnk']))
#ORACLE`S RANK() ANALYTIC FUNCITON
# SELECT * FROM(
# SELECT
# T.*,
# RANK() OVER(PATITION BY SEX ORDER BY TIP) AS RNK
# FROM TIPS T
# WHERE TIP < 2
# )
# WHERE RNK < 3
# ORDER BY SEX,RNK
(tips[tips['tip']<2]
.assign(rnk_min=tips.groupby(['sex'])['tip']
.rank(method='min'))
.query('rnk_min<3')
.sort_values(['sex','rnk_min']))
#update
# UPDATE TIPS
# SET TIP = TIP*2
# WHERE TIP<2
tips.loc[tips['tip']<2,'tip']*=2
#DELETE
# DELETE FROM TIPS
# WHERE TIP >9
tips = tips.loc[tips['tip']<=9]
pandas和SQL对应关系
最新推荐文章于 2023-02-11 15:29:56 发布