原文地址(一)
原文地址(二)
介绍
SQL 是用于访问和处理数据库的标准的计算机语言。 常用于数据库管理系统(RDBMS)中, 这类数据库包括 MySQL、SQL Server、Oracle等。
Pandas 是一个开源的,为 Python 提供高性能的,数据结构以及数据分析工具。
在熟练地使用 SQL 的同时,为满足一些的业务需求,常常需要我们将数据提取后,再对数据进行统计分析,那应该如何使用 Pandas 达到和 SQL 一样的效果呢?
下面的速查表将会逐一使用 Pandas 对常见的 SQL 语句进行映射。
本篇内容
本篇将解构下面的 SQL 查询句式, 使用 Pandas 进行实现
SQL 查询句式
SELECT DISTINCT [字段]
FROM [表] JOIN [bin] ON [连接条件]
WHERE [过滤条件]
GROUP BY [字段]
HAVING [条件]
ORDER BY [字段] DESC
LIMIT [个数] OFFSET [个数]
SQL 创建句式
CREATE TABLE [表名] (
[列名] [类型],
[列名] [类型],
....
);
SQL 插入句式
INSERT INTO [表名] VALUES ([值], [值], ...);
INSERT INTO [表名] ([列名],[列名] ...) VALUES ([值], [值], ...);
SQL 更新句式
UPDATE [表名]
SET [列名] = [值], [列名] = [值]
WHERE [过滤条件];
SQL 删除句式
DELETE FROM [表名] WHERE [过滤条件];
读取测试数据
import pandas as pd
import pymysql
conn = pymysql.connect(host='127.0.0.1',
user='root',
password='12345678',
db='test_db')
df = pd.read_sql(sql="select * from student", con=conn)
df_comments = pd.read_sql(sql="select * from comments", con=conn)
数据预览
df
id name age sex city money 0 1 赵雷 1990-01-01 男 北京 20.0 1 2 钱电 1990-12-21 男 天津 30.0 2 3 孙风 1990-12-20 男 成都 2.0 3 4 李云 1990-12-06 男 北京 100.0 4 5 周梅 1991-12-01 女 成都 50.0 5 6 吴兰 1992-01-01 女 北京 3.0 6 7 郑竹 1989-01-01 女 成都 200.0 7 8 张三 2017-12-20 女 天津 20.0 8 9 李四 2017-12-25 女 西安 35.0 9 10 李四 2012-06-06 女 北京 300.0 10 11 赵六 2013-06-13 女 成都 5.0 11 12 孙七 2014-06-01 女 天津 210.0 12 13 张三 2017-12-20 女 天津 20.0
df_comments
id student_id content 0 1 1 测试评论1 1 2 5 测试评论5 2 3 2 测试评论2 3 4 3 测试评论3 4 5 1 测试评论11 5 6 9 测试评论9
SELECT
SQL
SELECT * FROM student
SELECT id, name, sex FROM student
Pandas
df
df[['id','name','sex']]
DISTINCT
SQL
SELECT DISTINCT name FROM student
Pandas
df['name'].unique()
COUNT & SUM & MAX & MIN & AVG
SQL
SELECT COUNT(*) FROM student
SELECT SUM(money) FROM student
SELECT id, MAX(money) FROM student
SELECT id, MIN(money) FROM student
SELECT AVG(money) FROM student
Pandas
df['id'].count()
df['money'].sum()
df[df['money'] == df['money'].max()]
df[df['money'] == df['money'].min()]
df['money'].mean()
描述性统计数据
In [1]: df['money'].describe()
Out[1]: count 12.000000
mean 59.583333
std 72.963825
min 2.000000
25% 16.250000
50% 32.500000
75% 62.500000
max 210.000000
Name: money, dtype: float64
WHERE
例子: = > <
SQL
SELECT * FROM student WHERE sex = '男';
SELECT * FROM student WHERE money > 100;
SELECT * FROM student WHERE money < 100;
Pandas
# df['sex'] or df.sex 都可
df[df.sex == ('男')]
df[df['money'] > 100]
df[df['money'] < 100]
例子: in & not in
SQL
SELECT * FROM student WHERE id IN (2,4,6,8,10);
SELECT * FROM student WHERE id NOT IN (2,4,6,8,10);
Pandas
df[df['id'].isin((2,4,6,8))]
df[~df['id'].isin((2,4,6,8))]
多个条件
SQL
SELECT * FROM student WHERE sex = '男' and id IN (2,4,6,8,10);
Pandas
df[(df['sex'] == ('男')) & (df['id'].isin((2,4,6,8)))]
LIMIT OFFSET
SQL
SELECT * FROM student ORDER BY id DESC LIMIT 3;
SELECT * FROM student ORDER BY id DESC LIMIT 3 OFFSET 2;
Pandas
df.sort_values('id',ascending=False).head(3)
df.nlargest(2 + 3, 'id').tail(3)
SELECT & WHERE & LIMIT
SQL
SELECT * FROM student WHERE sex = '男' LIMIT 3;
SELECT id, name, sex FROM student WHERE sex ='男' LIMIT 3;
Pandas
df[df['sex'] == ('男')].head(3)
df[df['sex'] == ('男')][['id','name','sex']].head(3)
ORDER BY
SQL
SELECT * FROM student ORDER BY age;
SELECT * FROM student ORDER BY age DESC;
Pandas
df.sort_values('age')
df.sort_values('age', ascending=False)
GROUP BY
GROYP BY & COUNT
SQL
SELECT city, COUNT(*) FROM student GROUP BY city;
Pandas
df.groupby(['city']).size().to_frame('size').reset_index()
GROYP BY & SUM
SQL
SELECT city, SUM(money) FROM student GROUP BY city;
Pandas
df.groupby(['city'])['money'].agg('sum').reset_index()
GROUP BY & ORDER BY & COUNT
GROUP BY 单字段
SQL
SELECT city, COUNT(*) FROM student GROUP BY sex ORDER BY city;
Pandas
df.groupby(['city']).size().to_frame('size').reset_index().sort_values('city')
GROUP BY 多字段
SQL
SELECT city, sex, COUNT(*) FROM student GROUP BY city, sex ORDER BY city;
Pandas
df.groupby(['city','sex']).size().to_frame('size').reset_index().sort_values('city')
HAVING
SQL
SELECT city, COUNT(*) FROM student GROUP BY city HAVING count(*) > 3;
Pandas
df.groupby('city').filter(lambda x:len(x)>3).groupby('city').size().to_frame('size').reset_index()
JOIN
(INNER) JOIN
SQL
SELECT
*
FROM
student
INNER JOIN comments ON student.id = comments.student_id;
Pandas
pd.merge(df, df_comments, left_on='id', right_on='student_id')
LEFT (OUTER) JOIN
SQL
SELECT
*
FROM
student
LEFT JOIN comments ON student.id = comments.student_id;
Pandas
pd.merge(df, df_comments, left_on='id', right_on='student_id', how='left')
RIGHT (OUTER) JOIN
SQL
SELECT
*
FROM
student
RIGHT JOIN comments ON student.id = comments.student_id;
Pandas
pd.merge(df, df_comments, left_on='id', right_on='student_id', how='right')
UNION
SQL
SELECT * FROM student where city ='北京'
UNION
SELECT * FROM student where sex ='男';
Pandas
pd.concat([df[df.city == '北京'], df[df.sex == '男']]).drop_duplicates().reset_index()
UNION ALL
SQL
SELECT * FROM student where city ='北京'
UNION ALL
SELECT * FROM student where sex ='男';
Pandas
pd.concat([df[df.city == '北京'], df[df.sex == '男']]).reset_index()
CREATE
SQL
CREATE TABLE student (
id INT ( 11 ) NOT NULL AUTO_INCREMENT,
name VARCHAR ( 10 ) COLLATE utf8mb4_general_ci DEFAULT NULL,
age date DEFAULT NULL,
sex VARCHAR ( 10 ) COLLATE utf8mb4_general_ci DEFAULT NULL,
city VARCHAR ( 255 ) CHARACTER
SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
money DOUBLE ( 255, 2 ) DEFAULT NULL,
);
Pandas
pd.DataFrame(columns=['id', 'name', 'sex', 'city', 'money'])
INSERT
SQL
INSERT INTO student (id, name, age, sex, city, money )
VALUES
(1, '张三', '2017-12-20', '女', '天津', 20.00 );
Pandas
# 第一种
df.loc[-1] = [1, '张三', '女', '天津', 20.00]
df.index = df.index + 1
df = df.sort_index()
# 第二种
temp_pd = pd.DataFrame({'id': [1], 'name': ['张三'], 'sex': ['女'], 'city': ['天津'], 'money': [20.00]})
df = pd.concat([df,temp_pd], ignore_index=True)
df.reset_index()
# 第三种
temp_pd = pd.DataFrame([[1,'张三1', '女', '天津', 20.00]], columns=df.columns)
df = pd.concat([df, temp_pd])
df.reset_index()
UPDATE
SQL
UPDATE student SET money = 300 WHERE id = 1;
Pandas
df.loc[df.id == 1, 'money'] = 300
DELETE
SQL
-- 测试时会因外键报错,此处忽略,仅讨论句法
DELETE FROM student WHERE id = 1;
Pandas
df = df.loc[df.id != 1]