python merge on多个条件_【Python】当数据离开了 SQL

e3c43739b6de5f54cd93083dd669c160.png
原文地址(一)
原文地址(二)

介绍

SQL 是用于访问和处理数据库的标准的计算机语言。 常用于数据库管理系统(RDBMS)中, 这类数据库包括 MySQLSQL ServerOracle等。

Pandas 是一个开源的,为 Python 提供高性能的,数据结构以及数据分析工具。

在熟练地使用 SQL 的同时,为满足一些的业务需求,常常需要我们将数据提取后,再对数据进行统计分析,那应该如何使用 Pandas 达到和 SQL 一样的效果呢?

下面的速查表将会逐一使用 Pandas 对常见的 SQL 语句进行映射。

本篇内容

本篇将解构下面的 SQL 查询句式, 使用 Pandas 进行实现

SQL 查询句式

SELECT DISTINCT [字段] 
FROM [表] JOIN [bin] ON [连接条件] 
WHERE [过滤条件] 
GROUP BY [字段] 
HAVING [条件] 
ORDER BY [字段] DESC 
LIMIT [个数] OFFSET [个数]

SQL 创建句式

CREATE TABLE [表名] (
    [列名] [类型],
    [列名] [类型],
   ....
);

SQL 插入句式

INSERT INTO [表名] VALUES ([值], [值], ...);
INSERT INTO [表名] ([列名],[列名] ...) VALUES ([值], [值], ...);

SQL 更新句式

UPDATE [表名]
SET [列名] = [值], [列名] = [值]
WHERE [过滤条件];

SQL 删除句式

DELETE FROM [表名] WHERE [过滤条件];

读取测试数据

import pandas as pd
import pymysql
conn = pymysql.connect(host='127.0.0.1',
                       user='root',
                       password='12345678',
                       db='test_db')

df = pd.read_sql(sql="select * from student", con=conn)
df_comments = pd.read_sql(sql="select * from comments", con=conn)

数据预览

df

id name age sex city money 0 1 赵雷 1990-01-01 男 北京 20.0 1 2 钱电 1990-12-21 男 天津 30.0 2 3 孙风 1990-12-20 男 成都 2.0 3 4 李云 1990-12-06 男 北京 100.0 4 5 周梅 1991-12-01 女 成都 50.0 5 6 吴兰 1992-01-01 女 北京 3.0 6 7 郑竹 1989-01-01 女 成都 200.0 7 8 张三 2017-12-20 女 天津 20.0 8 9 李四 2017-12-25 女 西安 35.0 9 10 李四 2012-06-06 女 北京 300.0 10 11 赵六 2013-06-13 女 成都 5.0 11 12 孙七 2014-06-01 女 天津 210.0 12 13 张三 2017-12-20 女 天津 20.0

df_comments

id student_id content 0 1 1 测试评论1 1 2 5 测试评论5 2 3 2 测试评论2 3 4 3 测试评论3 4 5 1 测试评论11 5 6 9 测试评论9

SELECT

SQL

SELECT * FROM student

SELECT id, name, sex FROM student

Pandas

df

df[['id','name','sex']]

DISTINCT

SQL

SELECT DISTINCT name FROM student

Pandas

df['name'].unique()

COUNT & SUM & MAX & MIN & AVG

SQL

SELECT COUNT(*) FROM student

SELECT SUM(money) FROM student

SELECT id, MAX(money) FROM student

SELECT id, MIN(money) FROM student

SELECT AVG(money) FROM student

Pandas

df['id'].count()

df['money'].sum()

df[df['money'] == df['money'].max()]

df[df['money'] == df['money'].min()]

df['money'].mean()

描述性统计数据

In [1]: df['money'].describe()

Out[1]: count     12.000000
        mean      59.583333
        std       72.963825
        min        2.000000
        25%       16.250000
        50%       32.500000
        75%       62.500000
        max      210.000000
        Name: money, dtype: float64

WHERE

例子: = > <

SQL

SELECT * FROM student WHERE sex = '男';
SELECT * FROM student WHERE money > 100;
SELECT * FROM student WHERE money < 100;

Pandas

# df['sex'] or df.sex 都可
df[df.sex == ('男')]
df[df['money'] > 100]
df[df['money'] < 100]

例子: in & not in

SQL

SELECT * FROM student WHERE id IN (2,4,6,8,10);

SELECT * FROM student WHERE id NOT IN (2,4,6,8,10);

Pandas

df[df['id'].isin((2,4,6,8))]

df[~df['id'].isin((2,4,6,8))]

多个条件

SQL

SELECT * FROM student WHERE sex = '男' and id IN (2,4,6,8,10);

Pandas

df[(df['sex'] == ('男')) & (df['id'].isin((2,4,6,8)))]

LIMIT OFFSET

SQL

SELECT * FROM student ORDER BY id DESC LIMIT 3;

SELECT * FROM student ORDER BY id DESC LIMIT 3 OFFSET 2;

Pandas

df.sort_values('id',ascending=False).head(3)

df.nlargest(2 + 3, 'id').tail(3)

SELECT & WHERE & LIMIT

SQL

SELECT * FROM student WHERE sex = '男' LIMIT 3;

SELECT id, name, sex FROM student WHERE sex ='男' LIMIT 3;

Pandas

df[df['sex'] == ('男')].head(3)

df[df['sex'] == ('男')][['id','name','sex']].head(3)

ORDER BY

SQL

SELECT * FROM student ORDER BY age;

SELECT * FROM student ORDER BY age DESC;

Pandas

df.sort_values('age')

df.sort_values('age', ascending=False)

GROUP BY

GROYP BY & COUNT

SQL

SELECT city, COUNT(*) FROM student GROUP BY city;

Pandas

df.groupby(['city']).size().to_frame('size').reset_index()

GROYP BY & SUM

SQL

SELECT city, SUM(money) FROM student GROUP BY city;

Pandas

df.groupby(['city'])['money'].agg('sum').reset_index()

GROUP BY & ORDER BY & COUNT

GROUP BY 单字段

SQL

SELECT city, COUNT(*) FROM student GROUP BY sex ORDER BY city;

Pandas

df.groupby(['city']).size().to_frame('size').reset_index().sort_values('city')

GROUP BY 多字段

SQL

SELECT city, sex, COUNT(*) FROM student GROUP BY city, sex ORDER BY city;

Pandas

df.groupby(['city','sex']).size().to_frame('size').reset_index().sort_values('city')

HAVING

SQL

SELECT city, COUNT(*) FROM student GROUP BY city HAVING count(*) > 3;

Pandas

df.groupby('city').filter(lambda x:len(x)>3).groupby('city').size().to_frame('size').reset_index()

JOIN

(INNER) JOIN

SQL

SELECT
    * 
FROM
    student
    INNER JOIN comments ON student.id = comments.student_id;

Pandas

pd.merge(df, df_comments, left_on='id', right_on='student_id')

LEFT (OUTER) JOIN

SQL

SELECT
    * 
FROM
    student
    LEFT JOIN comments ON student.id = comments.student_id;

Pandas

pd.merge(df, df_comments, left_on='id', right_on='student_id', how='left')

RIGHT (OUTER) JOIN

SQL

SELECT
    * 
FROM
    student
    RIGHT JOIN comments ON student.id = comments.student_id;

Pandas

pd.merge(df, df_comments, left_on='id', right_on='student_id', how='right')

UNION

SQL

SELECT * FROM student where city ='北京' 
UNION 
SELECT * FROM student where sex ='男';

Pandas

pd.concat([df[df.city == '北京'], df[df.sex == '男']]).drop_duplicates().reset_index()

UNION ALL

SQL

SELECT * FROM student where city ='北京' 
UNION ALL 
SELECT * FROM student where sex ='男';

Pandas

pd.concat([df[df.city == '北京'], df[df.sex == '男']]).reset_index()

CREATE

SQL

CREATE TABLE student (
    id INT ( 11 ) NOT NULL AUTO_INCREMENT,
    name VARCHAR ( 10 ) COLLATE utf8mb4_general_ci DEFAULT NULL,
    age date DEFAULT NULL,
    sex VARCHAR ( 10 ) COLLATE utf8mb4_general_ci DEFAULT NULL,
    city VARCHAR ( 255 ) CHARACTER 
    SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
    money DOUBLE ( 255, 2 ) DEFAULT NULL,

);

Pandas

pd.DataFrame(columns=['id', 'name', 'sex', 'city', 'money'])

INSERT

SQL

INSERT INTO student (id, name, age, sex, city, money )
VALUES
    (1, '张三', '2017-12-20', '女', '天津', 20.00 );

Pandas

# 第一种
df.loc[-1] = [1, '张三', '女', '天津', 20.00]
df.index = df.index + 1
df = df.sort_index()

# 第二种
temp_pd = pd.DataFrame({'id': [1], 'name': ['张三'], 'sex': ['女'], 'city': ['天津'], 'money': [20.00]})
df = pd.concat([df,temp_pd], ignore_index=True)
df.reset_index()

# 第三种
temp_pd = pd.DataFrame([[1,'张三1', '女', '天津', 20.00]], columns=df.columns)
df = pd.concat([df, temp_pd])
df.reset_index()

UPDATE

SQL

UPDATE student SET money = 300 WHERE id = 1;

Pandas

df.loc[df.id == 1, 'money'] = 300

DELETE

SQL

-- 测试时会因外键报错,此处忽略,仅讨论句法
DELETE FROM student WHERE id = 1;

Pandas

df = df.loc[df.id != 1]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值