开始简介
数据读写
polars数据读写与pandas类似
存储 df.write_csv("docs/data/output.csv")
读取 df_csv = pl.read_csv("docs/data/output.csv")
# Build a small demo DataFrame with integer, datetime, float and string columns.
import polars as pl
from datetime import datetime
df = pl.DataFrame(
{
"integer": [1, 2, 3],
"date": [
datetime(2025, 1, 1),
datetime(2025, 1, 2),
datetime(2025, 1, 3),
],
"float": [4.0, 5.0, 6.0],
"string": ["a", "b", "c"],
}
)
df
integer | date | float | string |
---|---|---|---|
i64 | datetime[μs] | f64 | str |
1 | 2025-01-01 00:00:00 | 4.0 | "a" |
2 | 2025-01-02 00:00:00 | 5.0 | "b" |
3 | 2025-01-03 00:00:00 | 6.0 | "c" |
Expressions
polars中最核心的部分Expressions,Expressions提供了一个模块结构,在该结构内,你可以使用并不断叠加简单的concepts(另外一个核心的概念),最终实现复杂的查询。
在polars中,主要由以下四个基本的模块结构(也称contexts):
-
select
-
filter
-
group_by
-
with_columns
-
select
为了选择某列,首先需要定义对应的数据集dataframe,其次要明确需要的列
# pl.col('*') selects every column; it is equivalent to pl.all()
print(df.select(pl.col("*")))
print(df.select(pl.all()))
# Select specific columns by name
print(df.select(pl.col('float','date')))
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 2)
┌───────┬─────────────────────┐
│ float ┆ date │
│ --- ┆ --- │
│ f64 ┆ datetime[μs] │
╞═══════╪═════════════════════╡
│ 4.0 ┆ 2025-01-01 00:00:00 │
│ 5.0 ┆ 2025-01-02 00:00:00 │
│ 6.0 ┆ 2025-01-03 00:00:00 │
└───────┴─────────────────────┘
- filter
# Filter rows by a date range (is_between bounds are inclusive by default)
print(df.filter(pl.col('date').is_between(datetime(2025, 1, 1), datetime(2025, 1, 2))))
# Filter rows by a numeric range
print(df.filter(pl.col('float').is_between(5, 6) ))
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
└─────────┴─────────────────────┴───────┴────────┘
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
select和filter返回的dataframe均为筛选后的,其一般不会新增列,group_by和 with_columns能对原始数据的列进行替换或添加
- with_columns
# NOTE(review): 'new_folat' looks like a typo for 'new_float' — kept here to match
# the printed output below.
print(df.with_columns(pl.col('float').sum().alias('new_folat'), (pl.col('string')+'add').alias('string+add')))
# With alias() a new column is created; without it the source column is replaced
print(df.with_columns(pl.col('float').sum(), (pl.col('string')+'add')))
shape: (3, 6)
┌─────────┬─────────────────────┬───────┬────────┬───────────┬────────────┐
│ integer ┆ date ┆ float ┆ string ┆ new_folat ┆ string+add │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╪═══════════╪════════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┴───────────┴────────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┘
- group_by
# Second demo frame: a numeric column and a group label column for group_by examples.
df2 = pl.DataFrame(
{
"x": range(8),
"y": ["A", "A", "A", "B", "B", "C", "X", "X"],
}
)
df2.head()
x | y |
---|---|
i64 | str |
0 | "A" |
1 | "A" |
2 | "A" |
3 | "B" |
4 | "B" |
df2.group_by(['y'], maintain_order=True).mean()
y | x |
---|---|
str | f64 |
"A" | 1.0 |
"B" | 3.5 |
"C" | 5.0 |
"X" | 6.5 |
print(df2.group_by('y', maintain_order=True).agg(pl.col('*').mean().alias('mean'), pl.col('*').count().alias('count'),))
shape: (4, 3)
┌─────┬──────┬───────┐
│ y ┆ mean ┆ count │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ u32 │
╞═════╪══════╪═══════╡
│ A ┆ 1.0 ┆ 3 │
│ B ┆ 3.5 ┆ 2 │
│ C ┆ 5.0 ┆ 1 │
│ X ┆ 6.5 ┆ 2 │
└─────┴──────┴───────┘
以上4种结构不仅可以单独使用,还可以相互配合以实现更强大的查询需求
print( df.with_columns((pl.col('float')*6).alias('float*6')).select(pl.all().exclude('string')))
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬─────────┐
│ integer ┆ date ┆ float ┆ float*6 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ f64 │
╞═════════╪═════════════════════╪═══════╪═════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ 24.0 │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ 30.0 │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ 36.0 │
└─────────┴─────────────────────┴───────┴─────────┘
合并数据
# Two frames sharing a key-like column ("a" / "x") for the join/hstack examples.
import numpy as np
df3 = pl.DataFrame(
{
"a": range(8),
"b": np.random.rand(8),
"d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
}
)
df4 = pl.DataFrame(
{
"x": range(8),
"y": ["A", "A", "A", "B", "B", "C", "X", "X"],
}
)
print(df3.head(), df4.head())
shape: (5, 3)
┌─────┬──────────┬─────┐
│ a ┆ b ┆ d │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪══════════╪═════╡
│ 0 ┆ 0.411314 ┆ 1.0 │
│ 1 ┆ 0.984068 ┆ 2.0 │
│ 2 ┆ 0.169014 ┆ NaN │
│ 3 ┆ 0.712731 ┆ NaN │
│ 4 ┆ 0.248682 ┆ 0.0 │
└─────┴──────────┴─────┘ shape: (5, 2)
┌─────┬─────┐
│ x ┆ y │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0 ┆ A │
│ 1 ┆ A │
│ 2 ┆ A │
│ 3 ┆ B │
│ 4 ┆ B │
└─────┴─────┘
# Join the two frames on a == x; the right-hand key column is dropped from the result
df5 = df3.join(df4, left_on="a", right_on="x")
df5
a | b | d | y |
---|---|---|---|
i64 | f64 | f64 | str |
0 | 0.411314 | 1.0 | "A" |
1 | 0.984068 | 2.0 | "A" |
2 | 0.169014 | NaN | "A" |
3 | 0.712731 | NaN | "B" |
4 | 0.248682 | 0.0 | "B" |
5 | 0.921465 | -5.0 | "C" |
6 | 0.516578 | -42.0 | "X" |
7 | 0.145339 | null | "X" |
df3.hstack(df4)
a | b | d | x | y |
---|---|---|---|---|
i64 | f64 | f64 | i64 | str |
0 | 0.411314 | 1.0 | 0 | "A" |
1 | 0.984068 | 2.0 | 1 | "A" |
2 | 0.169014 | NaN | 2 | "A" |
3 | 0.712731 | NaN | 3 | "B" |
4 | 0.248682 | 0.0 | 4 | "B" |
5 | 0.921465 | -5.0 | 5 | "C" |
6 | 0.516578 | -42.0 | 6 | "X" |
7 | 0.145339 | null | 7 | "X" |
Concepts - Polars API 的核心概念介绍
数据类型
常见的数据类型都包括在polars中,详见:https://docs.pola.rs/user-guide/concepts/data-types/overview/
在使用中,可以采用.cast(dtype)的方法,转换对应的数据类型
数据结构
与pandas一样,主要包括 Series 和 DataFrame
# Series
# A 1-dimensional, named data structure
import polars as pl
pl.Series('num',range(6))
num |
---|
i64 |
0 |
1 |
2 |
3 |
4 |
5 |
DataFrame
二维数据,与pandas中DataFrame概念相同,可以通过读取、创建、变换等方式得到。使用pl.DataFrame(),与df.to_pandas(),可以和pandas的DataFrame相互变换.
# Build a DataFrame and show conversion to pandas via .to_pandas().
from datetime import datetime
df = pl.DataFrame(
{
"integer": [1, 2, 3, 4, 5],
"date": [
datetime(2022, 1, 1),
datetime(2022, 1, 2),
datetime(2022, 1, 3),
datetime(2022, 1, 4),
datetime(2022, 1, 5),
],
"float": [4.0, 5.0, 6.0, 7.0, 8.0],
}
)
print(df)
print(type(df.to_pandas()))
shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
<class 'pandas.core.frame.DataFrame'>
数据浏览
与pandas相同,polars包含.head(), .tail(), .sample(), .describe()等方法初步观看数据,同时支持[:,:]切片查询
# First rows
print(df.head(2))
# Last rows
print(df.tail(3))
# Random sample
print(df.sample(1))
# Summary statistics
print(df.describe())
# Slice query: all rows, columns 1..2
print(df[:,1:3])
shape: (2, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
└─────────┴─────────────────────┴───────┘
shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
shape: (1, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────────┴─────────────────────┴───────┘
shape: (9, 4)
┌────────────┬──────────┬─────────────────────┬──────────┐
│ statistic ┆ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ str ┆ f64 │
╞════════════╪══════════╪═════════════════════╪══════════╡
│ count ┆ 5.0 ┆ 5 ┆ 5.0 │
│ null_count ┆ 0.0 ┆ 0 ┆ 0.0 │
│ mean ┆ 3.0 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ std ┆ 1.581139 ┆ null ┆ 1.581139 │
│ min ┆ 1.0 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 25% ┆ 2.0 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 50% ┆ 3.0 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 75% ┆ 4.0 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ max ┆ 5.0 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└────────────┴──────────┴─────────────────────┴──────────┘
shape: (5, 2)
┌─────────────────────┬───────┐
│ date ┆ float │
│ --- ┆ --- │
│ datetime[μs] ┆ f64 │
╞═════════════════════╪═══════╡
│ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2022-01-02 00:00:00 ┆ 5.0 │
│ 2022-01-03 00:00:00 ┆ 6.0 │
│ 2022-01-04 00:00:00 ┆ 7.0 │
│ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────────────────┴───────┘
Contexts 查询背景
polars的两个核心组件是背景(Contexts)和表达式(Expressions)
polars中主要的三种查询背景分别为:
- 选择:df.select(), df.with_columns()
- 筛选: df.filter()
- 分类/聚合: df.group_by().agg()
# select
# Expressions support broadcasting. Without an alias the result keeps the source
# column name (replacing it), so watch out for duplicate column names.
print(df)
print(df.select(
pl.col('date').dt.day(),
pl.col('float')/pl.col('float').sum(),
(pl.col('float')*10).alias('float*10')
))
shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
shape: (5, 3)
┌──────┬──────────┬──────────┐
│ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- │
│ i8 ┆ f64 ┆ f64 │
╞══════╪══════════╪══════════╡
│ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 0.266667 ┆ 80.0 │
└──────┴──────────┴──────────┘
# with_columns differs from select: with_columns keeps the other columns,
# while select keeps only the chosen expressions.
print(df.select(
pl.col('date').dt.day(),
pl.col('float')/pl.col('float').sum(),
(pl.col('float')*10).alias('float*10')
))
print(df.with_columns(
pl.col('date').dt.day(),
pl.col('float')/pl.col('float').sum(),
(pl.col('float')*10).alias('float*10')
))
shape: (5, 3)
┌──────┬──────────┬──────────┐
│ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- │
│ i8 ┆ f64 ┆ f64 │
╞══════╪══════════╪══════════╡
│ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 0.266667 ┆ 80.0 │
└──────┴──────────┴──────────┘
shape: (5, 4)
┌─────────┬──────┬──────────┬──────────┐
│ integer ┆ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i8 ┆ f64 ┆ f64 │
╞═════════╪══════╪══════════╪══════════╡
│ 1 ┆ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 5 ┆ 0.266667 ┆ 80.0 │
└─────────┴──────┴──────────┴──────────┘
# filter accepts one or several predicates (combined here with &)
df.filter((pl.col('integer')>2) & (pl.col('float')<7))
integer | date | float |
---|---|---|
i64 | datetime[μs] | f64 |
3 | 2022-01-03 00:00:00 | 6.0 |
# group_by/aggregation
# Grouping and aggregation
df = pl.DataFrame(
{
"nrs": [1, 2, 3, None, 5],
"names": ["foo", "ham", "spam", "egg", None],
"random": [2, 1, 2, 4, 5],
"groups": ["A", "A", "B", "C", "B"],
}
)
print(df)
# Inside a group_by context several expressions can be applied at once,
# each producing its own output column.
print(df.group_by('groups').agg(
pl.col('nrs'),
pl.col('nrs').mean().alias('nrs_mean'),
pl.col('nrs').count().alias('nrs_count'),
pl.col('nrs').is_null().sum().alias('nrs_na_sum'),
pl.col('names')
))
shape: (5, 4)
┌──────┬───────┬────────┬────────┐
│ nrs ┆ names ┆ random ┆ groups │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ str │
╞══════╪═══════╪════════╪════════╡
│ 1 ┆ foo ┆ 2 ┆ A │
│ 2 ┆ ham ┆ 1 ┆ A │
│ 3 ┆ spam ┆ 2 ┆ B │
│ null ┆ egg ┆ 4 ┆ C │
│ 5 ┆ null ┆ 5 ┆ B │
└──────┴───────┴────────┴────────┘
shape: (3, 6)
┌────────┬───────────┬──────────┬───────────┬────────────┬────────────────┐
│ groups ┆ nrs ┆ nrs_mean ┆ nrs_count ┆ nrs_na_sum ┆ names │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ list[i64] ┆ f64 ┆ u32 ┆ u32 ┆ list[str] │
╞════════╪═══════════╪══════════╪═══════════╪════════════╪════════════════╡
│ C ┆ [null] ┆ null ┆ 0 ┆ 1 ┆ ["egg"] │
│ A ┆ [1, 2] ┆ 1.5 ┆ 2 ┆ 0 ┆ ["foo", "ham"] │
│ B ┆ [3, 5] ┆ 4.0 ┆ 2 ┆ 0 ┆ ["spam", null] │
└────────┴───────────┴──────────┴───────────┴────────────┴────────────────┘
Expressions 查询表达
expression是polars的核心内容,拥有着强大的选择查询能力,以下为一个expression示例
pl.col('nrs').sum()
该expression示意:1.选择'nrs'列;
2.对列进行求和
df.select(pl.col('random').sort().head(3), pl.col('nrs').filter(pl.col('groups')=='A').sum())
random | nrs |
---|---|
i64 | i64 |
1 | 3 |
2 | 3 |
2 | 3 |
Lazy API 与 Streaming API
文档中特别强调该两接口,在大规模数据加载与处理过程中,使用该接口能很好的加快速度,lazy接口能避免重复的执行查询,优化查询路径,从而加快速度
# lazy
# Eager approach: each step runs immediately
df = pl.read_csv("docs/data/iris.csv")
df_small = df.filter(pl.col("sepal_length") > 5)
df_agg = df_small.group_by("species").agg(pl.col("sepal_width").mean())
# Lazy query: note the .scan_* entry point; an existing DataFrame can enter
# lazy mode via .lazy()
q = (
pl.scan_csv("docs/data/iris.csv")
.filter(pl.col("sepal_length") > 5)
.group_by("species")
.agg(pl.col("sepal_width").mean())
)
df = q.collect(streaming=True) # streaming=True engages the streaming API
Expressions – polars核心,查询的表达
基本的运算
在python和Rust中,polars支持如"+ - * / < >"等运算符的直接使用
# Demo frame with a nullable int column, strings, random floats and group labels.
import polars as pl
import numpy as np
df = pl.DataFrame(
{
"nrs": [1, 2, 3, None, 5],
"names": ["foo", "ham", "spam", "egg", None],
"random": np.random.rand(5),
"groups": ["A", "A", "B", "C", "B"],
}
)
print(df)
shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs ┆ names ┆ random ┆ groups │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ f64 ┆ str │
╞══════╪═══════╪══════════╪════════╡
│ 1 ┆ foo ┆ 0.829716 ┆ A │
│ 2 ┆ ham ┆ 0.039304 ┆ A │
│ 3 ┆ spam ┆ 0.865325 ┆ B │
│ null ┆ egg ┆ 0.954994 ┆ C │
│ 5 ┆ null ┆ 0.324218 ┆ B │
└──────┴───────┴──────────┴────────┘
# Arithmetic operators
print(df.select(
(pl.col('nrs') + 5).alias('nrs+5'),
(pl.col('nrs')*pl.col('random')).alias('nrs*random')
))
# Logical operators; Python's `and`/`or` cannot replace & and | in expressions
print(df.select(
(pl.col('nrs')>1).alias('nrs>1'),
(pl.col('groups') != 'A').alias('groups!=A'),
((pl.col('nrs')>1) & (pl.col('random')<0.5)).alias('and_expression'), # and
((pl.col('nrs')>1) | (pl.col('random')<0.5)).alias('or_expression') # or
))
shape: (5, 2)
┌───────┬────────────┐
│ nrs+5 ┆ nrs*random │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═══════╪════════════╡
│ 6 ┆ 0.829716 │
│ 7 ┆ 0.078608 │
│ 8 ┆ 2.595976 │
│ null ┆ null │
│ 10 ┆ 1.621089 │
└───────┴────────────┘
shape: (5, 4)
┌───────┬───────────┬────────────────┬───────────────┐
│ nrs>1 ┆ groups!=A ┆ and_expression ┆ or_expression │
│ --- ┆ --- ┆ --- ┆ --- │
│ bool ┆ bool ┆ bool ┆ bool │
╞═══════╪═══════════╪════════════════╪═══════════════╡
│ false ┆ false ┆ false ┆ false │
│ true ┆ false ┆ true ┆ true │
│ true ┆ true ┆ false ┆ true │
│ null ┆ true ┆ false ┆ null │
│ true ┆ true ┆ true ┆ true │
└───────┴───────────┴────────────────┴───────────────┘
选择列
常规选择
# Richer demo frame (dates, datetimes, booleans) plus a row index column.
from datetime import date, datetime
df = pl.DataFrame(
{
"id": [9, 4, 2],
"place": ["Mars", "Earth", "Saturn"],
"date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
"sales": [33.4, 2142134.1, 44.7],
"has_people": [False, True, False],
"logged_at": pl.datetime_range(
datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
),
}
).with_row_index("index")
# pl.col() is the usual way to pick specific columns
# pl.col('*') and pl.all() both select every column
print(df.select(pl.all()))
# exclude() drops the named columns
print(df.select(pl.all().exclude('sales', 'place'))) # note: exclude takes names, not a list
# Select several columns and apply one operation to all of them
print(df.select(pl.col('date', 'logged_at').dt.strftime("%Y-%m-%d"))) # .dt accesses datetime attributes
shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ place ┆ date ┆ sales ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘
shape: (3, 5)
┌───────┬─────┬────────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ date ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ date ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ 2022-01-01 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ 2022-01-02 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ 2022-01-03 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────────┴────────────┴─────────────────────┘
shape: (3, 2)
┌────────────┬────────────┐
│ date ┆ logged_at │
│ --- ┆ --- │
│ str ┆ str │
╞════════════╪════════════╡
│ 2022-01-01 ┆ 2022-12-01 │
│ 2022-01-02 ┆ 2022-12-01 │
│ 2022-01-03 ┆ 2022-12-01 │
└────────────┴────────────┘
正则选择
pl.col方法支持使用正则筛选列
df.select(pl.col('^.*(people).*$')) # 筛选包括people的列
has_people |
---|
bool |
false |
true |
false |
根据数据类型选择
df.select(pl.col(pl.Int64, pl.Boolean))
id | has_people |
---|---|
i64 | bool |
9 | false |
4 | true |
2 | false |
使用selectors选择器
使用 polars.selectors
import polars.selectors as cs
print(dir(cs))
# By dtype family
print(df.select(cs.integer()))
# Set algebra: all numeric columns except the frame's first column
print(df.select(cs.numeric() - cs.first()))
print(df.select(cs.by_name('index') | ~cs.numeric())) # non-numeric columns, plus the index column
['Any', 'Binary', 'Boolean', 'Categorical', 'Collection', 'Date', 'Datetime', 'Decimal', 'Duration', 'Expr', 'F', 'FLOAT_DTYPES', 'INTEGER_DTYPES', 'Literal', 'Mapping', 'NUMERIC_DTYPES', 'NoReturn', 'Object', 'SIGNED_INTEGER_DTYPES', 'Sequence', 'String', 'TEMPORAL_DTYPES', 'TYPE_CHECKING', 'Time', 'UNSIGNED_INTEGER_DTYPES', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_combine_as_selector', '_expand_selector_dicts', '_expand_selectors', '_parse_inputs_as_iterable', '_re_string', '_selector_proxy_', 'all', 'alpha', 'alphanumeric', 'annotations', 'binary', 'boolean', 'by_dtype', 'by_index', 'by_name', 'categorical', 'contains', 'date', 'datetime', 'decimal', 'deprecate_nonkeyword_arguments', 'digit', 'duration', 'ends_with', 'exclude', 'expand_selector', 'first', 'float', 'integer', 'is_column', 'is_polars_dtype', 'is_selector', 'last', 'matches', 'numeric', 'object', 'or_', 'overload', 're_escape', 'reduce', 'signed_integer', 'starts_with', 'string', 'temporal', 'time', 'timezone', 'unsigned_integer']
shape: (3, 2)
┌───────┬─────┐
│ index ┆ id │
│ --- ┆ --- │
│ u32 ┆ i64 │
╞═══════╪═════╡
│ 0 ┆ 9 │
│ 1 ┆ 4 │
│ 2 ┆ 2 │
└───────┴─────┘
shape: (3, 2)
┌─────┬───────────┐
│ id ┆ sales │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═══════════╡
│ 9 ┆ 33.4 │
│ 4 ┆ 2142134.1 │
│ 2 ┆ 44.7 │
└─────┴───────────┘
shape: (3, 5)
┌───────┬────────┬────────────┬────────────┬─────────────────────┐
│ index ┆ place ┆ date ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ bool ┆ datetime[μs] │
╞═══════╪════════╪════════════╪════════════╪═════════════════════╡
│ 0 ┆ Mars ┆ 2022-01-01 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ Earth ┆ 2022-01-02 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ Saturn ┆ 2022-01-03 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴────────┴────────────┴────────────┴─────────────────────┘
# Selectors also support substring and regex matching on column names
print(df.select(cs.contains('ind'), cs.matches('.*_.*'))) # columns containing 'ind' or '_'
shape: (3, 3)
┌───────┬────────────┬─────────────────────┐
│ index ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- │
│ u32 ┆ bool ┆ datetime[μs] │
╞═══════╪════════════╪═════════════════════╡
│ 0 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴────────────┴─────────────────────┘
# Convert a selector into a regular expression object
print(df.select(cs.temporal().as_expr().dt.strftime('%Y-%m'))) # NOTE(review): seems to work without .as_expr() too — confirm
# is_selector checks whether an object is still a selector
from polars.selectors import is_selector
out = cs.numeric()
print(is_selector(out))
out = cs.boolean() | cs.numeric()
print(is_selector(out))
out = cs.numeric() + pl.lit(123)
print(is_selector(out))
shape: (3, 2)
┌─────────┬───────────┐
│ date ┆ logged_at │
│ --- ┆ --- │
│ str ┆ str │
╞═════════╪═══════════╡
│ 2022-01 ┆ 2022-12 │
│ 2022-01 ┆ 2022-12 │
│ 2022-01 ┆ 2022-12 │
└─────────┴───────────┘
True
True
False
功能-Functions
# .alias() names an output column
# Using the same source column twice without aliases raises a duplicate-name error
try:
df.select(pl.col('sales') + 5, pl.col('sales' )-5)
except Exception as e:
print(e)
print(df.select((pl.col('sales') + 5).alias('+5'), (pl.col('sales') - 5).alias('-5')))
the name: 'sales' is duplicate
It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.
shape: (3, 2)
┌───────────┬───────────┐
│ +5 ┆ -5 │
│ --- ┆ --- │
│ f64 ┆ f64 │
╞═══════════╪═══════════╡
│ 38.4 ┆ 28.4 │
│ 2142139.1 ┆ 2142129.1 │
│ 49.7 ┆ 39.7 │
└───────────┴───────────┘
# .name.prefix / .name.suffix add fixed text; .name.map applies a custom function
print(df.select(pl.col('*').name.map(lambda x: x[:-1])))
print(df.select(pl.col('place').name.suffix('_suffix'), pl.col('place').name.prefix('prefix_')))
shape: (3, 7)
┌──────┬─────┬────────┬────────────┬───────────┬───────────┬─────────────────────┐
│ inde ┆ i ┆ plac ┆ dat ┆ sale ┆ has_peopl ┆ logged_a │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞══════╪═════╪════════╪════════════╪═══════════╪═══════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└──────┴─────┴────────┴────────────┴───────────┴───────────┴─────────────────────┘
shape: (3, 2)
┌──────────────┬──────────────┐
│ place_suffix ┆ prefix_place │
│ --- ┆ --- │
│ str ┆ str │
╞══════════════╪══════════════╡
│ Mars ┆ Mars │
│ Earth ┆ Earth │
│ Saturn ┆ Saturn │
└──────────────┴──────────────┘
# Unique counts: exact n_unique vs the faster approx_n_unique estimate
df.select(
pl.col('place').n_unique().alias('unique'),
pl.approx_n_unique('place').alias('unique_approx')
)
unique | unique_approx |
---|---|
u32 | u32 |
3 | 3 |
# Conditionals
# when/then/otherwise: where the when-predicate holds, use then; else use otherwise
print(df)
print(df.select(pl.when(pl.col('id')>2).then(pl.col('sales') + 2).otherwise(pl.col('sales')-2))) # id>2 -> sales+2, else sales-2
shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ place ┆ date ┆ sales ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘
shape: (3, 1)
┌───────────┐
│ sales │
│ --- │
│ f64 │
╞═══════════╡
│ 35.4 │
│ 2142136.1 │
│ 42.7 │
└───────────┘
Casting 数据类型变换
polars中可以使用cast方法实现数据类型的转换,同时cast的strict参数决定在转换过程中如何处理不能被转换的数据,strict默认为True,此时遇到不能被转换数据会报错,设置strict=False时,不能被转换的数据将会被设置为null
数值
# Numeric casting demo; strict=False turns unrepresentable values into null
df = pl.DataFrame(
{
"integers": [1, 2, 3, 4, 5],
"big_integers": [1, 10000002, 3, 10000004, 10000005],
"floats": [4.0, 5.0, 6.0, 7.0, 8.0],
"floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
}
)
print(df)
print(df.select(
pl.col('floats_with_decimal').cast(pl.Int32),
pl.col('big_integers').cast(pl.Int8, strict=False)
))
shape: (5, 4)
┌──────────┬──────────────┬────────┬─────────────────────┐
│ integers ┆ big_integers ┆ floats ┆ floats_with_decimal │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 │
╞══════════╪══════════════╪════════╪═════════════════════╡
│ 1 ┆ 1 ┆ 4.0 ┆ 4.532 │
│ 2 ┆ 10000002 ┆ 5.0 ┆ 5.5 │
│ 3 ┆ 3 ┆ 6.0 ┆ 6.5 │
│ 4 ┆ 10000004 ┆ 7.0 ┆ 7.5 │
│ 5 ┆ 10000005 ┆ 8.0 ┆ 8.5 │
└──────────┴──────────────┴────────┴─────────────────────┘
shape: (5, 2)
┌─────────────────────┬──────────────┐
│ floats_with_decimal ┆ big_integers │
│ --- ┆ --- │
│ i32 ┆ i8 │
╞═════════════════════╪══════════════╡
│ 4 ┆ 1 │
│ 5 ┆ null │
│ 6 ┆ 3 │
│ 7 ┆ null │
│ 8 ┆ null │
└─────────────────────┴──────────────┘
字符串
df.select(pl.col("*").cast(pl.String))
integers | big_integers | floats | floats_with_decimal |
---|---|---|---|
str | str | str | str |
"1" | "1" | "4.0" | "4.532" |
"2" | "10000002" | "5.0" | "5.5" |
"3" | "3" | "6.0" | "6.5" |
"4" | "10000004" | "7.0" | "7.5" |
"5" | "10000005" | "8.0" | "8.5" |
布尔值
df.select(pl.col('*').cast(pl.Boolean))
integers | big_integers | floats | floats_with_decimal |
---|---|---|---|
bool | bool | bool | bool |
true | true | true | true |
true | true | true | true |
true | true | true | true |
true | true | true | true |
true | true | true | true |
日期
# Date/datetime casting demo.
from datetime import date, datetime
df = pl.DataFrame(
{
"date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True),
"datetime": pl.datetime_range(
datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True
),
}
)
print(df)
print(df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").dt.to_string('%Y-%m')))
# dt.to_string(), str.to_date() and str.to_datetime() convert between strings and date types
shape: (5, 2)
┌────────────┬─────────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ date ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-01 ┆ 2022-01-01 00:00:00 │
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
│ 2022-01-03 ┆ 2022-01-03 00:00:00 │
│ 2022-01-04 ┆ 2022-01-04 00:00:00 │
│ 2022-01-05 ┆ 2022-01-05 00:00:00 │
└────────────┴─────────────────────┘
shape: (5, 2)
┌───────┬──────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ i64 ┆ str │
╞═══════╪══════════╡
│ 18993 ┆ 2022-01 │
│ 18994 ┆ 2022-01 │
│ 18995 ┆ 2022-01 │
│ 18996 ┆ 2022-01 │
│ 18997 ┆ 2022-01 │
└───────┴──────────┘
字符串处理与分析
# String length
# str.len_bytes vs str.len_chars
df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})
print(df)
print(df.select(pl.col('animal').str.len_bytes().alias('len_bytes'), # faster
pl.col('animal').str.len_chars().alias('len_chars'),
))
shape: (4, 1)
┌─────────────┐
│ animal │
│ --- │
│ str │
╞═════════════╡
│ Crab │
│ cat and dog │
│ rab$bit │
│ null │
└─────────────┘
shape: (4, 2)
┌───────────┬───────────┐
│ len_bytes ┆ len_chars │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞═══════════╪═══════════╡
│ 4 ┆ 4 │
│ 11 ┆ 11 │
│ 7 ┆ 7 │
│ null ┆ null │
└───────────┴───────────┘
# String inspection
# .str.contains, str.contains_any, str.starts_with, str.ends_with
print(df.select(
pl.all(),
pl.all().str.contains('ca|ra').alias('contains_ca|ra'),
pl.all().str.contains('rab$', literal=True).alias('cliteral'),
pl.all().str.ends_with('dog').alias("ends_with"),
))
shape: (4, 4)
┌─────────────┬────────────────┬──────────┬───────────┐
│ animal ┆ contains_ca|ra ┆ cliteral ┆ ends_with │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ bool ┆ bool ┆ bool │
╞═════════════╪════════════════╪══════════╪═══════════╡
│ Crab ┆ true ┆ false ┆ false │
│ cat and dog ┆ true ┆ false ┆ true │
│ rab$bit ┆ true ┆ true ┆ false │
│ null ┆ null ┆ null ┆ null │
└─────────────┴────────────────┴──────────┴───────────┘
# Pattern extraction
# str.extract (first match of a capture group) and str.extract_all (every match)
df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
out = df.select(
pl.col('foo').str.extract(r'(\d{3})',group_index=1).alias('extract'),
pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_all"),
)
print(out)
shape: (2, 2)
┌─────────┬────────────────┐
│ extract ┆ extracted_all │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════╪════════════════╡
│ 123 ┆ ["123", "45"] │
│ 678 ┆ ["678", "910"] │
└─────────┴────────────────┘
# Replacement: str.replace (first match) and str.replace_all (every match)
df.select(
pl.col('foo'),
pl.col('foo').str.replace('123', '321').alias('replace'),
pl.col('foo').str.replace_all('a', '*').alias('replace_all')
)
foo | replace | replace_all |
---|---|---|
str | str | str |
"123 bla 45 asd" | "321 bla 45 asd" | "123 bl* 45 *sd" |
"xyz 678 910t" | "xyz 678 910t" | "xyz 678 910t" |
聚合 aggregation
聚合可以生成多种多样的匹配方式,且polars的聚合context中可以添加多个expressions,不设操作上限,功能强大。
使用多个expression时,利用lazy() API有助于加快速度,但与eager() API没有操作上的差别
# The aggregation tutorial uses the US Congress legislators dataset
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"
# The tutorial's schema_overrides argument is no longer usable here, so the
# columns are converted after reading instead
dataset = pl.read_csv(url).with_columns(
pl.col("birthday").str.to_date(strict=False),
pl.col("first_name", "gender", "type", "state", "party").cast(pl.Categorical)
)
dataset.head(3)
last_name | first_name | middle_name | suffix | nickname | full_name | birthday | gender | type | state | district | senate_class | party | url | address | phone | contact_form | rss_url | twitter_id | youtube | youtube_id | mastodon | bioguide_id | thomas_id | opensecrets_id | lis_id | fec_ids | cspan_id | govtrack_id | votesmart_id | ballotpedia_id | washington_post_id | icpsr_id | wikipedia_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | cat | str | str | str | str | date | cat | cat | cat | i64 | i64 | cat | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | i64 | str | str | str | i64 | str |
"Bassett" | "Richard" | null | null | null | null | 1745-04-02 | "M" | "sen" | "DE" | null | 2 | "Anti-Administration" | null | null | null | null | null | null | null | null | null | null | null | "B000226" | null | null | null | null | null | 401222 | null | null | null | 507 | "Richard Bassett (Delaware poli… |
"Bland" | "Theodorick" | null | null | null | null | 1742-03-21 | "M" | "rep" | "VA" | 9 | null | null | null | null | null | null | null | null | null | null | null | null | null | "B000546" | null | null | null | null | null | 401521 | null | null | null | 786 | "Theodorick Bland (congressman)" |
"Burke" | "Aedanus" | null | null | null | null | 1743-06-16 | "M" | "rep" | "SC" | 2 | null | null | null | null | null | null | null | null | null | null | null | null | null | "B001086" | null | null | null | null | null | 402032 | null | null | null | 1260 | "Aedanus Burke" |
# Basic aggregations
# Counting, collecting a column into lists, and taking the first value
print(dataset.group_by('first_name').agg(
pl.count('party').alias('short_form'),
pl.col('party').count().alias('full_form'),
pl.col('gender'),
pl.first('last_name').alias('shortform'),
).limit(3))
shape: (3, 5)
┌────────────┬────────────┬───────────┬───────────────────┬───────────┐
│ first_name ┆ short_form ┆ full_form ┆ gender ┆ shortform │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ u32 ┆ u32 ┆ list[cat] ┆ str │
╞════════════╪════════════╪═══════════╪═══════════════════╪═══════════╡
│ Wyche ┆ 1 ┆ 1 ┆ ["M"] ┆ Fowler │
│ Philemon ┆ 5 ┆ 5 ┆ ["M", "M", … "M"] ┆ Dickinson │
│ Benedict ┆ 0 ┆ 0 ┆ ["M", "M"] ┆ Arnold │
└────────────┴────────────┴───────────┴───────────────────┴───────────┘
# Conditional aggregation: sum of a boolean expression counts matching rows
q = dataset.lazy().group_by('state').agg(
(pl.col('party') == 'Pro-Administration').sum().alias('pro')
).limit(3)
print(q.collect())
shape: (3, 2)
┌───────┬─────┐
│ state ┆ pro │
│ --- ┆ --- │
│ cat ┆ u32 │
╞═══════╪═════╡
│ GU ┆ 0 │
│ MT ┆ 0 │
│ MS ┆ 0 │
└───────┴─────┘
# Grouping by several keys at once
q = dataset.lazy().group_by('state', 'party').agg(
pl.col('gender').count().alias('count')).filter(pl.col('count')>1).sort('state')
q.collect()
state | party | count |
---|---|---|
cat | cat | u32 |
"DE" | "Jackson" | 2 |
"DE" | "Ind. Republican-Democrat" | 2 |
"DE" | null | 3 |
"DE" | "Whig" | 7 |
"DE" | "Federalist" | 9 |
… | … | … |
"AK" | "Democrat" | 8 |
"PI" | null | 13 |
"VI" | "Democrat" | 2 |
"GU" | "Democrat" | 4 |
"AS" | "Democrat" | 2 |
# Custom expression helpers inside agg; see
# https://docs.pola.rs/user-guide/expressions/aggregation/#filtering for more
from datetime import date
def compute_age():
return date.today().year - pl.col("birthday").dt.year()
def avg_birthday(gender: str) -> pl.Expr:
return (
compute_age()
.filter(pl.col("gender") == gender)
.mean()
.alias(f"avg {gender} birthday")
)
q = dataset.lazy().group_by("state").agg(
avg_birthday("M"),
avg_birthday("F"),
(pl.col("gender") == "M").sum().alias("# male"),
(pl.col("gender") == "F").sum().alias("# female"),
).limit(3)
q.collect()
state | avg M birthday | avg F birthday | # male | # female |
---|---|---|---|---|
cat | f64 | f64 | u32 | u32 |
"IL" | 156.710638 | 100.4 | 478 | 15 |
"MN" | 140.647799 | 97.333333 | 159 | 3 |
"NJ" | 177.005865 | 115.0 | 357 | 5 |
缺失值处理 missing data
# Two ways to inspect missing values:
# null_count() and is_null()
df = pl.DataFrame(
{
"col1": [1, 2, 3],
"col2": [1, None, 5],
},
)
print(df.null_count())
print(df.count()) # regular count() ignores nulls
print(df.select(pl.all().is_null()))
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 0 ┆ 1 │
└──────┴──────┘
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 3 ┆ 2 │
└──────┴──────┘
shape: (3, 2)
┌───────┬───────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ bool ┆ bool │
╞═══════╪═══════╡
│ false ┆ false │
│ false ┆ true │
│ false ┆ false │
└───────┴───────┘
缺失值填充
# 使用pl.lit指定值进行填充
print(df.with_columns(
pl.col('col2').fill_null(pl.lit(9)).alias('fill_col2')
))
# 使用前向/后项等策略填充
print(df.with_columns(
pl.col('col2').fill_null(strategy='forward').alias('forward'),
pl.col('col2').fill_null(strategy='backward').alias('backward'),
))
# 使用表达式值填充,使用插入法填充interpolate
print(df.with_columns(
pl.col('col2').fill_null(pl.col('col1').mean()).alias('expression'),
pl.col('col2').interpolate().alias('interpolate')
))
shape: (3, 3)
┌──────┬──────┬───────────┐
│ col1 ┆ col2 ┆ fill_col2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪═══════════╡
│ 1 ┆ 1 ┆ 1 │
│ 2 ┆ null ┆ 9 │
│ 3 ┆ 5 ┆ 5 │
└──────┴──────┴───────────┘
shape: (3, 4)
┌──────┬──────┬─────────┬──────────┐
│ col1 ┆ col2 ┆ forward ┆ backward │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪═════════╪══════════╡
│ 1 ┆ 1 ┆ 1 ┆ 1 │
│ 2 ┆ null ┆ 1 ┆ 5 │
│ 3 ┆ 5 ┆ 5 ┆ 5 │
└──────┴──────┴─────────┴──────────┘
shape: (3, 4)
┌──────┬──────┬────────────┬─────────────┐
│ col1 ┆ col2 ┆ expression ┆ interpolate │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 │
╞══════╪══════╪════════════╪═════════════╡
│ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │
│ 2 ┆ null ┆ 2.0 ┆ 3.0 │
│ 3 ┆ 5 ┆ 5.0 ┆ 5.0 │
└──────┴──────┴────────────┴─────────────┘
NotANumber or NaN值
NaN和null原来是不相等的
# fill_null does not touch NaN values (null_count does not count them either);
# use fill_nan / is_nan for NaN handling
df = pl.DataFrame(
{
"col1": [1, float('nan'), 3],
"col2": [1, None, 5],
},
)
print(df.null_count())
print(df.with_columns(
pl.col('col1').fill_null(9).alias('fill_null'),
pl.col('col1',).fill_nan(9).alias('fill_nan'),
pl.col('col2',).fill_nan(9).alias('null_fill_nan'),
))
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 0 ┆ 1 │
└──────┴──────┘
shape: (3, 5)
┌──────┬──────┬───────────┬──────────┬───────────────┐
│ col1 ┆ col2 ┆ fill_null ┆ fill_nan ┆ null_fill_nan │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪══════╪═══════════╪══════════╪═══════════════╡
│ 1.0 ┆ 1 ┆ 1.0 ┆ 1.0 ┆ 1 │
│ NaN ┆ null ┆ NaN ┆ 9.0 ┆ null │
│ 3.0 ┆ 5 ┆ 3.0 ┆ 3.0 ┆ 5 │
└──────┴──────┴───────────┴──────────┴───────────────┘