数据读写
polars数据读写与pandas类似
存储: df.write_csv("docs/data/output.csv")
读取: df_csv = pl.read_csv("docs/data/output.csv")
import polars as pl
from datetime import datetime
df = pl.DataFrame(
{
"integer": [1, 2, 3],
"date": [
datetime(2025, 1, 1),
datetime(2025, 1, 2),
datetime(2025, 1, 3),
],
"float": [4.0, 5.0, 6.0],
"string": ["a", "b", "c"],
}
)
df
shape: (3, 4)
integer | date | float | string |
---|---|---|---|
i64 | datetime[μs] | f64 | str |
1 | 2025-01-01 00:00:00 | 4.0 | "a" |
2 | 2025-01-02 00:00:00 | 5.0 | "b" |
3 | 2025-01-03 00:00:00 | 6.0 | "c" |
Expressions
polars中最核心的部分是Expressions。Expressions提供了一种模块化的结构,在该结构内,你可以使用并不断叠加简单的concepts(另外一个核心的概念),最终实现复杂的查询。
在polars中,主要有以下四个基本的模块结构:
- select
- filter
- group_by
- with_columns
- select
为了选择某列,首先需要定义对应的数据集dataframe,其次要明确需要的列
# col('*')表示选择所有列, 与pl.all()相同
print(df.select(pl.col("*")))
print(df.select(pl.all()))
# 选择特定列
print(df.select(pl.col('float','date')))
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 2)
┌───────┬─────────────────────┐
│ float ┆ date │
│ --- ┆ --- │
│ f64 ┆ datetime[μs] │
╞═══════╪═════════════════════╡
│ 4.0 ┆ 2025-01-01 00:00:00 │
│ 5.0 ┆ 2025-01-02 00:00:00 │
│ 6.0 ┆ 2025-01-03 00:00:00 │
└───────┴─────────────────────┘
- filter
# 通过日期筛选
print(df.filter(pl.col('date').is_between(datetime(2025, 1, 1), datetime(2025, 1, 2))))
#通过数值筛选
print(df.filter(pl.col('float').is_between(5, 6) ))
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
└─────────┴─────────────────────┴───────┴────────┘
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
select和filter返回的dataframe均为筛选后的,其一般不会新增列;group_by和with_columns能对原始数据的列进行替换或添加
- with_columns
print(df.with_columns(pl.col('float').sum().alias('new_folat'), (pl.col('string')+'add').alias('string+add')))
# 使用alias创建新列,否则替换原列
print(df.with_columns(pl.col('float').sum(), (pl.col('string')+'add')))
shape: (3, 6)
┌─────────┬─────────────────────┬───────┬────────┬───────────┬────────────┐
│ integer ┆ date ┆ float ┆ string ┆ new_folat ┆ string+add │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╪═══════════╪════════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┴───────────┴────────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┘
- group_by
df2 = pl.DataFrame(
{
"x": range(8),
"y": ["A", "A", "A", "B", "B", "C", "X", "X"],
}
)
df2.head()
shape: (5, 2)
x | y |
---|---|
i64 | str |
0 | "A" |
1 | "A" |
2 | "A" |
3 | "B" |
4 | "B" |
df2.group_by(['y'], maintain_order=True).mean()
shape: (4, 2)
y | x |
---|---|
str | f64 |
"A" | 1.0 |
"B" | 3.5 |
"C" | 5.0 |
"X" | 6.5 |
print(df2.group_by('y', maintain_order=True).agg(pl.col('*').mean().alias('mean'), pl.col('*').count().alias('count'),))
shape: (4, 3)
┌─────┬──────┬───────┐
│ y ┆ mean ┆ count │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ u32 │
╞═════╪══════╪═══════╡
│ A ┆ 1.0 ┆ 3 │
│ B ┆ 3.5 ┆ 2 │
│ C ┆ 5.0 ┆ 1 │
│ X ┆ 6.5 ┆ 2 │
└─────┴──────┴───────┘
以上4种结构不仅可以单独使用,还可以相互配合以实现更强大的查询需求
print( df.with_columns((pl.col('float')*6).alias('float*6')).select(pl.all().exclude('string')))
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬─────────┐
│ integer ┆ date ┆ float ┆ float*6 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ f64 │
╞═════════╪═════════════════════╪═══════╪═════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ 24.0 │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ 30.0 │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ 36.0 │
└─────────┴─────────────────────┴───────┴─────────┘
合并数据
import numpy as np
df3 = pl.DataFrame(
{
"a": range(8),
"b": np.random.rand(8),
"d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
}
)
df4 = pl.DataFrame(
{
"x": range(8),
"y": ["A", "A", "A", "B", "B", "C", "X", "X"],
}
)
print(df3.head(), df4.head())
shape: (5, 3)
┌─────┬──────────┬─────┐
│ a ┆ b ┆ d │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪══════════╪═════╡
│ 0 ┆ 0.411314 ┆ 1.0 │
│ 1 ┆ 0.984068 ┆ 2.0 │
│ 2 ┆ 0.169014 ┆ NaN │
│ 3 ┆ 0.712731 ┆ NaN │
│ 4 ┆ 0.248682 ┆ 0.0 │
└─────┴──────────┴─────┘ shape: (5, 2)
┌─────┬─────┐
│ x ┆ y │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0 ┆ A │
│ 1 ┆ A │
│ 2 ┆ A │
│ 3 ┆ B │
│ 4 ┆ B │
└─────┴─────┘
df5 = df3.join(df4, left_on="a", right_on="x")
df5
shape: (8, 4)
a | b | d | y |
---|---|---|---|
i64 | f64 | f64 | str |
0 | 0.411314 | 1.0 | "A" |
1 | 0.984068 | 2.0 | "A" |
2 | 0.169014 | NaN | "A" |
3 | 0.712731 | NaN | "B" |
4 | 0.248682 | 0.0 | "B" |
5 | 0.921465 | -5.0 | "C" |
6 | 0.516578 | -42.0 | "X" |
7 | 0.145339 | null | "X" |
df3.hstack(df4)
shape: (8, 5)
a | b | d | x | y |
---|---|---|---|---|
i64 | f64 | f64 | i64 | str |
0 | 0.411314 | 1.0 | 0 | "A" |
1 | 0.984068 | 2.0 | 1 | "A" |
2 | 0.169014 | NaN | 2 | "A" |
3 | 0.712731 | NaN | 3 | "B" |
4 | 0.248682 | 0.0 | 4 | "B" |
5 | 0.921465 | -5.0 | 5 | "C" |
6 | 0.516578 | -42.0 | 6 | "X" |
7 | 0.145339 | null | 7 | "X" |