开始简介
数据读写
polars数据读写与pandas类似
存储 df.write_csv("docs/data/output.csv")
读取 df_csv = pl.read_csv("docs/data/output.csv")
import polars as pl
from datetime import datetime

# A small demo frame: one column per common dtype (int, datetime, float, str).
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2025, 1, day) for day in (1, 2, 3)],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)
df
shape: (3, 4)
integer date float string i64 datetime[μs] f64 str 1 2025-01-01 00:00:00 4.0 "a" 2 2025-01-02 00:00:00 5.0 "b" 3 2025-01-03 00:00:00 6.0 "c"
Expressions
polars中最核心的部分Expressions,Expressions提供了一个模块结构,在该结构内,你可以使用并不断叠加简单的concepts(另外一个核心的概念),最终实现复杂的查询。 在polars中,主要由以下四个基本的模块结构(也称contexts):
# Three flavours of column selection inside a select() context.
print(df.select(pl.col("*")))              # wildcard: every column
print(df.select(pl.all()))                 # pl.all() is equivalent to "*"
print(df.select(pl.col('float', 'date')))  # an explicit subset
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
shape: (3, 2)
┌───────┬─────────────────────┐
│ float ┆ date │
│ --- ┆ --- │
│ f64 ┆ datetime[μs] │
╞═══════╪═════════════════════╡
│ 4.0 ┆ 2025-01-01 00:00:00 │
│ 5.0 ┆ 2025-01-02 00:00:00 │
│ 6.0 ┆ 2025-01-03 00:00:00 │
└───────┴─────────────────────┘
# filter() keeps rows matching a boolean expression; is_between is inclusive.
print(df.filter(pl.col('date').is_between(datetime(2025, 1, 1), datetime(2025, 1, 2))))
print(df.filter(pl.col('float').is_between(5, 6)))
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
└─────────┴─────────────────────┴───────┴────────┘
shape: (2, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c │
└─────────┴─────────────────────┴───────┴────────┘
select和filter返回的DataFrame均为筛选后的结果,二者一般不会新增列;group_by和with_columns能对原始数据的列进行替换或添加
# with_columns adds columns when aliased, and overwrites in place otherwise.
# NOTE(review): 'new_folat' looks like a typo for 'new_float'; kept as-is so the
# printed output below still matches.
print(
    df.with_columns(
        pl.col('float').sum().alias('new_folat'),
        (pl.col('string') + 'add').alias('string+add'),
    )
)
print(df.with_columns(pl.col('float').sum(), (pl.col('string') + 'add')))
shape: (3, 6)
┌─────────┬─────────────────────┬───────┬────────┬───────────┬────────────┐
│ integer ┆ date ┆ float ┆ string ┆ new_folat ┆ string+add │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╪═══════════╪════════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ a ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ b ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ c ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┴───────────┴────────────┘
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date ┆ float ┆ string │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ str │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 15.0 ┆ aadd │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 15.0 ┆ badd │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 15.0 ┆ cadd │
└─────────┴─────────────────────┴───────┴────────┘
# Frame used for the group_by examples: a numeric column and a group label.
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
df2.head()
shape: (5, 2)
x y i64 str 0 "A" 1 "A" 2 "A" 3 "B" 4 "B"
# Per-group mean; maintain_order keeps groups in first-appearance order.
df2.group_by(['y'], maintain_order=True).mean()
shape: (4, 2)
y x str f64 "A" 1.0 "B" 3.5 "C" 5.0 "X" 6.5
# agg() accepts several expressions at once, each producing one output column.
print(
    df2.group_by('y', maintain_order=True).agg(
        pl.col('*').mean().alias('mean'),
        pl.col('*').count().alias('count'),
    )
)
shape: (4, 3)
┌─────┬──────┬───────┐
│ y ┆ mean ┆ count │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ u32 │
╞═════╪══════╪═══════╡
│ A ┆ 1.0 ┆ 3 │
│ B ┆ 3.5 ┆ 2 │
│ C ┆ 5.0 ┆ 1 │
│ X ┆ 6.5 ┆ 2 │
└─────┴──────┴───────┘
以上4种结构不仅可以单独使用,还可以相互配合以实现更强大的查询需求
# Contexts compose: derive a column, then drop one in a follow-up select.
print(
    df.with_columns((pl.col('float') * 6).alias('float*6'))
    .select(pl.all().exclude('string'))
)
shape: (3, 4)
┌─────────┬─────────────────────┬───────┬─────────┐
│ integer ┆ date ┆ float ┆ float*6 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 ┆ f64 │
╞═════════╪═════════════════════╪═══════╪═════════╡
│ 1 ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ 24.0 │
│ 2 ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ 30.0 │
│ 3 ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ 36.0 │
└─────────┴─────────────────────┴───────┴─────────┘
合并数据
import numpy as np

# Two frames sharing an integer key (a / x) for the join and hstack examples;
# column "d" mixes NaN and a trailing null on purpose.
df3 = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
    }
)
df4 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
print(df3.head(), df4.head())
shape: (5, 3)
┌─────┬──────────┬─────┐
│ a ┆ b ┆ d │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪══════════╪═════╡
│ 0 ┆ 0.411314 ┆ 1.0 │
│ 1 ┆ 0.984068 ┆ 2.0 │
│ 2 ┆ 0.169014 ┆ NaN │
│ 3 ┆ 0.712731 ┆ NaN │
│ 4 ┆ 0.248682 ┆ 0.0 │
└─────┴──────────┴─────┘ shape: (5, 2)
┌─────┬─────┐
│ x ┆ y │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0 ┆ A │
│ 1 ┆ A │
│ 2 ┆ A │
│ 3 ┆ B │
│ 4 ┆ B │
└─────┴─────┘
# Key-based join: rows matched on df3.a == df4.x.
df5 = df3.join(df4, left_on="a", right_on="x")
df5
shape: (8, 4)
a b d y i64 f64 f64 str 0 0.411314 1.0 "A" 1 0.984068 2.0 "A" 2 0.169014 NaN "A" 3 0.712731 NaN "B" 4 0.248682 0.0 "B" 5 0.921465 -5.0 "C" 6 0.516578 -42.0 "X" 7 0.145339 null "X"
# Positional concatenation of columns (no key matching, unlike join).
df3.hstack(df4)
shape: (8, 5)
a b d x y i64 f64 f64 i64 str 0 0.411314 1.0 0 "A" 1 0.984068 2.0 1 "A" 2 0.169014 NaN 2 "A" 3 0.712731 NaN 3 "B" 4 0.248682 0.0 4 "B" 5 0.921465 -5.0 5 "C" 6 0.516578 -42.0 6 "X" 7 0.145339 null 7 "X"
Concepts - Polars API 的核心概念介绍
数据类型
常见的数据类型都包括在polars中,详见:https://docs.pola.rs/user-guide/concepts/data-types/overview/ 在使用中,可以采用.cast(dtyps)的方法,转换对应的数据类型
数据结构
与pandas一样,主要包括 Series 和 DataFrame
import polars as pl

# A Series is the 1-D building block, analogous to pandas.Series.
pl.Series('num', range(6))
DataFrame
二维数据,与pandas中DataFrame概念相同,可以通过读取、创建、变换等方式得到。使用pl.DataFrame(),与df.to_pandas(),可以和pandas的DataFrame相互变换.
from datetime import datetime

# Build a 5-row frame, then show the round trip to a pandas DataFrame.
df = pl.DataFrame(
    {
        "integer": [1, 2, 3, 4, 5],
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)
print(df)
print(type(df.to_pandas()))
shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
<class 'pandas.core.frame.DataFrame'>
数据浏览
与pandas相同,polars包含.head(), .tail(), .sample(), .describe()等方法初步观看数据,同时支持[:,:]切片查询
# Quick inspection helpers, mirroring the pandas API, plus [row, col] slicing.
print(df.head(2))
print(df.tail(3))
print(df.sample(1))     # one random row
print(df.describe())    # summary statistics per column
print(df[:, 1:3])       # all rows, columns 1..2
shape: (2, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
└─────────┴─────────────────────┴───────┘
shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
shape: (1, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────────┴─────────────────────┴───────┘
shape: (9, 4)
┌────────────┬──────────┬─────────────────────┬──────────┐
│ statistic ┆ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ str ┆ f64 │
╞════════════╪══════════╪═════════════════════╪══════════╡
│ count ┆ 5.0 ┆ 5 ┆ 5.0 │
│ null_count ┆ 0.0 ┆ 0 ┆ 0.0 │
│ mean ┆ 3.0 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ std ┆ 1.581139 ┆ null ┆ 1.581139 │
│ min ┆ 1.0 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 25% ┆ 2.0 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 50% ┆ 3.0 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 75% ┆ 4.0 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ max ┆ 5.0 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└────────────┴──────────┴─────────────────────┴──────────┘
shape: (5, 2)
┌─────────────────────┬───────┐
│ date ┆ float │
│ --- ┆ --- │
│ datetime[μs] ┆ f64 │
╞═════════════════════╪═══════╡
│ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2022-01-02 00:00:00 ┆ 5.0 │
│ 2022-01-03 00:00:00 ┆ 6.0 │
│ 2022-01-04 00:00:00 ┆ 7.0 │
│ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────────────────┴───────┘
Contexts 查询背景
polars的两个核心组件是背景(Contexts)和表达式(Expressions) polars中主要的三种查询背景分别为:
选择:df.select(), df.with_columns() 筛选: df.filter() 分类/聚合: df.group_by().agg()
# select() keeps only the expressions listed; unreferenced columns are dropped.
print(df)
print(
    df.select(
        pl.col('date').dt.day(),                     # day-of-month
        pl.col('float') / pl.col('float').sum(),     # normalised share
        (pl.col('float') * 10).alias('float*10'),
    )
)
shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date ┆ float │
│ --- ┆ --- ┆ --- │
│ i64 ┆ datetime[μs] ┆ f64 │
╞═════════╪═════════════════════╪═══════╡
│ 1 ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2 ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3 ┆ 2022-01-03 00:00:00 ┆ 6.0 │
│ 4 ┆ 2022-01-04 00:00:00 ┆ 7.0 │
│ 5 ┆ 2022-01-05 00:00:00 ┆ 8.0 │
└─────────┴─────────────────────┴───────┘
shape: (5, 3)
┌──────┬──────────┬──────────┐
│ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- │
│ i8 ┆ f64 ┆ f64 │
╞══════╪══════════╪══════════╡
│ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 0.266667 ┆ 80.0 │
└──────┴──────────┴──────────┘
# Same expressions under both contexts: select() returns only the expression
# results, while with_columns() keeps the untouched original columns too.
exprs = (
    pl.col('date').dt.day(),
    pl.col('float') / pl.col('float').sum(),
    (pl.col('float') * 10).alias('float*10'),
)
print(df.select(*exprs))
print(df.with_columns(*exprs))
shape: (5, 3)
┌──────┬──────────┬──────────┐
│ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- │
│ i8 ┆ f64 ┆ f64 │
╞══════╪══════════╪══════════╡
│ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 0.266667 ┆ 80.0 │
└──────┴──────────┴──────────┘
shape: (5, 4)
┌─────────┬──────┬──────────┬──────────┐
│ integer ┆ date ┆ float ┆ float*10 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i8 ┆ f64 ┆ f64 │
╞═════════╪══════╪══════════╪══════════╡
│ 1 ┆ 1 ┆ 0.133333 ┆ 40.0 │
│ 2 ┆ 2 ┆ 0.166667 ┆ 50.0 │
│ 3 ┆ 3 ┆ 0.2 ┆ 60.0 │
│ 4 ┆ 4 ┆ 0.233333 ┆ 70.0 │
│ 5 ┆ 5 ┆ 0.266667 ┆ 80.0 │
└─────────┴──────┴──────────┴──────────┘
# Combine predicates with & (both must hold for a row to survive).
df.filter((pl.col('integer') > 2) & (pl.col('float') < 7))
shape: (1, 3)
integer date float i64 datetime[μs] f64 3 2022-01-03 00:00:00 6.0
# Frame with nulls in two columns, used to show null-aware aggregation.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": [2, 1, 2, 4, 5],
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)
# Per group: raw values, mean/count (nulls excluded), and a null tally.
print(
    df.group_by('groups').agg(
        pl.col('nrs'),
        pl.col('nrs').mean().alias('nrs_mean'),
        pl.col('nrs').count().alias('nrs_count'),
        pl.col('nrs').is_null().sum().alias('nrs_na_sum'),
        pl.col('names'),
    )
)
shape: (5, 4)
┌──────┬───────┬────────┬────────┐
│ nrs ┆ names ┆ random ┆ groups │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ str │
╞══════╪═══════╪════════╪════════╡
│ 1 ┆ foo ┆ 2 ┆ A │
│ 2 ┆ ham ┆ 1 ┆ A │
│ 3 ┆ spam ┆ 2 ┆ B │
│ null ┆ egg ┆ 4 ┆ C │
│ 5 ┆ null ┆ 5 ┆ B │
└──────┴───────┴────────┴────────┘
shape: (3, 6)
┌────────┬───────────┬──────────┬───────────┬────────────┬────────────────┐
│ groups ┆ nrs ┆ nrs_mean ┆ nrs_count ┆ nrs_na_sum ┆ names │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ list[i64] ┆ f64 ┆ u32 ┆ u32 ┆ list[str] │
╞════════╪═══════════╪══════════╪═══════════╪════════════╪════════════════╡
│ C ┆ [null] ┆ null ┆ 0 ┆ 1 ┆ ["egg"] │
│ A ┆ [1, 2] ┆ 1.5 ┆ 2 ┆ 0 ┆ ["foo", "ham"] │
│ B ┆ [3, 5] ┆ 4.0 ┆ 2 ┆ 0 ┆ ["spam", null] │
└────────┴───────────┴──────────┴───────────┴────────────┴────────────────┘
Expressions 查询表达
expression是polars的核心内容,拥有强大的选择查询能力,以下为一个expression示例
pl.col('nrs').sum()
该expression示意:1.选择'nrs'列;
2.对列进行求和
# Expressions can be chained and conditioned independently per column.
df.select(
    pl.col('random').sort().head(3),
    pl.col('nrs').filter(pl.col('groups') == 'A').sum(),
)
Lazy API 与 Streaming API
文档中特别强调这两个接口,在大规模数据加载与处理过程中,使用它们能很好地加快速度:lazy接口能避免重复执行查询并优化查询路径,从而加快速度
# Eager version: each step materialises an intermediate frame.
df = pl.read_csv("docs/data/iris.csv")
df_small = df.filter(pl.col("sepal_length") > 5)
df_agg = df_small.group_by("species").agg(pl.col("sepal_width").mean())

# Lazy version: scan_csv builds a query plan that polars optimises as a whole;
# streaming=True lets collect() process the data in chunks.
q = (
    pl.scan_csv("docs/data/iris.csv")
    .filter(pl.col("sepal_length") > 5)
    .group_by("species")
    .agg(pl.col("sepal_width").mean())
)
df = q.collect(streaming=True)
Expressions – polars核心,查询的表达
基本的运算
在Python和Rust中,polars支持如"+ - * / < >"等运算符的直接使用
import polars as pl
import numpy as np

# Frame for the operator examples; "random" differs on every run.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)
shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs ┆ names ┆ random ┆ groups │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ f64 ┆ str │
╞══════╪═══════╪══════════╪════════╡
│ 1 ┆ foo ┆ 0.829716 ┆ A │
│ 2 ┆ ham ┆ 0.039304 ┆ A │
│ 3 ┆ spam ┆ 0.865325 ┆ B │
│ null ┆ egg ┆ 0.954994 ┆ C │
│ 5 ┆ null ┆ 0.324218 ┆ B │
└──────┴───────┴──────────┴────────┘
# Arithmetic operators propagate null; comparison/logical operators yield bools.
print(
    df.select(
        (pl.col('nrs') + 5).alias('nrs+5'),
        (pl.col('nrs') * pl.col('random')).alias('nrs*random'),
    )
)
print(
    df.select(
        (pl.col('nrs') > 1).alias('nrs>1'),
        (pl.col('groups') != 'A').alias('groups!=A'),
        ((pl.col('nrs') > 1) & (pl.col('random') < 0.5)).alias('and_expression'),
        ((pl.col('nrs') > 1) | (pl.col('random') < 0.5)).alias('or_expression'),
    )
)
shape: (5, 2)
┌───────┬────────────┐
│ nrs+5 ┆ nrs*random │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═══════╪════════════╡
│ 6 ┆ 0.829716 │
│ 7 ┆ 0.078608 │
│ 8 ┆ 2.595976 │
│ null ┆ null │
│ 10 ┆ 1.621089 │
└───────┴────────────┘
shape: (5, 4)
┌───────┬───────────┬────────────────┬───────────────┐
│ nrs>1 ┆ groups!=A ┆ and_expression ┆ or_expression │
│ --- ┆ --- ┆ --- ┆ --- │
│ bool ┆ bool ┆ bool ┆ bool │
╞═══════╪═══════════╪════════════════╪═══════════════╡
│ false ┆ false ┆ false ┆ false │
│ true ┆ false ┆ true ┆ true │
│ true ┆ true ┆ false ┆ true │
│ null ┆ true ┆ false ┆ null │
│ true ┆ true ┆ true ┆ true │
└───────┴───────────┴────────────────┴───────────────┘
选择列
常规选择
from datetime import date, datetime

# Frame with mixed dtypes plus a generated row index, for the selection demos.
df = pl.DataFrame(
    {
        "id": [9, 4, 2],
        "place": ["Mars", "Earth", "Saturn"],
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
        "sales": [33.4, 2142134.1, 44.7],
        "has_people": [False, True, False],
        "logged_at": pl.datetime_range(
            datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
        ),
    }
).with_row_index("index")

print(df.select(pl.all()))                          # everything
print(df.select(pl.all().exclude('sales', 'place')))  # everything but two
print(df.select(pl.col('date', 'logged_at').dt.strftime("%Y-%m-%d")))
shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ place ┆ date ┆ sales ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘
shape: (3, 5)
┌───────┬─────┬────────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ date ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ date ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ 2022-01-01 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ 2022-01-02 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ 2022-01-03 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────────┴────────────┴─────────────────────┘
shape: (3, 2)
┌────────────┬────────────┐
│ date ┆ logged_at │
│ --- ┆ --- │
│ str ┆ str │
╞════════════╪════════════╡
│ 2022-01-01 ┆ 2022-12-01 │
│ 2022-01-02 ┆ 2022-12-01 │
│ 2022-01-03 ┆ 2022-12-01 │
└────────────┴────────────┘
正则选择
pl.col方法支持使用正则筛选列
# A pl.col pattern wrapped in ^...$ is treated as a regex over column names.
df.select(pl.col('^.*(people).*$'))
shape: (3, 1)
has_people bool false true false
根据数据类型选择
# Passing dtypes to pl.col selects every column of those types.
df.select(pl.col(pl.Int64, pl.Boolean))
shape: (3, 2)
id has_people i64 bool 9 false 4 true 2 false
使用selectors选择器
使用 polars.selectors
import polars.selectors as cs

# Selectors compose with set-like operators: - (difference), | (union), ~ (negation).
print(dir(cs))
print(df.select(cs.integer()))                    # all integer columns
print(df.select(cs.numeric() - cs.first()))       # numeric, minus the first column
print(df.select(cs.by_name('index') | ~cs.numeric()))  # 'index' plus non-numerics
['Any', 'Binary', 'Boolean', 'Categorical', 'Collection', 'Date', 'Datetime', 'Decimal', 'Duration', 'Expr', 'F', 'FLOAT_DTYPES', 'INTEGER_DTYPES', 'Literal', 'Mapping', 'NUMERIC_DTYPES', 'NoReturn', 'Object', 'SIGNED_INTEGER_DTYPES', 'Sequence', 'String', 'TEMPORAL_DTYPES', 'TYPE_CHECKING', 'Time', 'UNSIGNED_INTEGER_DTYPES', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_combine_as_selector', '_expand_selector_dicts', '_expand_selectors', '_parse_inputs_as_iterable', '_re_string', '_selector_proxy_', 'all', 'alpha', 'alphanumeric', 'annotations', 'binary', 'boolean', 'by_dtype', 'by_index', 'by_name', 'categorical', 'contains', 'date', 'datetime', 'decimal', 'deprecate_nonkeyword_arguments', 'digit', 'duration', 'ends_with', 'exclude', 'expand_selector', 'first', 'float', 'integer', 'is_column', 'is_polars_dtype', 'is_selector', 'last', 'matches', 'numeric', 'object', 'or_', 'overload', 're_escape', 'reduce', 'signed_integer', 'starts_with', 'string', 'temporal', 'time', 'timezone', 'unsigned_integer']
shape: (3, 2)
┌───────┬─────┐
│ index ┆ id │
│ --- ┆ --- │
│ u32 ┆ i64 │
╞═══════╪═════╡
│ 0 ┆ 9 │
│ 1 ┆ 4 │
│ 2 ┆ 2 │
└───────┴─────┘
shape: (3, 2)
┌─────┬───────────┐
│ id ┆ sales │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═══════════╡
│ 9 ┆ 33.4 │
│ 4 ┆ 2142134.1 │
│ 2 ┆ 44.7 │
└─────┴───────────┘
shape: (3, 5)
┌───────┬────────┬────────────┬────────────┬─────────────────────┐
│ index ┆ place ┆ date ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ bool ┆ datetime[μs] │
╞═══════╪════════╪════════════╪════════════╪═════════════════════╡
│ 0 ┆ Mars ┆ 2022-01-01 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ Earth ┆ 2022-01-02 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ Saturn ┆ 2022-01-03 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴────────┴────────────┴────────────┴─────────────────────┘
# Substring and regex based selectors.
print(df.select(cs.contains('ind'), cs.matches('.*_.*')))
shape: (3, 3)
┌───────┬────────────┬─────────────────────┐
│ index ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- │
│ u32 ┆ bool ┆ datetime[μs] │
╞═══════╪════════════╪═════════════════════╡
│ 0 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴────────────┴─────────────────────┘
# as_expr() turns a selector into a normal expression so .dt etc. apply.
print(df.select(cs.temporal().as_expr().dt.strftime('%Y-%m')))

from polars.selectors import is_selector

# Set operations on selectors stay selectors; mixing in a plain expression
# (via +) produces an ordinary Expr instead.
out = cs.numeric()
print(is_selector(out))
out = cs.boolean() | cs.numeric()
print(is_selector(out))
out = cs.numeric() + pl.lit(123)
print(is_selector(out))
shape: (3, 2)
┌─────────┬───────────┐
│ date ┆ logged_at │
│ --- ┆ --- │
│ str ┆ str │
╞═════════╪═══════════╡
│ 2022-01 ┆ 2022-12 │
│ 2022-01 ┆ 2022-12 │
│ 2022-01 ┆ 2022-12 │
└─────────┴───────────┘
True
True
False
功能-Functions
# Two expressions on the same column keep the same default output name,
# which select() rejects — alias() resolves the clash.
try:
    df.select(pl.col('sales') + 5, pl.col('sales') - 5)
except Exception as e:
    print(e)
print(df.select((pl.col('sales') + 5).alias('+5'), (pl.col('sales') - 5).alias('-5')))
the name: 'sales' is duplicate
It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.
shape: (3, 2)
┌───────────┬───────────┐
│ +5 ┆ -5 │
│ --- ┆ --- │
│ f64 ┆ f64 │
╞═══════════╪═══════════╡
│ 38.4 ┆ 28.4 │
│ 2142139.1 ┆ 2142129.1 │
│ 49.7 ┆ 39.7 │
└───────────┴───────────┘
# Renaming helpers: map applies a function to each name; suffix/prefix decorate it.
print(df.select(pl.col('*').name.map(lambda x: x[:-1])))
print(df.select(pl.col('place').name.suffix('_suffix'), pl.col('place').name.prefix('prefix_')))
shape: (3, 7)
┌──────┬─────┬────────┬────────────┬───────────┬───────────┬─────────────────────┐
│ inde ┆ i ┆ plac ┆ dat ┆ sale ┆ has_peopl ┆ logged_a │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞══════╪═════╪════════╪════════════╪═══════════╪═══════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└──────┴─────┴────────┴────────────┴───────────┴───────────┴─────────────────────┘
shape: (3, 2)
┌──────────────┬──────────────┐
│ place_suffix ┆ prefix_place │
│ --- ┆ --- │
│ str ┆ str │
╞══════════════╪══════════════╡
│ Mars ┆ Mars │
│ Earth ┆ Earth │
│ Saturn ┆ Saturn │
└──────────────┴──────────────┘
# Exact vs approximate (HyperLogLog-style) distinct counts.
df.select(
    pl.col('place').n_unique().alias('unique'),
    pl.approx_n_unique('place').alias('unique_approx'),
)
shape: (1, 2)
unique unique_approx u32 u32 3 3
# Conditional column: when/then/otherwise is polars' vectorised if/else.
print(df)
print(
    df.select(
        pl.when(pl.col('id') > 2)
        .then(pl.col('sales') + 2)
        .otherwise(pl.col('sales') - 2)
    )
)
shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id ┆ place ┆ date ┆ sales ┆ has_people ┆ logged_at │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ date ┆ f64 ┆ bool ┆ datetime[μs] │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0 ┆ 9 ┆ Mars ┆ 2022-01-01 ┆ 33.4 ┆ false ┆ 2022-12-01 00:00:00 │
│ 1 ┆ 4 ┆ Earth ┆ 2022-01-02 ┆ 2142134.1 ┆ true ┆ 2022-12-01 00:00:01 │
│ 2 ┆ 2 ┆ Saturn ┆ 2022-01-03 ┆ 44.7 ┆ false ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘
shape: (3, 1)
┌───────────┐
│ sales │
│ --- │
│ f64 │
╞═══════════╡
│ 35.4 │
│ 2142136.1 │
│ 42.7 │
└───────────┘
Casting 数据类型变换
polars中可以使用cast方法实现数据类型的转换,同时cast的strict参数决定在转换过程中如何处理不能被转换的数据,strict默认为True,此时遇到不能被转换数据会报错,设置strict=False时,不能被转换的数据将会被设置为null
数值
# Casting demo: float -> int truncates; an overflowing int cast with
# strict=False yields null instead of raising.
df = pl.DataFrame(
    {
        "integers": [1, 2, 3, 4, 5],
        "big_integers": [1, 10000002, 3, 10000004, 10000005],
        "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
        "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
    }
)
print(df)
print(
    df.select(
        pl.col('floats_with_decimal').cast(pl.Int32),
        pl.col('big_integers').cast(pl.Int8, strict=False),
    )
)
shape: (5, 4)
┌──────────┬──────────────┬────────┬─────────────────────┐
│ integers ┆ big_integers ┆ floats ┆ floats_with_decimal │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 │
╞══════════╪══════════════╪════════╪═════════════════════╡
│ 1 ┆ 1 ┆ 4.0 ┆ 4.532 │
│ 2 ┆ 10000002 ┆ 5.0 ┆ 5.5 │
│ 3 ┆ 3 ┆ 6.0 ┆ 6.5 │
│ 4 ┆ 10000004 ┆ 7.0 ┆ 7.5 │
│ 5 ┆ 10000005 ┆ 8.0 ┆ 8.5 │
└──────────┴──────────────┴────────┴─────────────────────┘
shape: (5, 2)
┌─────────────────────┬──────────────┐
│ floats_with_decimal ┆ big_integers │
│ --- ┆ --- │
│ i32 ┆ i8 │
╞═════════════════════╪══════════════╡
│ 4 ┆ 1 │
│ 5 ┆ null │
│ 6 ┆ 3 │
│ 7 ┆ null │
│ 8 ┆ null │
└─────────────────────┴──────────────┘
字符串
# Every column rendered as its string representation.
df.select(pl.col("*").cast(pl.String))
shape: (5, 4)
integers big_integers floats floats_with_decimal str str str str "1" "1" "4.0" "4.532" "2" "10000002" "5.0" "5.5" "3" "3" "6.0" "6.5" "4" "10000004" "7.0" "7.5" "5" "10000005" "8.0" "8.5"
布尔值
# Numeric -> Boolean: non-zero values become true.
df.select(pl.col('*').cast(pl.Boolean))
shape: (5, 4)
integers big_integers floats floats_with_decimal bool bool bool bool true true true true true true true true true true true true true true true true true true true true
日期
from datetime import date, datetime

# Dates cast to Int64 expose days-since-epoch; dt.to_string formats datetimes.
df = pl.DataFrame(
    {
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True),
        "datetime": pl.datetime_range(
            datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True
        ),
    }
)
print(df)
print(df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").dt.to_string('%Y-%m')))
shape: (5, 2)
┌────────────┬─────────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ date ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-01 ┆ 2022-01-01 00:00:00 │
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
│ 2022-01-03 ┆ 2022-01-03 00:00:00 │
│ 2022-01-04 ┆ 2022-01-04 00:00:00 │
│ 2022-01-05 ┆ 2022-01-05 00:00:00 │
└────────────┴─────────────────────┘
shape: (5, 2)
┌───────┬──────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ i64 ┆ str │
╞═══════╪══════════╡
│ 18993 ┆ 2022-01 │
│ 18994 ┆ 2022-01 │
│ 18995 ┆ 2022-01 │
│ 18996 ┆ 2022-01 │
│ 18997 ┆ 2022-01 │
└───────┴──────────┘
字符串处理与分析
# String lengths in bytes vs characters (they differ for non-ASCII text);
# null propagates through both.
df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})
print(df)
print(
    df.select(
        pl.col('animal').str.len_bytes().alias('len_bytes'),
        pl.col('animal').str.len_chars().alias('len_chars'),
    )
)
shape: (4, 1)
┌─────────────┐
│ animal │
│ --- │
│ str │
╞═════════════╡
│ Crab │
│ cat and dog │
│ rab$bit │
│ null │
└─────────────┘
shape: (4, 2)
┌───────────┬───────────┐
│ len_bytes ┆ len_chars │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞═══════════╪═══════════╡
│ 4 ┆ 4 │
│ 11 ┆ 11 │
│ 7 ┆ 7 │
│ null ┆ null │
└───────────┴───────────┘
# contains() treats the pattern as regex unless literal=True; here literal=True
# makes '$' match a real dollar sign instead of end-of-string.
print(
    df.select(
        pl.all(),
        pl.all().str.contains('ca|ra').alias('contains_ca|ra'),
        pl.all().str.contains('rab$', literal=True).alias('cliteral'),
        pl.all().str.ends_with('dog').alias("ends_with"),
    )
)
shape: (4, 4)
┌─────────────┬────────────────┬──────────┬───────────┐
│ animal ┆ contains_ca|ra ┆ cliteral ┆ ends_with │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ bool ┆ bool ┆ bool │
╞═════════════╪════════════════╪══════════╪═══════════╡
│ Crab ┆ true ┆ false ┆ false │
│ cat and dog ┆ true ┆ false ┆ true │
│ rab$bit ┆ true ┆ true ┆ false │
│ null ┆ null ┆ null ┆ null │
└─────────────┴────────────────┴──────────┴───────────┘
# extract() returns the first match of a capture group; extract_all() returns
# every match as a list column.
df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
out = df.select(
    pl.col('foo').str.extract(r'(\d{3})', group_index=1).alias('extract'),
    pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_all"),
)
print(out)
shape: (2, 2)
┌─────────┬────────────────┐
│ extract ┆ extracted_all │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════╪════════════════╡
│ 123 ┆ ["123", "45"] │
│ 678 ┆ ["678", "910"] │
└─────────┴────────────────┘
# replace() substitutes only the first occurrence; replace_all() every one.
df.select(
    pl.col('foo'),
    pl.col('foo').str.replace('123', '321').alias('replace'),
    pl.col('foo').str.replace_all('a', '*').alias('replace_all'),
)
shape: (2, 3)
foo replace replace_all str str str "123 bla 45 asd" "321 bla 45 asd" "123 bl* 45 *sd" "xyz 678 910t" "xyz 678 910t" "xyz 678 910t"
聚合 aggregation
聚合可以生成多种多样的匹配方式,且polars的聚合context中可以添加多个expressions,不设操作上限,功能强大。 使用多个expression时,利用lazy() API有助于加快速度,但与eager() API没有操作上的差别
# Historical US legislators dataset; parse birthdays leniently and store the
# low-cardinality text columns as Categorical to save memory.
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"
dataset = pl.read_csv(url).with_columns(
    pl.col("birthday").str.to_date(strict=False),
    pl.col("first_name", "gender", "type", "state", "party").cast(pl.Categorical),
)
dataset.head(3)
shape: (3, 36)
last_name first_name middle_name suffix nickname full_name birthday gender type state district senate_class party url address phone contact_form rss_url twitter twitter_id facebook youtube youtube_id mastodon bioguide_id thomas_id opensecrets_id lis_id fec_ids cspan_id govtrack_id votesmart_id ballotpedia_id washington_post_id icpsr_id wikipedia_id str cat str str str str date cat cat cat i64 i64 cat str str str str str str str str str str str str str str str str str i64 str str str i64 str "Bassett" "Richard" null null null null 1745-04-02 "M" "sen" "DE" null 2 "Anti-Administration" null null null null null null null null null null null "B000226" null null null null null 401222 null null null 507 "Richard Bassett (Delaware poli… "Bland" "Theodorick" null null null null 1742-03-21 "M" "rep" "VA" 9 null null null null null null null null null null null null null "B000546" null null null null null 401521 null null null 786 "Theodorick Bland (congressman)" "Burke" "Aedanus" null null null null 1743-06-16 "M" "rep" "SC" 2 null null null null null null null null null null null null null "B001086" null null null null null 402032 null null null 1260 "Aedanus Burke"
# pl.count('col') / pl.first('col') are shorthands for pl.col('col').count() /
# .first(); a bare pl.col(...) in agg() collects the group's values as a list.
print(
    dataset.group_by('first_name').agg(
        pl.count('party').alias('short_form'),
        pl.col('party').count().alias('full_form'),
        pl.col('gender'),
        pl.first('last_name').alias('shortform'),
    ).limit(3)
)
shape: (3, 5)
┌────────────┬────────────┬───────────┬───────────────────┬───────────┐
│ first_name ┆ short_form ┆ full_form ┆ gender ┆ shortform │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ u32 ┆ u32 ┆ list[cat] ┆ str │
╞════════════╪════════════╪═══════════╪═══════════════════╪═══════════╡
│ Wyche ┆ 1 ┆ 1 ┆ ["M"] ┆ Fowler │
│ Philemon ┆ 5 ┆ 5 ┆ ["M", "M", … "M"] ┆ Dickinson │
│ Benedict ┆ 0 ┆ 0 ┆ ["M", "M"] ┆ Arnold │
└────────────┴────────────┴───────────┴───────────────────┴───────────┘
# Summing a boolean expression counts the rows where it is true.
q = (
    dataset.lazy()
    .group_by('state')
    .agg((pl.col('party') == 'Pro-Administration').sum().alias('pro'))
    .limit(3)
)
print(q.collect())
shape: (3, 2)
┌───────┬─────┐
│ state ┆ pro │
│ --- ┆ --- │
│ cat ┆ u32 │
╞═══════╪═════╡
│ GU ┆ 0 │
│ MT ┆ 0 │
│ MS ┆ 0 │
└───────┴─────┘
# Grouping on two keys, then filtering and sorting the aggregated result.
q = (
    dataset.lazy()
    .group_by('state', 'party')
    .agg(pl.col('gender').count().alias('count'))
    .filter(pl.col('count') > 1)
    .sort('state')
)
q.collect()
shape: (321, 3)
state party count cat cat u32 "DE" "Jackson" 2 "DE" "Ind. Republican-Democrat" 2 "DE" null 3 "DE" "Whig" 7 "DE" "Federalist" 9 … … … "AK" "Democrat" 8 "PI" null 13 "VI" "Democrat" 2 "GU" "Democrat" 4 "AS" "Democrat" 2
from datetime import date

def compute_age():
    """Expression for age in whole years, based on the birthday column."""
    return date.today().year - pl.col("birthday").dt.year()

def avg_birthday(gender: str) -> pl.Expr:
    """Mean age restricted to rows of the given gender."""
    return (
        compute_age()
        .filter(pl.col("gender") == gender)
        .mean()
        .alias(f"avg {gender} birthday")
    )

# Per state: average age and head-count for each gender.
q = (
    dataset.lazy()
    .group_by("state")
    .agg(
        avg_birthday("M"),
        avg_birthday("F"),
        (pl.col("gender") == "M").sum().alias("# male"),
        (pl.col("gender") == "F").sum().alias("# female"),
    )
    .limit(3)
)
q.collect()
shape: (3, 5)
state avg M birthday avg F birthday # male # female cat f64 f64 u32 u32 "IL" 156.710638 100.4 478 15 "MN" 140.647799 97.333333 159 3 "NJ" 177.005865 115.0 357 5
缺失值处理 missing data
# Demonstrate null inspection: null_count() gives nulls per column,
# count() gives non-null values per column, is_null() a boolean mask.
df = pl.DataFrame(
    {
        "col1": [1, 2, 3],
        "col2": [1, None, 5],
    },
)
print(df.null_count())
print(df.count())
print(df.select(pl.all().is_null()))
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 0 ┆ 1 │
└──────┴──────┘
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 3 ┆ 2 │
└──────┴──────┘
shape: (3, 2)
┌───────┬───────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ bool ┆ bool │
╞═══════╪═══════╡
│ false ┆ false │
│ false ┆ true │
│ false ┆ false │
└───────┴───────┘
缺失值填充
# Four ways to fill nulls: a literal, forward/backward propagation,
# an expression (here col1's mean), and linear interpolation.
print(df.with_columns(
    pl.col('col2').fill_null(pl.lit(9)).alias('fill_col2')
))
print(df.with_columns(
    pl.col('col2').fill_null(strategy='forward').alias('forward'),
    pl.col('col2').fill_null(strategy='backward').alias('backward'),
))
print(df.with_columns(
    pl.col('col2').fill_null(pl.col('col1').mean()).alias('expression'),
    pl.col('col2').interpolate().alias('interpolate'),
))
shape: (3, 3)
┌──────┬──────┬───────────┐
│ col1 ┆ col2 ┆ fill_col2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪═══════════╡
│ 1 ┆ 1 ┆ 1 │
│ 2 ┆ null ┆ 9 │
│ 3 ┆ 5 ┆ 5 │
└──────┴──────┴───────────┘
shape: (3, 4)
┌──────┬──────┬─────────┬──────────┐
│ col1 ┆ col2 ┆ forward ┆ backward │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪═════════╪══════════╡
│ 1 ┆ 1 ┆ 1 ┆ 1 │
│ 2 ┆ null ┆ 1 ┆ 5 │
│ 3 ┆ 5 ┆ 5 ┆ 5 │
└──────┴──────┴─────────┴──────────┘
shape: (3, 4)
┌──────┬──────┬────────────┬─────────────┐
│ col1 ┆ col2 ┆ expression ┆ interpolate │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 │
╞══════╪══════╪════════════╪═════════════╡
│ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │
│ 2 ┆ null ┆ 2.0 ┆ 3.0 │
│ 3 ┆ 5 ┆ 5.0 ┆ 5.0 │
└──────┴──────┴────────────┴─────────────┘
Not a Number (NaN) 值
NaN和null原来是不相等的
# NaN is a float value, not a null: fill_null leaves NaN untouched,
# fill_nan replaces it, and fill_nan on an integer column is a no-op.
df = pl.DataFrame(
    {
        "col1": [1, float('nan'), 3],
        "col2": [1, None, 5],
    },
)
print(df.null_count())
print(df.with_columns(
    pl.col('col1').fill_null(9).alias('fill_null'),
    pl.col('col1').fill_nan(9).alias('fill_nan'),
    pl.col('col2').fill_nan(9).alias('null_fill_nan'),
))
shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ u32 ┆ u32 │
╞══════╪══════╡
│ 0 ┆ 1 │
└──────┴──────┘
shape: (3, 5)
┌──────┬──────┬───────────┬──────────┬───────────────┐
│ col1 ┆ col2 ┆ fill_null ┆ fill_nan ┆ null_fill_nan │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪══════╪═══════════╪══════════╪═══════════════╡
│ 1.0 ┆ 1 ┆ 1.0 ┆ 1.0 ┆ 1 │
│ NaN ┆ null ┆ NaN ┆ 9.0 ┆ null │
│ 3.0 ┆ 5 ┆ 3.0 ┆ 3.0 ┆ 5 │
└──────┴──────┴───────────┴──────────┴───────────────┘
window function
对expression指定窗口(类别,大概这个意思),与聚合group_by作用类似,但能在expression背景下使用,个人觉得十分便捷
# Load the pokemon sample dataset used by the window-function examples.
POKEMON_CSV = (
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03"
    "/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
)
df = pl.read_csv(POKEMON_CSV)
df.head()
shape: (5, 13)
# Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary i64 str str str i64 i64 i64 i64 i64 i64 i64 i64 bool 1 "Bulbasaur" "Grass" "Poison" 318 45 49 49 65 65 45 1 false 2 "Ivysaur" "Grass" "Poison" 405 60 62 63 80 80 60 1 false 3 "Venusaur" "Grass" "Poison" 525 80 82 83 100 100 80 1 false 3 "VenusaurMega Venusaur" "Grass" "Poison" 625 80 100 123 122 120 80 1 false 4 "Charmander" "Fire" null 309 39 52 43 60 50 65 1 false
# over(...) computes a group-wise mean broadcast back to every row,
# contrasted with the plain column-wide mean on the same columns.
print(df.tail(6).select(
    "Type 1",
    "Type 2",
    'Attack',
    pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"),
    pl.col("Attack").mean().alias("avg_attack"),
    'Defense',
    pl.col("Defense").mean().over(["Type 1", "Type 2"]).alias("avg_defense_by_type_combination"),
    pl.col("Defense").mean().alias("avg_defense"),
))
shape: (6, 8)
┌──────────┬────────┬────────┬────────────────┬────────────┬─────────┬───────────────┬─────────────┐
│ Type 1 ┆ Type 2 ┆ Attack ┆ avg_attack_by_ ┆ avg_attack ┆ Defense ┆ avg_defense_b ┆ avg_defense │
│ --- ┆ --- ┆ --- ┆ type ┆ --- ┆ --- ┆ y_type_combin ┆ --- │
│ str ┆ str ┆ i64 ┆ --- ┆ f64 ┆ i64 ┆ atio… ┆ f64 │
│ ┆ ┆ ┆ f64 ┆ ┆ ┆ --- ┆ │
│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │
╞══════════╪════════╪════════╪════════════════╪════════════╪═════════╪═══════════════╪═════════════╡
│ Electric ┆ Flying ┆ 90 ┆ 90.0 ┆ 97.0 ┆ 85 ┆ 85.0 ┆ 78.333333 │
│ Fire ┆ Flying ┆ 100 ┆ 100.0 ┆ 97.0 ┆ 90 ┆ 90.0 ┆ 78.333333 │
│ Dragon ┆ null ┆ 64 ┆ 94.0 ┆ 97.0 ┆ 45 ┆ 55.0 ┆ 78.333333 │
│ Dragon ┆ null ┆ 84 ┆ 94.0 ┆ 97.0 ┆ 65 ┆ 55.0 ┆ 78.333333 │
│ Dragon ┆ Flying ┆ 134 ┆ 94.0 ┆ 97.0 ┆ 95 ┆ 95.0 ┆ 78.333333 │
│ Psychic ┆ null ┆ 110 ┆ 110.0 ┆ 97.0 ┆ 90 ┆ 90.0 ┆ 78.333333 │
└──────────┴────────┴────────┴────────────────┴────────────┴─────────┴───────────────┴─────────────┘
# Rows whose secondary type is Psychic: name, primary type and speed.
psychic = df.filter(pl.col('Type 2') == 'Psychic')
psychic.select('Name', 'Type 1', "Speed")
shape: (7, 3)
Name Type 1 Speed str str i64 "Slowpoke" "Water" 15 "Slowbro" "Water" 30 "SlowbroMega Slowbro" "Water" 30 "Exeggcute" "Grass" 40 "Exeggutor" "Grass" 55 "Starmie" "Water" 115 "Jynx" "Ice" 95
# Sort the three columns by Speed (descending) *within* each 'Type 1'
# window; row order of the frame itself is unchanged.
df.filter(pl.col('Type 2') == 'Psychic').with_columns(
    pl.col('Name', 'Type 1', "Speed")
    .sort_by('Speed', descending=True)
    .over('Type 1')
).select('Name', 'Type 1', "Speed")
shape: (7, 3)
Name Type 1 Speed str str i64 "Starmie" "Water" 115 "Slowbro" "Water" 30 "SlowbroMega Slowbro" "Water" 30 "Exeggutor" "Grass" 55 "Exeggcute" "Grass" 40 "Slowpoke" "Water" 15 "Jynx" "Ice" 95
# Compare the three over() mapping strategies on the same expression:
# default group_to_rows (values mapped back to their rows), join
# (each row gets the whole group's list), and explode (group results
# written back in group order — note the row misalignment in output).
df.select('Attack', 'Defense', 'Type 2').tail(6).with_columns(
    pl.sum('Attack').over('Type 2').alias('sum_attack_by_Type2'),
    (pl.col('Attack').sum() + pl.col('Defense')).over('Type 2').alias('group_to_rows'),
    (pl.col('Attack').sum() + pl.col('Defense')).over('Type 2', mapping_strategy="join").alias('join'),
    (pl.col('Attack').sum() + pl.col('Defense')).over('Type 2', mapping_strategy="explode").alias('explode'),
)
shape: (6, 7)
Attack Defense Type 2 sum_attack_by_Type2 group_to_rows join explode i64 i64 str i64 i64 list[i64] i64 90 85 "Flying" 324 409 [409, 414, 419] 409 100 90 "Flying" 324 414 [409, 414, 419] 414 64 45 null 258 303 [303, 323, 348] 419 84 65 null 258 323 [303, 323, 348] 303 134 95 "Flying" 324 419 [409, 414, 419] 323 110 90 null 258 348 [303, 323, 348] 348
df. sort( "Type 1" ) . select(
pl. col( "Type 1" ) . head( 3 ) . over( "Type 1" , mapping_strategy= "explode" ) ,
pl. col( "Name" )
. sort_by( pl. col( "Speed" ) , descending= True )
. head( 3 )
. over( "Type 1" , mapping_strategy= "explode" )
. alias( "fastest/group" ) ,
pl. col( "Name" )
. sort_by( pl. col( "Attack" ) , descending= True )
. head( 3 )
. over( "Type 1" , mapping_strategy= "explode" )
. alias( "strongest/group" ) ,
pl. col( "Name" )
. sort( )
. head( 3 )
. over( "Type 1" , mapping_strategy= "explode" )
. alias( "sorted_by_alphabet" ) ,
)
shape: (43, 4)
Type 1 fastest/group strongest/group sorted_by_alphabet str str str str "Bug" "BeedrillMega Beedrill" "PinsirMega Pinsir" "Beedrill" "Bug" "Scyther" "BeedrillMega Beedrill" "BeedrillMega Beedrill" "Bug" "PinsirMega Pinsir" "Pinsir" "Butterfree" "Dragon" "Dragonite" "Dragonite" "Dragonair" "Dragon" "Dragonair" "Dragonair" "Dragonite" … … … … "Rock" "Aerodactyl" "Golem" "AerodactylMega Aerodactyl" "Rock" "Kabutops" "Kabutops" "Geodude" "Water" "Starmie" "GyaradosMega Gyarados" "Blastoise" "Water" "Tentacruel" "Kingler" "BlastoiseMega Blastoise" "Water" "Poliwag" "Gyarados" "Cloyster"
Fold
对行进行操作(横向)
# Print the signature/docstring of polars' horizontal (row-wise) left fold.
help ( pl. fold)
Help on function fold in module polars.functions.lazy:
fold(acc: 'IntoExpr', function: 'Callable[[Series, Series], Series]', exprs: 'Sequence[Expr | str] | Expr') -> 'Expr'
Accumulate over multiple columns horizontally/ row wise with a left fold.
Parameters
----------
acc
Accumulator Expression. This is the value that will be initialized when the fold
starts. For a sum this could for instance be lit(0).
function
Function to apply over the accumulator and the value.
Fn(acc, value) -> new_value
exprs
Expressions to aggregate over. May also be a wildcard expression.
Notes
-----
If you simply want the first encountered expression as accumulator,
consider using `reduce`.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 3],
... "b": [3, 4, 5],
... "c": [5, 6, 7],
... }
... )
>>> df
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1 ┆ 3 ┆ 5 │
│ 2 ┆ 4 ┆ 6 │
│ 3 ┆ 5 ┆ 7 │
└─────┴─────┴─────┘
Horizontally sum over all columns and add 1.
>>> df.select(
... pl.fold(
... acc=pl.lit(1), function=lambda acc, x: acc + x, exprs=pl.col("*")
... ).alias("sum"),
... )
shape: (3, 1)
┌─────┐
│ sum │
│ --- │
│ i64 │
╞═════╡
│ 10 │
│ 13 │
│ 16 │
└─────┘
You can also apply a condition/predicate on all columns:
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 3],
... "b": [0, 1, 2],
... }
... )
>>> df
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 0 │
│ 2 ┆ 1 │
│ 3 ┆ 2 │
└─────┴─────┘
>>> df.filter(
... pl.fold(
... acc=pl.lit(True),
... function=lambda acc, x: acc & x,
... exprs=pl.col("*") > 1,
... )
... )
shape: (1, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 3 ┆ 2 │
└─────┴─────┘
# Row-wise left fold: start each row's accumulator at 6 and add every
# column, i.e. sum = 6 + a + b per row.
df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [2, 3, 2],
    }
)
df.with_columns(
    pl.fold(acc=pl.lit(6), function=lambda total, col: total + col, exprs=pl.all()).alias("sum"),
)
shape: (3, 3)
a b sum i64 i64 i64 1 2 9 2 3 11 3 2 11
# Fold a boolean AND across columns to keep rows where every value > 1,
# then (on a fresh frame) concatenate two columns into one string column.
df.filter(
    pl.fold(acc=True, function=lambda ok, cond: ok & cond, exprs=pl.col('*') > 1)
)
df = pl.DataFrame(
    {
        "a": ["a", "b", "c"],
        "b": [1, 2, 3],
    }
)
df.with_columns(pl.concat_str(["a", "b"]).alias('concat_str'))
shape: (3, 3)
a b concat_str str i64 str "a" 1 "a1" "b" 2 "b2" "c" 3 "c3"
List and Arrays
List
# Sample data: per-station temperature readings packed into one string;
# tokens starting with 'E' are (presumably) error codes — TODO confirm.
weather = pl.DataFrame(
    {
        "station": [f"Station {n}" for n in range(1, 6)],
        "temperatures": [
            "20 5 5 E1 7 13 19 9 6 20",
            "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
            "19 24 E9 16 6 12 10 22",
            "E2 E0 15 7 8 10 E1 24 17 13 6",
            "14 8 E0 16 22 24 E1",
        ],
    }
)
print(weather)
shape: (5, 2)
┌───────────┬─────────────────────────────────┐
│ station ┆ temperatures │
│ --- ┆ --- │
│ str ┆ str │
╞═══════════╪═════════════════════════════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20 │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90… │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22 │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6 │
│ Station 5 ┆ 14 8 E0 16 22 24 E1 │
└───────────┴─────────────────────────────────┘
# Split each space-separated reading string into a list[str] column.
weather.with_columns(pl.col('temperatures').str.split(' '))
shape: (5, 2)
station temperatures str list[str] "Station 1" ["20", "5", … "20"] "Station 2" ["18", "8", … "40"] "Station 3" ["19", "24", … "22"] "Station 4" ["E2", "E0", … "6"] "Station 5" ["14", "8", … "E1"]
# Split, then explode: one row per individual reading per station.
split = weather.with_columns(pl.col('temperatures').str.split(' '))
split.explode('temperatures')
shape: (49, 2)
station temperatures str str "Station 1" "20" "Station 1" "5" "Station 1" "5" "Station 1" "E1" "Station 1" "7" … … "Station 5" "E0" "Station 5" "16" "Station 5" "22" "Station 5" "24" "Station 5" "E1"
# list namespace basics: head, slice, and the length of a slice.
weather.with_columns(pl.col('temperatures').str.split(' ')).with_columns(
    pl.col('temperatures').list.head(3).alias('top3'),
    pl.col('temperatures').list.slice(1, 3).alias('slice'),
    pl.col('temperatures').list.slice(1, 3).list.len().alias('slice_len'),
)
shape: (5, 5)
station temperatures top3 slice slice_len str list[str] list[str] list[str] u32 "Station 1" ["20", "5", … "20"] ["20", "5", "5"] ["5", "5", "E1"] 3 "Station 2" ["18", "8", … "40"] ["18", "8", "16"] ["8", "16", "11"] 3 "Station 3" ["19", "24", … "22"] ["19", "24", "E9"] ["24", "E9", "16"] 3 "Station 4" ["E2", "E0", … "6"] ["E2", "E0", "15"] ["E0", "15", "7"] 3 "Station 5" ["14", "8", … "E1"] ["14", "8", "E0"] ["8", "E0", "16"] 3
# Count occurrences of the string '5' inside each list by evaluating an
# element-wise equality and summing the booleans.
weather.with_columns(pl.col('temperatures').str.split(' ')).with_columns(
    pl.col('temperatures')
    .list.eval(pl.element() == '5')
    .list.sum()
    .alias('count_5')
)
shape: (5, 3)
station temperatures count_5 str list[str] u32 "Station 1" ["20", "5", … "20"] 2 "Station 2" ["18", "8", … "40"] 0 "Station 3" ["19", "24", … "22"] 0 "Station 4" ["E2", "E0", … "6"] 0 "Station 5" ["14", "8", … "E1"] 0
# Print the docstring of the list.eval namespace method used above.
help ( pl. col( 'temperatures' ) . list . eval )
Help on method eval in module polars.expr.list:
eval(expr: 'Expr', *, parallel: 'bool' = False) -> 'Expr' method of polars.expr.list.ExprListNameSpace instance
Run any polars expression against the lists' elements.
Parameters
----------
expr
Expression to run. Note that you can select an element with `pl.first()`, or
`pl.col()`
parallel
Run all expression parallel. Don't activate this blindly.
Parallelism is worth it if there is enough work to do per thread.
This likely should not be used in the group by context, because we already
parallel execution per group
Examples
--------
>>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
>>> df.with_columns(
... rank=pl.concat_list("a", "b").list.eval(pl.element().rank())
... )
shape: (3, 3)
┌─────┬─────┬────────────┐
│ a ┆ b ┆ rank │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list[f64] │
╞═════╪═════╪════════════╡
│ 1 ┆ 4 ┆ [1.0, 2.0] │
│ 8 ┆ 5 ┆ [2.0, 1.0] │
│ 3 ┆ 2 ┆ [2.0, 1.0] │
└─────┴─────┴────────────┘
# Per-station percentile rank of each day's temperature: pack the day
# columns into one list and evaluate a rank/count expression inside it.
weather_by_day = pl.DataFrame(
    {
        "station": [f"Station {n}" for n in range(1, 11)],
        "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    }
)
# rank within the list, descending, normalised by the list length
rank_pct = (pl.element().rank(descending=True) / pl.col("*").count()).round(2)
weather_by_day.with_columns(
    pl.concat_list(pl.all().exclude("station")).alias("all_temps")
).select(
    pl.all().exclude("all_temps"),
    pl.col("all_temps").list.eval(rank_pct).alias("temps_rank"),
)
shape: (10, 5)
station day_1 day_2 day_3 temps_rank str i64 i64 i64 list[f64] "Station 1" 17 15 16 [0.33, 1.0, 0.67] "Station 2" 11 11 15 [0.83, 0.83, 0.33] "Station 3" 8 10 24 [1.0, 0.67, 0.33] "Station 4" 22 8 24 [0.67, 1.0, 0.33] "Station 5" 9 7 8 [0.33, 1.0, 0.67] "Station 6" 21 14 23 [0.67, 1.0, 0.33] "Station 7" 20 18 19 [0.33, 1.0, 0.67] "Station 8" 8 21 23 [1.0, 0.67, 0.33] "Station 9" 8 15 16 [1.0, 0.67, 0.33] "Station 10" 17 13 10 [0.33, 0.67, 1.0]
Arrays
# Fixed-width Array dtype (vs variable-length List) and its arr namespace.
array_df = pl.DataFrame(
    [
        pl.Series("Array_1", [[1, 3], [2, 5]]),
        pl.Series("Array_2", [[1, 7, 3], [8, 1, 0]]),
    ],
    schema={
        "Array_1": pl.Array(pl.Int64, 2),
        "Array_2": pl.Array(pl.Int64, 3),
    },
)
array_df.select(
    pl.col("Array_1").arr.min().name.suffix("_min"),
    pl.col("Array_2").arr.sum().name.suffix("_sum"),
)
shape: (2, 2)
Array_1_min Array_2_sum i64 i64 1 11 2 9
使用自定义功能
可以使用map_batches与map_elements执行自定义功能
'''df.with_columns([
pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations")
])'''
# map_batches receives the whole Series at once (so sum() collapses it),
# while map_elements runs the Python callable once per value.
df.with_columns(
    pl.col('b').map_batches(lambda series: sum(series)).alias('map_batches'),
    pl.col('b').map_elements(lambda value: value + 1, return_dtype=pl.Int32).alias('map_element'),
)
C:\Users\wencj\AppData\Local\Temp\ipykernel_15028\2912822968.py:4: PolarsInefficientMapWarning:
Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
- pl.col("b").map_elements(lambda x: ...)
with this one instead:
+ pl.col("b") + 1
pl.col('b').map_elements(lambda x: x+1, return_dtype=pl.Int32).alias('map_element')
shape: (3, 4)
a b map_batches map_element str i64 i64 i32 "a" 1 6 2 "b" 2 6 3 "c" 3 6 4
# Small frame used to contrast map_batches/map_elements inside group_by.
df = pl.DataFrame(
    {
        "keys": ["a", "a", "b"],
        "values": [10, 7, 1],
    }
)
df
shape: (3, 2)
keys values str i64 "a" 10 "a" 7 "b" 1
df. group_by( "keys" , maintain_order= True ) . agg(
pl. col( "values" )
. map_batches( lambda s: s. shift( ) , is_elementwise= True )
. alias( "shift_map_batches" ) ,
pl. col( "values" ) . shift( ) . alias( "shift_expression" ) ,
)
shape: (2, 3)
keys shift_map_batches shift_expression str list[i64] list[i64] "a" [null, 10] [null, 10] "b" [7] [null]
df. group_by( "keys" , maintain_order= True ) . agg(
pl. col( "values" )
. map_elements( lambda s: s. shift( ) , return_dtype= pl. List( int ) )
. alias( "shift_map_elements" ) ,
pl. col( "values" ) . shift( ) . alias( "shift_expression" ) ,
)
shape: (2, 3)
keys shift_map_elements shift_expression str list[i64] list[i64] "a" [null, 10] [null, 10] "b" [null] [null]
Struct
使用pl.struct对多行进行操作,struct是polars中的一种数据类型
# value_counts returns a struct column; unnest turns its fields
# (value + count) back into ordinary columns.
ratings = pl.DataFrame(
    {
        "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"],
        "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"],
        "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6],
        "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26],
    }
)
out = ratings.select(pl.col("Theatre").value_counts(sort=True))
print(out)
ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre")
shape: (5, 1)
┌───────────┐
│ Theatre │
│ --- │
│ struct[2] │
╞═══════════╡
│ {"NE",3} │
│ {"IL",3} │
│ {"SD",2} │
│ {"ME",1} │
│ {"ND",1} │
└───────────┘
shape: (5, 2)
Theatre count str u32 "NE" 3 "IL" 3 "SD" 2 "ME" 1 "ND" 1
# List the attributes available on a struct expression (it is a pl.Expr).
dir ( pl. struct( 'Movie' ) )
['__abs__',
'__add__',
'__and__',
'__annotations__',
'__array_ufunc__',
'__bool__',
'__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__floordiv__',
'__format__',
'__ge__',
'__getattribute__',
'__getstate__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__invert__',
'__le__',
'__lt__',
'__mod__',
'__module__',
'__mul__',
'__ne__',
'__neg__',
'__new__',
'__or__',
'__pos__',
'__pow__',
'__radd__',
'__rand__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__rfloordiv__',
'__rmod__',
'__rmul__',
'__ror__',
'__rpow__',
'__rsub__',
'__rtruediv__',
'__rxor__',
'__setattr__',
'__setstate__',
'__sizeof__',
'__str__',
'__sub__',
'__subclasshook__',
'__truediv__',
'__weakref__',
'__xor__',
'_accessors',
'_from_pyexpr',
'_map_batches_wrapper',
'_pyexpr',
'_register_plugin',
'_repr_html_',
'abs',
'add',
'agg_groups',
'alias',
'all',
'and_',
'any',
'append',
'apply',
'approx_n_unique',
'arccos',
'arccosh',
'arcsin',
'arcsinh',
'arctan',
'arctanh',
'arg_max',
'arg_min',
'arg_sort',
'arg_true',
'arg_unique',
'arr',
'backward_fill',
'bin',
'bottom_k',
'bottom_k_by',
'cast',
'cat',
'cbrt',
'ceil',
'clip',
'clip_max',
'clip_min',
'cos',
'cosh',
'cot',
'count',
'cum_count',
'cum_max',
'cum_min',
'cum_prod',
'cum_sum',
'cumcount',
'cummax',
'cummin',
'cumprod',
'cumsum',
'cumulative_eval',
'cut',
'degrees',
'deserialize',
'diff',
'dot',
'drop_nans',
'drop_nulls',
'dt',
'entropy',
'eq',
'eq_missing',
'ewm_mean',
'ewm_mean_by',
'ewm_std',
'ewm_var',
'exclude',
'exp',
'explode',
'extend_constant',
'fill_nan',
'fill_null',
'filter',
'first',
'flatten',
'floor',
'floordiv',
'forward_fill',
'from_json',
'gather',
'gather_every',
'ge',
'get',
'gt',
'has_nulls',
'hash',
'head',
'hist',
'implode',
'inspect',
'interpolate',
'interpolate_by',
'is_between',
'is_duplicated',
'is_finite',
'is_first',
'is_first_distinct',
'is_in',
'is_infinite',
'is_last',
'is_last_distinct',
'is_nan',
'is_not',
'is_not_nan',
'is_not_null',
'is_null',
'is_unique',
'keep_name',
'kurtosis',
'last',
'le',
'len',
'limit',
'list',
'log',
'log10',
'log1p',
'lower_bound',
'lt',
'map',
'map_alias',
'map_batches',
'map_dict',
'map_elements',
'max',
'mean',
'median',
'meta',
'min',
'mod',
'mode',
'mul',
'n_unique',
'name',
'nan_max',
'nan_min',
'ne',
'ne_missing',
'neg',
'not_',
'null_count',
'or_',
'over',
'pct_change',
'peak_max',
'peak_min',
'pipe',
'pow',
'prefix',
'product',
'qcut',
'quantile',
'radians',
'rank',
'rechunk',
'register_plugin',
'reinterpret',
'repeat_by',
'replace',
'reshape',
'reverse',
'rle',
'rle_id',
'rolling',
'rolling_apply',
'rolling_map',
'rolling_max',
'rolling_max_by',
'rolling_mean',
'rolling_mean_by',
'rolling_median',
'rolling_median_by',
'rolling_min',
'rolling_min_by',
'rolling_quantile',
'rolling_quantile_by',
'rolling_skew',
'rolling_std',
'rolling_std_by',
'rolling_sum',
'rolling_sum_by',
'rolling_var',
'rolling_var_by',
'round',
'round_sig_figs',
'sample',
'search_sorted',
'set_sorted',
'shift',
'shift_and_fill',
'shrink_dtype',
'shuffle',
'sign',
'sin',
'sinh',
'skew',
'slice',
'sort',
'sort_by',
'sqrt',
'std',
'str',
'struct',
'sub',
'suffix',
'sum',
'tail',
'take',
'take_every',
'tan',
'tanh',
'to_physical',
'top_k',
'top_k_by',
'truediv',
'unique',
'unique_counts',
'upper_bound',
'value_counts',
'var',
'where',
'xor']
# Flag rows whose (Movie, Theatre) combination appears more than once,
# using a struct as a composite key.
dup_flag = pl.struct("Movie", "Theatre").is_duplicated()
ratings.with_columns(dup_flag.alias('struct'))
shape: (10, 5)
Movie Theatre Avg_Rating Count struct str str f64 i64 bool "Cars" "NE" 4.5 30 true "IT" "ME" 4.4 27 false "ET" "IL" 4.6 26 true "Cars" "ND" 4.3 29 false "Up" "NE" 4.8 31 false "IT" "SD" 4.7 28 false "Cars" "NE" 4.7 28 true "ET" "IL" 4.9 26 true "Up" "IL" 4.7 33 false "ET" "SD" 4.6 26 false
# Dense-rank duplicated (Movie, Theatre) rows by (Count, Avg_Rating),
# then keep only the duplicated combinations.
ratings.with_columns(
    pl.struct("Count", "Avg_Rating")
    .rank("dense", descending=True)
    .over("Movie", "Theatre")
    .alias("Rank")
).filter(pl.struct("Movie", "Theatre").is_duplicated())
shape: (4, 5)
Movie Theatre Avg_Rating Count Rank str str f64 i64 u32 "Cars" "NE" 4.5 30 1 "ET" "IL" 4.6 26 2 "Cars" "NE" 4.7 28 2 "ET" "IL" 4.9 26 1
数据转换
Joins
join中how参数
策略 描述 inner 返回左右数据中相互匹配的行。不匹配的行将被丢弃。 left 返回左数据中的所有行,无论右数据帧中是否存在匹配。不匹配的行,其右列将被填充为null。 full 返回左右数据中的所有行。如果左列中没有找到匹配,另一列的列将被填充为null。 cross 返回左数据的所有行与右数据的所有行的笛卡尔积。重复的行将被保留;A与B的交叉连接表长度总是len(A) × len(B)。 semi 返回左数据中所有在右数据中也存在的行。 anti 返回左数据中所有在右数据中不存在的行。 同时coalesce 参数决定是否要合并左右的相同列名
# Inner join: only customer_ids present in both frames survive;
# customer 3 has no orders and is dropped.
df_customers = pl.DataFrame(
    {
        "customer_id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
    }
)
df_orders = pl.DataFrame(
    {
        "order_id": ["a", "b", "c"],
        "customer_id": [1, 2, 2],
        "amount": [100, 200, 300],
    }
)
print(df_customers)
print(df_orders)
df_customers.join(df_orders, on='customer_id', how='inner')
shape: (3, 2)
┌─────────────┬─────────┐
│ customer_id ┆ name │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════════════╪═════════╡
│ 1 ┆ Alice │
│ 2 ┆ Bob │
│ 3 ┆ Charlie │
└─────────────┴─────────┘
shape: (3, 3)
┌──────────┬─────────────┬────────┐
│ order_id ┆ customer_id ┆ amount │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞══════════╪═════════════╪════════╡
│ a ┆ 1 ┆ 100 │
│ b ┆ 2 ┆ 200 │
│ c ┆ 2 ┆ 300 │
└──────────┴─────────────┴────────┘
shape: (3, 4)
customer_id name order_id amount i64 str str i64 1 "Alice" "a" 100 2 "Bob" "b" 200 2 "Bob" "c" 300
# Left join: every customer kept; unmatched order columns become null.
df_customers.join(df_orders, how='left', on='customer_id')
shape: (4, 4)
customer_id name order_id amount i64 str str i64 1 "Alice" "a" 100 2 "Bob" "b" 200 2 "Bob" "c" 300 3 "Charlie" null null
# Full join keeps both key columns (customer_id / customer_id_right);
# coalesce=True merges them back into a single customer_id column.
print(df_customers.join(df_orders, on='customer_id', how='full'))
df_customers.join(df_orders, on='customer_id', how='full', coalesce=True)
shape: (4, 5)
┌─────────────┬─────────┬──────────┬───────────────────┬────────┐
│ customer_id ┆ name ┆ order_id ┆ customer_id_right ┆ amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str ┆ i64 ┆ i64 │
╞═════════════╪═════════╪══════════╪═══════════════════╪════════╡
│ 1 ┆ Alice ┆ a ┆ 1 ┆ 100 │
│ 2 ┆ Bob ┆ b ┆ 2 ┆ 200 │
│ 2 ┆ Bob ┆ c ┆ 2 ┆ 300 │
│ 3 ┆ Charlie ┆ null ┆ null ┆ null │
└─────────────┴─────────┴──────────┴───────────────────┴────────┘
shape: (4, 4)
customer_id name order_id amount i64 str str i64 1 "Alice" "a" 100 2 "Bob" "b" 200 2 "Bob" "c" 300 3 "Charlie" null null
# Cross join: Cartesian product, len(colors) x len(sizes) = 9 rows.
df_colors = pl.DataFrame(
    {
        "color": ["red", "blue", "green"],
    }
)
df_sizes = pl.DataFrame(
    {
        "size": ["S", "M", "L"],
    }
)
df_colors.join(df_sizes, how='cross')
shape: (9, 2)
color size str str "red" "S" "red" "M" "red" "L" "blue" "S" "blue" "M" "blue" "L" "green" "S" "green" "M" "green" "L"
# semi: left rows that HAVE a match on the right (cars with repairs);
# anti: left rows WITHOUT a match (cars never repaired).
df_cars = pl.DataFrame(
    {
        "id": ["a", "b", "c"],
        "make": ["ford", "toyota", "bmw"],
    }
)
df_repairs = pl.DataFrame(
    {
        "id": ["c", "c"],
        "cost": [100, 200],
    }
)
print(df_cars.join(df_repairs, on='id', how='semi'))
print(df_cars.join(df_repairs, on='id', how='anti'))
shape: (1, 2)
┌─────┬──────┐
│ id ┆ make │
│ --- ┆ --- │
│ str ┆ str │
╞═════╪══════╡
│ c ┆ bmw │
└─────┴──────┘
shape: (2, 2)
┌─────┬────────┐
│ id ┆ make │
│ --- ┆ --- │
│ str ┆ str │
╞═════╪════════╡
│ a ┆ ford │
│ b ┆ toyota │
└─────┴────────┘
# As-of join: each trade is matched (per stock) with the most recent
# earlier-or-equal quote; tolerance='1m' additionally nulls out matches
# more than one minute old.
df_trades = pl.DataFrame(
    {
        "time": [
            datetime(2020, 1, 1, 9, 1, 0),
            datetime(2020, 1, 1, 9, 1, 0),
            datetime(2020, 1, 1, 9, 3, 0),
            datetime(2020, 1, 1, 9, 6, 0),
        ],
        "stock": ["A", "B", "B", "C"],
        "trade": [101, 299, 301, 500],
    }
)
print(df_trades)
df_quotes = pl.DataFrame(
    {
        "time": [
            datetime(2020, 1, 1, 9, 0, 0),
            datetime(2020, 1, 1, 9, 2, 0),
            datetime(2020, 1, 1, 9, 4, 0),
            datetime(2020, 1, 1, 9, 6, 0),
        ],
        "stock": ["A", "B", "C", "A"],
        "quote": [100, 300, 501, 102],
    }
)
print(df_quotes)
print(df_trades.join_asof(df_quotes, on="time", by="stock"))
print(df_trades.join_asof(df_quotes, on="time", by="stock", tolerance="1m"))
shape: (4, 3)
┌─────────────────────┬───────┬───────┐
│ time ┆ stock ┆ trade │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ i64 │
╞═════════════════════╪═══════╪═══════╡
│ 2020-01-01 09:01:00 ┆ A ┆ 101 │
│ 2020-01-01 09:01:00 ┆ B ┆ 299 │
│ 2020-01-01 09:03:00 ┆ B ┆ 301 │
│ 2020-01-01 09:06:00 ┆ C ┆ 500 │
└─────────────────────┴───────┴───────┘
shape: (4, 3)
┌─────────────────────┬───────┬───────┐
│ time ┆ stock ┆ quote │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ i64 │
╞═════════════════════╪═══════╪═══════╡
│ 2020-01-01 09:00:00 ┆ A ┆ 100 │
│ 2020-01-01 09:02:00 ┆ B ┆ 300 │
│ 2020-01-01 09:04:00 ┆ C ┆ 501 │
│ 2020-01-01 09:06:00 ┆ A ┆ 102 │
└─────────────────────┴───────┴───────┘
shape: (4, 4)
┌─────────────────────┬───────┬───────┬───────┐
│ time ┆ stock ┆ trade ┆ quote │
│ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ i64 ┆ i64 │
╞═════════════════════╪═══════╪═══════╪═══════╡
│ 2020-01-01 09:01:00 ┆ A ┆ 101 ┆ 100 │
│ 2020-01-01 09:01:00 ┆ B ┆ 299 ┆ null │
│ 2020-01-01 09:03:00 ┆ B ┆ 301 ┆ 300 │
│ 2020-01-01 09:06:00 ┆ C ┆ 500 ┆ 501 │
└─────────────────────┴───────┴───────┴───────┘
shape: (4, 4)
┌─────────────────────┬───────┬───────┬───────┐
│ time ┆ stock ┆ trade ┆ quote │
│ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ i64 ┆ i64 │
╞═════════════════════╪═══════╪═══════╪═══════╡
│ 2020-01-01 09:01:00 ┆ A ┆ 101 ┆ 100 │
│ 2020-01-01 09:01:00 ┆ B ┆ 299 ┆ null │
│ 2020-01-01 09:03:00 ┆ B ┆ 301 ┆ 300 │
│ 2020-01-01 09:06:00 ┆ C ┆ 500 ┆ null │
└─────────────────────┴───────┴───────┴───────┘
Concatenation
方法 描述 垂直连接 vertical 具有相同列的两个数据可以垂直连接,形成一个更长的数据。 水平连接 horizontal 具有不重叠列的两个数据可以水平连接,形成一个更宽的数据。 对角连接 diagonal 具有不同行数和列数的两个数据可以对角连接,形成一个可能更长和/或更宽的数据。当列名重叠时,值将垂直连接。当列名不重叠时,将添加新的行和列。缺失的值将设置为null。
# Vertical concat: stack two frames with identical columns.
df_v1 = pl.DataFrame(
    {
        "a": [1],
        "b": [3],
    }
)
df_v2 = pl.DataFrame(
    {
        "a": [2],
        "b": [4],
    }
)
stacked = pl.concat([df_v1, df_v2], how='vertical')
print(stacked)
shape: (2, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 3 │
│ 2 ┆ 4 │
└─────┴─────┘
# Horizontal concat: glue two frames with disjoint column names side
# by side (row counts must match).
df_h1 = pl.DataFrame(
    {
        "l1": [1, 2],
        "l2": [3, 4],
    }
)
df_h2 = pl.DataFrame(
    {
        "r1": [5, 6],
        "r2": [7, 8],
        "r3": [9, 10],
    }
)
pl.concat([df_h1, df_h2], how="horizontal")
shape: (2, 5)
l1 l2 r1 r2 r3 i64 i64 i64 i64 i64 1 3 5 7 9 2 4 6 8 10
# Diagonal concat: union of columns; cells missing from either frame
# are filled with null.
df_d1 = pl.DataFrame(
    {
        "a": [1],
        "b": [3],
    }
)
df_d2 = pl.DataFrame(
    {
        "a": [2],
        "d": [4],
    }
)
pl.concat([df_d1, df_d2], how="diagonal")
Pivots 透视表格
Pivot操作指对DataFrame中的列进行重塑,并执行以下聚合操作之一:
first、sum、min、max、mean、median
# Pivot 'bar' values into columns, filling each cell with the first
# matching 'N'. pivot is eager-only: a lazy pipeline must collect,
# pivot, then re-enter lazy mode.
df = pl.DataFrame(
    {
        "foo": ["A", "A", "B", "B", "C"],
        "N": [1, 2, 2, 4, 2],
        "bar": ["k", "l", "m", "n", "o"],
    }
)
print(df.pivot(index="foo", columns="bar", values="N", aggregate_function="first"))
q = (
    df.lazy()
    .collect()
    .pivot(index="foo", columns="bar", values="N", aggregate_function="first")
    .lazy()
)
print(q.collect())
shape: (3, 6)
┌─────┬──────┬──────┬──────┬──────┬──────┐
│ foo ┆ k ┆ l ┆ m ┆ n ┆ o │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪══════╪══════╪══════╪══════╪══════╡
│ A ┆ 1 ┆ 2 ┆ null ┆ null ┆ null │
│ B ┆ null ┆ null ┆ 2 ┆ 4 ┆ null │
│ C ┆ null ┆ null ┆ null ┆ null ┆ 2 │
└─────┴──────┴──────┴──────┴──────┴──────┘
shape: (3, 6)
┌─────┬──────┬──────┬──────┬──────┬──────┐
│ foo ┆ k ┆ l ┆ m ┆ n ┆ o │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪══════╪══════╪══════╪══════╪══════╡
│ A ┆ 1 ┆ 2 ┆ null ┆ null ┆ null │
│ B ┆ null ┆ null ┆ 2 ┆ 4 ┆ null │
│ C ┆ null ┆ null ┆ null ┆ null ┆ 2 │
└─────┴──────┴──────┴──────┴──────┴──────┘
Melts
# Melt (wide -> long): keep A/B as identifiers, fold C/D into
# variable/value pairs.
df = pl.DataFrame(
    {
        "A": ["a", "b", "a"],
        "B": [1, 3, 5],
        "C": [10, 11, 12],
        "D": [2, 4, 6],
    }
)
print(df)
print(df.melt(id_vars=["A", "B"], value_vars=["C", "D"]))
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ A ┆ B ┆ C ┆ D │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 10 ┆ 2 │
│ b ┆ 3 ┆ 11 ┆ 4 │
│ a ┆ 5 ┆ 12 ┆ 6 │
└─────┴─────┴─────┴─────┘
shape: (6, 4)
┌─────┬─────┬──────────┬───────┐
│ A ┆ B ┆ variable ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str ┆ i64 │
╞═════╪═════╪══════════╪═══════╡
│ a ┆ 1 ┆ C ┆ 10 │
│ b ┆ 3 ┆ C ┆ 11 │
│ a ┆ 5 ┆ C ┆ 12 │
│ a ┆ 1 ┆ D ┆ 2 │
│ b ┆ 3 ┆ D ┆ 4 │
│ a ┆ 5 ┆ D ┆ 6 │
└─────┴─────┴──────────┴───────┘
时序序列
时间数据解析
polars主要有4种时序数据类型Date, Datetime, Duration, Time
# Illustrative snippets: parse dates while reading a CSV, convert a
# string column to Date, and extract the day-of-month as a new column.
pl.read_csv('file_path', try_parse_dates=True)
df.with_columns(pl.col('Date').str.to_date('%Y-%m-%d'))
# Fixed: the original called `df.with_column`, which does not exist in
# current polars — the method is `with_columns`.
df.with_columns(pl.col('Date').dt.day().alias('day'))
# Parse ISO timestamps with UTC offsets (%z), then convert everything
# into one concrete time zone so CET/CEST rows share a single dtype.
data = [
    "2021-03-27T00:00:00+0100",
    "2021-03-28T00:00:00+0100",
    "2021-03-29T00:00:00+0200",
    "2021-03-30T00:00:00+0200",
]
series = pl.Series(data)
mixed_parsed = series.str.to_datetime("%Y-%m-%dT%H:%M:%S%z").dt.convert_time_zone(
    "Europe/Brussels"
)
print(mixed_parsed)
shape: (4,)
Series: '' [datetime[μs, Europe/Brussels]]
[
2021-03-27 00:00:00 CET
2021-03-28 00:00:00 CET
2021-03-29 00:00:00 CEST
2021-03-30 00:00:00 CEST
]
时序筛选
from datetime import datetime, date

# Daily Date and Datetime columns over the first five days of 2022,
# followed by three ways of filtering on the date column.
df = pl.DataFrame(
    {
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True),
        "datetime": pl.datetime_range(
            datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True
        ),
    }
)
# Exact match against a single calendar day.
print(df.filter(pl.col('date') == date(2022, 1, 2)))
# Inclusive range filter (datetime bounds compared against the Date column).
print(df.filter(pl.col('date').is_between(datetime(2022, 1, 2), datetime(2022, 1, 4))))
# Filter on a derived component: day-of-month strictly below 3.
print(df.filter(pl.col('date').dt.day() < 3))
shape: (1, 2)
┌────────────┬─────────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ date ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
└────────────┴─────────────────────┘
shape: (3, 2)
┌────────────┬─────────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ date ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
│ 2022-01-03 ┆ 2022-01-03 00:00:00 │
│ 2022-01-04 ┆ 2022-01-04 00:00:00 │
└────────────┴─────────────────────┘
shape: (2, 2)
┌────────────┬─────────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ date ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-01 ┆ 2022-01-01 00:00:00 │
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
└────────────┴─────────────────────┘
grouping
group_by_dynamic 参数包括every, period, offset
# One row per day for a full year, 2022-01-01 through 2023-01-01 inclusive
# (366 rows), paired with a running integer counter.
df = pl.DataFrame(
    {
        "date": pl.date_range(date(2022, 1, 1), date(2023, 1, 1), eager=True),
        "int": range(366),
    }
)
df
shape: (366, 2)
date int date i64 2022-01-01 0 2022-01-02 1 2022-01-03 2 2022-01-04 3 2022-01-05 4 … … 2022-12-28 361 2022-12-29 362 2022-12-30 363 2022-12-31 364 2023-01-01 365
print ( df. group_by_dynamic( "date" , every= "1mo" ) . agg( pl. col( "int" ) . mean( ) ) )
shape: (13, 2)
date int date f64 2022-01-01 15.0 2022-02-01 44.5 2022-03-01 74.0 2022-04-01 104.5 2022-05-01 135.0 … … 2022-09-01 257.5 2022-10-01 288.0 2022-11-01 318.5 2022-12-01 349.0 2023-01-01 365.0
# A window starts every month but only covers its first 10 days.
df.group_by_dynamic('date', every='1mo', period='10d').agg(pl.col('int').mean())
# Overlapping windows: a fresh 10-day window starts every day,
# giving a rolling-style mean.
df.group_by_dynamic('date', every='1d', period='10d').agg(pl.col('int').mean())
'''The truncate parameter is a Boolean variable that determines what datetime value is associated with each group in the output.
For example, suppose the first data point falls on 23rd February 1981 and the windows are annual. If truncate = True (the default)
then the date labelling the first year in the annual average is 1st January 1981. However, if truncate = False then that label
is the date of the first data point itself, 23rd February 1981. Note that truncate only affects what's shown in the Date column and does
not affect the window boundaries.'''
shape: (367, 2)
date int date f64 2021-12-31 4.0 2022-01-01 4.5 2022-01-02 5.5 2022-01-03 6.5 2022-01-04 7.5 … … 2022-12-28 363.0 2022-12-29 363.5 2022-12-30 364.0 2022-12-31 364.5 2023-01-01 365.0
# Half-hourly timestamps spanning three hours, each tagged with a group label.
df = pl.DataFrame(
    {
        "time": pl.datetime_range(
            start=datetime(2021, 12, 16),
            end=datetime(2021, 12, 16, 3),
            interval="30m",
            eager=True,
        ),
        "groups": ["a", "a", "a", "b", "b", "a", "a"],
    }
)
print(df)
# Hourly windows computed per group. closed="both" means a timestamp sitting
# exactly on a window boundary belongs to both adjacent windows, and
# include_boundaries adds _lower_boundary/_upper_boundary columns.
counts = df.group_by_dynamic(
    "time",
    every="1h",
    closed="both",
    group_by="groups",
    include_boundaries=True,
).agg(pl.len())
counts
shape: (7, 2)
┌─────────────────────┬────────┐
│ time ┆ groups │
│ --- ┆ --- │
│ datetime[μs] ┆ str │
╞═════════════════════╪════════╡
│ 2021-12-16 00:00:00 ┆ a │
│ 2021-12-16 00:30:00 ┆ a │
│ 2021-12-16 01:00:00 ┆ a │
│ 2021-12-16 01:30:00 ┆ b │
│ 2021-12-16 02:00:00 ┆ b │
│ 2021-12-16 02:30:00 ┆ a │
│ 2021-12-16 03:00:00 ┆ a │
└─────────────────────┴────────┘
shape: (7, 5)
groups _lower_boundary _upper_boundary time len str datetime[μs] datetime[μs] datetime[μs] u32 "a" 2021-12-15 23:00:00 2021-12-16 00:00:00 2021-12-15 23:00:00 1 "a" 2021-12-16 00:00:00 2021-12-16 01:00:00 2021-12-16 00:00:00 3 "a" 2021-12-16 01:00:00 2021-12-16 02:00:00 2021-12-16 01:00:00 1 "a" 2021-12-16 02:00:00 2021-12-16 03:00:00 2021-12-16 02:00:00 2 "a" 2021-12-16 03:00:00 2021-12-16 04:00:00 2021-12-16 03:00:00 1 "b" 2021-12-16 01:00:00 2021-12-16 02:00:00 2021-12-16 01:00:00 2 "b" 2021-12-16 02:00:00 2021-12-16 03:00:00 2021-12-16 02:00:00 1
采样 resampling
# Same half-hourly frame as above, plus a numeric column to interpolate.
df = pl.DataFrame(
    {
        "time": pl.datetime_range(
            start=datetime(2021, 12, 16),
            end=datetime(2021, 12, 16, 3),
            interval="30m",
            eager=True,
        ),
        "groups": ["a", "a", "a", "b", "b", "a", "a"],
        "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    }
)
print(df)
# Upsample onto a 10-minute grid (inserted rows are null), then fill the
# numeric column by linear interpolation; the string column stays null
# on the inserted rows.
upsampled = df.upsample(time_column='time', every='10m')
upsampled.interpolate()
shape: (7, 3)
┌─────────────────────┬────────┬────────┐
│ time ┆ groups ┆ values │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ f64 │
╞═════════════════════╪════════╪════════╡
│ 2021-12-16 00:00:00 ┆ a ┆ 1.0 │
│ 2021-12-16 00:30:00 ┆ a ┆ 2.0 │
│ 2021-12-16 01:00:00 ┆ a ┆ 3.0 │
│ 2021-12-16 01:30:00 ┆ b ┆ 4.0 │
│ 2021-12-16 02:00:00 ┆ b ┆ 5.0 │
│ 2021-12-16 02:30:00 ┆ a ┆ 6.0 │
│ 2021-12-16 03:00:00 ┆ a ┆ 7.0 │
└─────────────────────┴────────┴────────┘
shape: (19, 3)
time groups values datetime[μs] str f64 2021-12-16 00:00:00 "a" 1.0 2021-12-16 00:10:00 null 1.333333 2021-12-16 00:20:00 null 1.666667 2021-12-16 00:30:00 "a" 2.0 2021-12-16 00:40:00 null 2.333333 … … … 2021-12-16 02:20:00 null 5.666667 2021-12-16 02:30:00 "a" 6.0 2021-12-16 02:40:00 null 6.333333 2021-12-16 02:50:00 null 6.666667 2021-12-16 03:00:00 "a" 7.0