Julia DataFrames ----LINQ (语言集成查询)

最新推荐文章于 2024-06-11 23:51:21 发布

October-

最新推荐文章于 2024-06-11 23:51:21 发布

阅读量886

点赞数 1

分类专栏： julia机器学习&科学计算文章标签： Julia DataFramesMeta Query LINQ

本文链接：https://blog.csdn.net/weixin_41715077/article/details/104031753

版权

julia机器学习&科学计算专栏收录该内容

70 篇文章 28 订阅

订阅专栏

1、功能概述：

2、DataFramesMeta 使用代码样例

3、Query代码样例

1、功能概述：

LINQ（语言集成查询），原是微软的一项技术，新增一种自然查询的SQL语法到.NET Framework的编程语言中。从技术角度而言，LINQ定义了大约40个查询操作符，如select、from、in、where以及order by（C#中）。

dplyr:是一个强大的R软件包，用于处理，清理和汇总非结构化数据。简而言之，它使得R中的数据探索和数据操作变得简单快捷。dplyr软件包的设计目的是进行数据分析。dplyr函数的名称类似于SQL命令，如用于选择变量的select（），group_by（） - 通过对变量进行分组来组合数据，join（） - 将两个数据集合在一起。还包括inner_join（）和left_join（）。它也支持SQL常用的子查询。常用的dplyr的函数

Julia 目前有两个库提供了类似 LINQ 的功能：DataFramesMeta.jl 和 Query.jl（参考：Query.jl documentation）。相对 DataFramesMeta.jl 而言，Query.jl 的功能更强大一些，支持的数据结构也更多，包括 JuliaDB.jl。下面是一些功能对比。

DataFramesMeta.jl	Query.jl	dplyr	LINQ
------------------	------------	----------------	------------
@where	@filter	filter	Where
@transform	@mutate	mutate	Select (?)
@by	@groupby		GroupBy
groupby		group_by
@based_on		summarise/do
@orderby	@orderby	arrange	OrderBy
@select	@select	select	Select
	@join/@groupjoin #对列的计算
	@rename
	@map()/@mapmany/@take/@drop/@unique #对列的计算

2、DataFramesMeta 使用代码样例

using DataFrames, DataFramesMeta
using Lazy
using BenchmarkTools
import Lazy:@>
import DataFramesMeta:@select,@orderby,@transform,@by,@orderby,@with,groupby

# DataFramesMeta usage examples
# Using each macro on its own.
# @with is the basic macro that the other metaprogramming tools build on.
# @with creates a function, so the code inside @with runs in a local scope. It can
# read variables from the enclosing (parent) scope.
# Writing to a variable of the parent scope depends on what kind of scope that is:
# if the parent scope is the global scope, a variable cannot be assigned without the
# `global` keyword; if the parent scope is a local scope (e.g. inside a function or a
# `let` block), no `global` keyword is needed to assign to it.
df = DataFrame(x = 1:3, y = [2, 1, 2])
x = [2, 1, 0]

# Inside the macros a `:name` symbol stands for the column of that name in `df`.
@with(df, :y .+ 1)
# `:x` is the data-frame column, the bare `x` is the vector defined above.
@with(df, :x + x)
@with(df, df[:x .> 1, ^(:y)]) # The ^ means leave the :y alone
# Row filter: keep the rows where column x is greater than 1.
@where(df, :x .> 1)
# Column projection, optionally with new computed columns (`x2`).
@select(df, :x, :y)
@select(df, x2 = 2 * :x, :x, :y)
# Add computed columns `newCol` and `anotherCol`.
@transform(df, newCol = cos.(:x), anotherCol = :x.^2 + 3*:x .+ 4)
# Row-by-row: where x > y, replace x with x * y.
# NOTE(review): the `!` suggests `df` is modified in place — confirm against the
# DataFramesMeta version in use.
@byrow! df if :x > :y; :x = :x * :y end

# @linq is a macro for writing a chain of operations; every other macro defined by
# DataFramesMeta can be linked together through @linq.
# Pro: compared with calling each macro separately, the chain reads more clearly and
#      more obviously, and this approach avoids crowding the limited macro namespace.
# Con: the main drawback is that a lot of detail is hidden.
df = DataFrame(name=["John", "Sally", "Roger"],
                      age=[54., 34., 79.],
                      children=[0, 2, 4])
# Keep the rows with age above 40, then project `children` (renamed to
# `number_of_children`) and `name`.
@linq df |>
    where(:age .> 40) |>
    select(number_of_children=:children, :name)
    # 2×2 DataFrame
    # │ Row │ number_of_children │ name   │
    # │     │ Int64              │ String │
    # ├─────┼────────────────────┼────────┤
    # │ 1   │ 0                  │ John   │
    # │ 2   │ 4                  │ Roger  │


# Twelve rows: `key` cycles through 1, 2, 3 four times; `value` runs from 1 to 12.
df = DataFrame(key=repeat(1:3, 4), value=1:12)
# Filter to value > 3, aggregate the minimum and maximum of `value` per key, then
# report the spread (max - min) of each key as `range`.
@linq df |>
    where(:value .> 3) |>
    by(:key, min=minimum(:value), max=maximum(:value)) |>
    select(:key, range=:max - :min)
    # 3×2 DataFrame
    # │ Row │ key   │ range │
    # │     │ Int64 │ Int64 │
    # ├─────┼───────┼───────┤
    # │ 1   │ 1     │ 6     │
    # │ 2   │ 2     │ 6     │
    # │ 3   │ 3     │ 6     │

# Filter to value > 9, group by key, and add `value0`: each value minus the minimum
# of its group. Only one row per key survives the filter, hence the zeros below.
@linq df |>
    where(:value .> 9) |>
    groupby(:key) |>
    transform(value0 = :value .- minimum(:value))
    # 3×3 DataFrame
    # │ Row │ key   │ value │ value0 │
    # │     │ Int64 │ Int64 │ Int64  │
    # ├─────┼───────┼───────┼────────┤
    # │ 1   │ 1     │ 10    │ 0      │
    # │ 2   │ 2     │ 11    │ 0      │
    # │ 3   │ 3     │ 12    │ 0      │


# `mean` is provided by the Statistics standard library. It is loaded here so that
# both pipelines below run without relying on an earlier `using Statistics`.
using Statistics

df = DataFrame(x = 1:3, y = [2, 1, 2], a = 1:3, b = [2, 1, 2])

# Full chain: overwrite y with 10x, keep the rows with a > 2, average x and y per
# value of b, sort by meanX, and finally project the columns (renaming b to `var`).
x_thread = @linq df |>
        transform(y = 10 * :x) |>
        where(:a .> 2) |>
        by(:b, meanX = mean(:x), meanY = mean(:y)) |>
        orderby(:meanX) |>
        select(:meanX, :meanY, var = :b)
        # 1×3 DataFrame
        # │ Row │ meanX   │ meanY   │ var   │
        # │     │ Float64 │ Float64 │ Int64 │
        # ├─────┼─────────┼─────────┼───────┤
        # │ 1   │ 3.0     │ 30.0    │ 2     │

# The same chain written another way, using the `@>` macro from Lazy.jl
# (install it with Pkg.add("Lazy")). `@>` threads `df` through each macro call
# as its first argument.
x_thread = @> begin
    df
    @transform(y = 10 * :x)
    @where(:a .> 2)
    @by(:b, meanX = mean(:x), meanY = mean(:y))
    @orderby(:meanX)
    @select(:meanX, :meanY, var = :b)
end

3、Query代码样例

# Packages for the Query.jl examples; Statistics supplies `mean` for the grouping
# example further down.
using DataFrames,  Query, Statistics
# Bring Query's `@select` into scope explicitly.
# NOTE(review): DataFramesMeta also defines `@select`, `@where` and `@orderby`; this
# section presumably has to run in a session where those names are not already
# bound to DataFramesMeta — confirm.
import Query:@select
# NOTE(review): `export` of a package name at script top level looks unintended;
# possibly an attempt to deal with the name clash mentioned above — confirm.
export DataFramesMeta

# Every Query macro can be used on its own, and the macros can be linked together
# with the `|>` pipe operator.
# Note the two symbols `_` and `__`: they can be read as anonymous-function
# arguments, and this syntax only applies to the standalone query commands.
# `_` stands for the first data-structure argument, `__` for the second.
df = DataFrame(fruit=["Apple","Banana","Cherry"],amount=[2,6,1000],price=[1.2,2.0,0.4],isyellow=[false,true,false])
# 3×4 DataFrame
# │ Row │ fruit  │ amount │ price   │ isyellow │
# │     │ String │ Int64  │ Float64 │ Bool     │
# ├─────┼────────┼────────┼─────────┼──────────┤
# │ 1   │ Apple  │ 2      │ 1.2     │ false    │
# │ 2   │ Banana │ 6      │ 2.0     │ true     │
# │ 3   │ Cherry │ 1000   │ 0.4     │ false    │

# Column selection: take columns 2 to 3 (amount, price), add the columns whose name
# contains "ui" (fruit), then remove `amount` again.
q1 = df |> @select(2:3, occursin("ui"), -:amount) |> DataFrame
# 3×2 DataFrame
# │ Row │ price   │ fruit  │
# │     │ Float64 │ String │
# ├─────┼─────────┼────────┤
# │ 1   │ 1.2     │ Apple  │
# │ 2   │ 2.0     │ Banana │
# │ 3   │ 0.4     │ Cherry │

# Row filter: `_` is the current row.
x = df |> @filter(_.amount > 6 && _.price > 0) |> DataFrame
# 1×4 DataFrame
# │ Row │ fruit  │ amount │ price   │ isyellow │
# │     │ String │ Int64  │ Float64 │ Bool     │
# ├─────┼────────┼────────┼─────────┼──────────┤
# │ 1   │ Cherry │ 1000   │ 0.4     │ false    │

# Replace the existing columns `price` and `isyellow` with values computed per row.
q = df |> @mutate(price = 2 * _.price + _.amount, isyellow = _.fruit == "Apple") |> DataFrame
# 3×4 DataFrame
# │ Row │ fruit  │ amount │ price   │ isyellow │
# │     │ String │ Int64  │ Float64 │ Bool     │
# ├─────┼────────┼────────┼─────────┼──────────┤
# │ 1   │ Apple  │ 2      │ 4.4     │ true     │
# │ 2   │ Banana │ 6      │ 10.0    │ false    │
# │ 3   │ Cherry │ 1000   │ 1000.8  │ false    │

df_parents = DataFrame(Name=["John", "Sally"])
df_children = DataFrame(Name=["Bill", "Joe", "Mary"], Parent=["John", "John", "Sally"])

# Join parents to children on parent.Name == child.Parent. In the key selectors `_`
# is a row of the respective table; in the result selector `_` is the parent row and
# `__` the child row.
df_parents |> @join(df_children, _.Name, _.Parent, {Parent=_.Name, Child=__.Name}) |> DataFrame
# 3×2 DataFrame
# │ Row │ Parent │ Child  │
# │     │ String │ String │
# ├─────┼────────┼────────┤
# │ 1   │ John   │ Bill   │
# │ 2   │ John   │ Joe    │
# │ 3   │ Sally  │ Mary   │


# A chained query normally starts with @from and ends with @collect.
# @collect specifies the type to return; without it the query returns a standard
# Julia iterator.
# @select: use {} when the result is to be returned as a DataFrame; without {} the
# return type is Vector{Tuple{...}}.
df = DataFrame(name=["John", "Sally", "Roger"],
                      age=[54., 34., 79.],
                      children=[0, 2, 4])

# Filter, sort, then select.
q1 = @from i in df begin
    @where i.age > 40
    @orderby i.age
    @select {number_of_children=i.children, i.name}
    @collect DataFrame
end
# Expected output for the `df` defined above (John is 54, Roger is 79; Sally, 34,
# is filtered out):
# 2×2 DataFrame
# │ Row │ number_of_children │ name   │
# │     │ Int64              │ String │
# ├─────┼────────────────────┼────────┤
# │ 1   │ 0                  │ John   │
# │ 2   │ 4                  │ Roger  │

# A bare @collect returns a Vector{Tuple{...}}.
q3 = @from i in df begin
            @where i.age > 40 && i.children > 0
            @select i.name,i.age
            @collect
end
# Expected output for the `df` defined above (John has 0 children and Sally is 34,
# so only Roger passes both conditions):
# 1-element Array{Tuple{String,Float64},1}:
#  ("Roger", 79.0)

# Join operation
df1 = DataFrame(a=[1,2,3], b=[1.,2.,3.])
df2 = DataFrame(c=[2,4,2], d=["John", "Jim","Sally"])

# Inner join on df1.a == df2.c. Only a == 2 has partners in df2 (two of them), so
# the result has two rows.
x = @from i in df1 begin
    @join j in df2 on i.a equals j.c
    @select {i.a,i.b,j.c,j.d}
    @collect DataFrame
end
# 2×4 DataFrame
# │ Row │ a     │ b       │ c     │ d      │
# │     │ Int64 │ Float64 │ Int64 │ String │
# ├─────┼───────┼─────────┼───────┼────────┤
# │ 1   │ 2     │ 2.0     │ 2     │ John   │
# │ 2   │ 2     │ 2.0     │ 2     │ Sally  │


# Split-apply-combine: compute summary statistics per group of rows.
# Six rows: the three people are listed twice, the second time 3 years older; the
# first three rows carry state :a, the last three state :b.
df = DataFrame(
    name = repeat(["John", "Sally", "Kirk"]; outer = 2),
    age = [10.0, 20.0, 30.0, 13.0, 23.0, 33.0],
    children = repeat([3, 2, 2]; outer = 2),
    state = [:a, :a, :a, :b, :b, :b]
)

# Group the rows by `state`; for every group `g` report its key together with the
# mean, maximum and minimum age.
x = @from i in df begin
    @group i by i.state into g
    @select {group = key(g), mage = mean(g.age), oldest = maximum(g.age), youngest = minimum(g.age)}
    @collect DataFrame
end
# 2×4 DataFrame
# │ Row │ group  │ mage    │ oldest  │ youngest │
# │     │ Symbol │ Float64 │ Float64 │ Float64  │
# ├─────┼────────┼─────────┼─────────┼──────────┤
# │ 1   │ a      │ 20.0    │ 30.0    │ 10.0     │
# │ 2   │ b      │ 23.0    │ 33.0    │ 13.0     │

# Introduce new variables inside a query with the @let macro.
df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,2,2])
# `count` is the length of the name and `kids_per_year` is children / age; both can
# be used by the statements that follow. Only "Sally" has more than 4 characters.
x = @from i in df begin
    @let count = length(i.name)
    @let kids_per_year = i.children / i.age
    @where count > 4
    @select {Name=i.name, Count=count, KidsPerYear=kids_per_year}
    @collect DataFrame
end
# 1×3 DataFrame
# │ Row │ Name   │ Count │ KidsPerYear │
# │     │ String │ Int64 │ Float64     │
# ├─────┼────────┼───────┼─────────────┤
# │ 1   │ Sally  │ 5     │ 0.047619    │