dataframe的数据选择

import numpy as np
import pandas as pd
dates = pd.date_range('1/1/2000', periods=8)
dfa = pd.DataFrame(np.random.randn(8, 4),index=dates, columns=['A', 'B', 'C', 'D'])
dfa
ABCD
2000-01-010.080673-0.692551-0.383200-0.218372
2000-01-020.7275060.9969981.1675291.647872
2000-01-031.045695-1.170663-0.187076-0.547368
2000-01-04-0.6385131.1753561.0661860.098747
2000-01-05-0.2137620.877078-0.9298413.416403
2000-01-060.164324-0.5011470.9232300.104105
2000-01-070.544615-0.569205-0.2045081.895929
2000-01-08-1.5116570.4223400.0936480.093373
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa.A
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
2000-01-08    7
Freq: D, Name: A, dtype: int64
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column
dfa.A
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
2000-01-08    7
Freq: D, Name: A, dtype: int64
#You can also assign a dict to a row of a DataFrame:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
x.iloc[1] = {'x': 9, 'y': 99}  #先取第二行,然后再根据标签赋值
x
xy
013
1999
235
dfa[:5]
dfa[::2]
dfa[::-1] #反向输出
ABCD
2000-01-0870.4223400.0936480.093373
2000-01-076-0.569205-0.2045081.895929
2000-01-065-0.5011470.9232300.104105
2000-01-0540.877078-0.9298413.416403
2000-01-0431.1753561.0661860.098747
2000-01-032-1.170663-0.187076-0.547368
2000-01-0210.9969981.1675291.647872
2000-01-010-0.692551-0.383200-0.218372
dfa.loc["20000107":]
ABCD
2000-01-076-0.569205-0.2045081.895929
2000-01-0870.4223400.0936480.093373
dfa.loc["20000103"] > 0
A     True
B    False
C    False
D    False
Name: 2000-01-03 00:00:00, dtype: bool
dfa.loc[:,dfa.loc["20000103"] > 0]
A
2000-01-010
2000-01-021
2000-01-032
2000-01-043
2000-01-054
2000-01-065
2000-01-076
2000-01-087
dfa.iloc[:3]
ABCD
2000-01-010-0.692551-0.383200-0.218372
2000-01-0210.9969981.1675291.647872
2000-01-032-1.170663-0.187076-0.547368
 dfa.iloc[1:5, 2:4]
CD
2000-01-021.1675291.647872
2000-01-03-0.187076-0.547368
2000-01-041.0661860.098747
2000-01-05-0.9298413.416403
dfa.iloc[[1, 3, 5], [1, 3]]
BD
2000-01-020.9969981.647872
2000-01-041.1753560.098747
2000-01-06-0.5011470.104105
dfa.iloc[1:3, :]
ABCD
2000-01-0210.9969981.1675291.647872
2000-01-032-1.170663-0.187076-0.547368
dfa.loc[dfa.index[[0,2]],"A"]    #实现了数值和标签混用
2000-01-01    0
2000-01-03    2
Name: A, dtype: int64
dfa.iloc[[0,2],dfa.columns.get_loc("A")]       #实现了数值和标签混用
2000-01-01    0
2000-01-03    2
Freq: 2D, Name: A, dtype: int64
dfa.iloc[[0,2],dfa.columns.get_indexer(["A","C"])]   #实现了数值和标签混用
AC
2000-01-010-0.383200
2000-01-032-0.187076
#索引边界问题
dfa.iloc[7:10]        #该方法可以越界索引,不会报错
dfa.iloc[[6,7,9,10]]   #该方法越界索引将会报错
ABCD
2000-01-076-0.569205-0.2045081.895929
2000-01-0870.4223400.0936480.093373
#好的,接下我们实现以下根据另一个dataframe选择数据
df1 = pd.DataFrame(np.random.randn(6, 4),index=list('abcdef'),columns=list('ABCD'))
df1
ABCD
a0.547939-1.667014-1.663774-0.718178
b-0.0911700.6360300.942320-0.498451
c0.4652750.5029780.1376620.285483
d-1.531874-0.545184-0.574339-0.052297
e1.231483-0.8130940.3680481.691316
f-0.199404-1.5028150.4202480.537462
df1.loc[lambda dfa: dfa['A'] > 0, :]
ABCD
a0.547939-1.667014-1.663774-0.718178
c0.4652750.5029780.1376620.285483
e1.231483-0.8130940.3680481.691316
df1.loc[:, lambda df: ['A', 'B']]  #df1.iloc[:, lambda df: [0, 1]]
AB
a0.547939-1.667014
b-0.0911700.636030
c0.4652750.502978
d-1.531874-0.545184
e1.231483-0.813094
f-0.199404-1.502815
#where函数
#根据布尔值选择数据,并且返回与原数据相同的维度
df1.where(df1 > 0)
ABCD
a0.547939NaNNaNNaN
bNaN0.6360300.942320NaN
c0.4652750.5029780.1376620.285483
dNaNNaNNaNNaN
e1.231483NaN0.3680481.691316
fNaNNaN0.4202480.537462
dfa > 0
ABCD
2000-01-01FalseFalseFalseFalse
2000-01-02TrueTrueTrueTrue
2000-01-03TrueFalseFalseFalse
2000-01-04TrueTrueTrueTrue
2000-01-05TrueTrueFalseTrue
2000-01-06TrueFalseTrueTrue
2000-01-07TrueFalseFalseTrue
2000-01-08TrueTrueTrueTrue
dfa.iloc[0:6] > 0
ABCD
2000-01-01FalseFalseFalseFalse
2000-01-02TrueTrueTrueTrue
2000-01-03TrueFalseFalseFalse
2000-01-04TrueTrueTrueTrue
2000-01-05TrueTrueFalseTrue
2000-01-06TrueFalseTrueTrue
df1.where(dfa.iloc[0:7] > 0)  #不知道到了这一步为甚恶魔不行了
ABCD
aNaNNaNNaNNaN
bNaNNaNNaNNaN
cNaNNaNNaNNaN
dNaNNaNNaNNaN
eNaNNaNNaNNaN
fNaNNaNNaNNaN
df1.where(df1 > 0,dfa.iloc[0:7],inplace=True) #wc.居然也没有发生替换  好像是因为改变了元数据了
df1
ABCD
a0.547939NaNNaNNaN
bNaN0.6360300.942320NaN
c0.4652750.5029780.1376620.285483
dNaNNaNNaNNaN
e1.231483NaN0.3680481.691316
fNaNNaN0.4202480.537462
df1.apply(lambda x, y: x.where(x > 0, y), y=df1['A'])
ABCD
a0.5479390.5479390.5479390.547939
bNaN0.6360300.942320NaN
c0.4652750.5029780.1376620.285483
dNaNNaNNaNNaN
e1.2314831.2314830.3680481.691316
fNaNNaN0.4202480.537462
df1.where(lambda x: x > 4, lambda x: x + 10)
ABCD
a10.547939NaNNaNNaN
bNaN10.63603010.942320NaN
c10.46527510.50297810.13766210.285483
dNaNNaNNaNNaN
e11.231483NaN10.36804811.691316
fNaNNaN10.42024810.537462
#mask函数刚好与where函数相反,符合条件的被替换成NAN了
#强大简洁的query函数
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df[(df['a'] < df['b']) & (df['b'] < df['c'])]
abc
00.1537260.3301540.708700
20.2630180.3729410.709408
df.query('(a < b) & (b < c)')  #注意括号里的表达式需要引号包裹起来
abc
00.1537260.3301540.708700
20.2630180.3729410.709408
df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=list('bc'))
df
bc
087
146
203
364
471
550
621
721
866
910
#If instead you don’t want to or cannot name your index, you can use the name index in your query expression:
df.query('index < b < c') #最令人惊叹的地方
bc
146
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df
abc
00.7443270.5770670.567892
10.2470890.5075760.454971
20.0987700.9393220.905982
30.3963890.9437430.357621
40.0668920.0569970.537665
50.1942260.9451770.101624
60.6400240.5335600.793600
70.1935130.3984290.031752
80.7314690.1607660.175481
90.8592000.0892470.611211
df2 = pd.DataFrame(np.random.rand(12, 3), columns=df.columns)
df2
abc
00.4457680.9650030.793259
10.0365390.7644960.098846
20.7618570.7906460.059684
30.1183600.6138960.363632
40.7801240.8684230.832651
50.9680180.0855310.163136
60.7871290.5932850.341474
70.4208280.3590690.182200
80.9295840.8379840.789769
90.0501630.2130840.134281
100.4589490.1028850.669524
110.2222810.1789580.602505
df1.query("df2["a"] > 0 & df2["c"] < 0")
  File "<ipython-input-78-0cace329886b>", line 1
    df1.query("df2["a"] > 0 & df2["c"] < 0")
                    ^
SyntaxError: invalid syntax

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值