# from http://pandas.pydata.org/pandas-docs/stable/indexing.html
# pandas 0.22.0
import pandas as pd
import numpy as np
"""
尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。
三种主要的索引方式:indexers
.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数
.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似
[]
"""
"""
Object Type Selection Return Value Type
------------------------------------------------------------
Series series[label] scalar value
DataFrame frame[colname] Series corresponding to colname
Panel panel[itemname] DataFrame corresponding to the itemname
"""
'\n尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。\n三种主要的索引方式:indexers\n.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数\n.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似\n[]\n\n'
'\nObject Type \t\tIndexers\nSeries \t\t\t\ts.loc[indexer]\nDataFrame \t\t\tdf.loc[row_indexer,column_indexer]\nPanel \t\t\t\tp.loc[item_indexer,major_indexer,minor_indexer]\n'
# Here we construct a simple time series data set to use for illustrating the indexing functionality
# 构造时间序列,举例说明索引的功能
dates = pd.date_range('1/1/2000', periods=8)
dates
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
'2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
dtype='datetime64[ns]', freq='D')
| A | B | C | D |
---|
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
---|
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
---|
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
---|
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
---|
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
---|
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
---|
panel = pd.Panel({'one' : df, 'two' : df - df.mean()}) # 多维表格
panel
panel['one']
panel['two']
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D
| A | B | C | D |
---|
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
---|
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
---|
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
---|
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
---|
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
---|
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.350419 | 0.126827 | 1.251866 | -0.047518 |
---|
2000-01-02 | 0.327611 | -0.915900 | 0.411770 | -1.473522 |
---|
2000-01-03 | -0.389806 | -0.340913 | -0.747511 | -0.327524 |
---|
2000-01-04 | -0.272408 | -0.333933 | -1.620467 | -0.717772 |
---|
2000-01-05 | 1.049110 | 0.839337 | -0.107366 | 0.976422 |
---|
2000-01-06 | -2.162413 | 0.165307 | -0.089201 | -0.380969 |
---|
2000-01-07 | 0.407163 | 1.133198 | 0.939713 | 1.128641 |
---|
2000-01-08 | 0.690325 | -0.673923 | -0.038803 | 0.842241 |
---|
# Thus, as per above, we have the most basic indexing using []:
# 最基本的索引方法,使用[]
s = df['A']
s
dates[5] # 注意切片从0计数
s[dates[5]]
2000-01-01 0.136628
2000-01-02 0.113819
2000-01-03 -0.603598
2000-01-04 -0.486200
2000-01-05 0.835318
2000-01-06 -2.376205
2000-01-07 0.193371
2000-01-08 0.476533
Freq: D, Name: A, dtype: float64
Timestamp('2000-01-06 00:00:00', freq='D')
-2.376204948581219
# 在[]中传入列名的列表,如[ 'A', "B" ]
columns_l = ['A', 'B']
df[columns_l]
df[[ 'A', "B" ]] # 相当于上面,注意两重[]
df
| A | B |
---|
2000-01-01 | 0.324097 | 0.136628 |
---|
2000-01-02 | -0.718630 | 0.113819 |
---|
2000-01-03 | -0.143643 | -0.603598 |
---|
2000-01-04 | -0.136663 | -0.486200 |
---|
2000-01-05 | 1.036607 | 0.835318 |
---|
2000-01-06 | 0.362577 | -2.376205 |
---|
2000-01-07 | 1.330468 | 0.193371 |
---|
2000-01-08 | -0.476653 | 0.476533 |
---|
| A | B |
---|
2000-01-01 | 0.324097 | 0.136628 |
---|
2000-01-02 | -0.718630 | 0.113819 |
---|
2000-01-03 | -0.143643 | -0.603598 |
---|
2000-01-04 | -0.136663 | -0.486200 |
---|
2000-01-05 | 1.036607 | 0.835318 |
---|
2000-01-06 | 0.362577 | -2.376205 |
---|
2000-01-07 | 1.330468 | 0.193371 |
---|
2000-01-08 | -0.476653 | 0.476533 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
---|
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
---|
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
---|
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
---|
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
---|
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
---|
df[['A', 'B']] = df[['B', 'A']] # 交换两列的值
df
df.loc[:,['B', 'A']]
# 下式不能交换两列的值
# This will not modify df because the column alignment is before value assignment. ? 不理解?
# 列赋值在值赋值之前?
df.loc[:,['B', 'A']] = df[['A', 'B']]
df
# df.loc[:,['B', 'A']] = df[:, ['A', 'B']] # 错误?
# df
# 正确的方式:
df.loc[:,['B', 'A']] = df[['A', 'B']].values # 取列的值
df
| A | B | C | D |
---|
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
---|
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
---|
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
---|
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
---|
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
---|
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
---|
| B | A |
---|
2000-01-01 | 0.324097 | 0.136628 |
---|
2000-01-02 | -0.718630 | 0.113819 |
---|
2000-01-03 | -0.143643 | -0.603598 |
---|
2000-01-04 | -0.136663 | -0.486200 |
---|
2000-01-05 | 1.036607 | 0.835318 |
---|
2000-01-06 | 0.362577 | -2.376205 |
---|
2000-01-07 | 1.330468 | 0.193371 |
---|
2000-01-08 | -0.476653 | 0.476533 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
---|
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
---|
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
---|
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
---|
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
---|
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
---|
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
---|
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
---|
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
---|
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
---|
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
---|
# Attribute Access¶
# 属性访问,属性存取
# You may access an index on a Series, column on a DataFrame,
# and an item on a Panel directly as an attribute:
sa = pd.Series([1,2,3],index=list('abc'))
sa
dfa = df.copy()
dfa
a 1
b 2
c 3
dtype: int64
| A | B | C | D |
---|
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
---|
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
---|
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
---|
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
---|
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
---|
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
---|
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
---|
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
---|
# 对item column等赋值
# 就像属性一样的存取,但需要注意:
# 1. 名称应符合python命名规则,由字母、数字和下划线组成
# 2. 不能和已有的方法名称重名,例如 min
# 3. 不能与pandas内部“关键字”重名,例如 index axis items labels
# 以上情况,可以使用 [ "" ] 引用
sa.a = 5
sa
dfa.index
dfa.A = list(range(len(dfa.index)))
dfa
a 5
b 2
c 3
dtype: int64
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
'2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
dtype='datetime64[ns]', freq='D')
| A | B | C | D |
---|
2000-01-01 | 0 | 0.136628 | 0.856313 | -0.145259 |
---|
2000-01-02 | 1 | 0.113819 | 0.016217 | -1.571263 |
---|
2000-01-03 | 2 | -0.603598 | -1.143063 | -0.425266 |
---|
2000-01-04 | 3 | -0.486200 | -2.016020 | -0.815514 |
---|
2000-01-05 | 4 | 0.835318 | -0.502919 | 0.878680 |
---|
2000-01-06 | 5 | -2.376205 | -0.484754 | -0.478711 |
---|
2000-01-07 | 6 | 0.193371 | 0.544160 | 1.030900 |
---|
2000-01-08 | 7 | 0.476533 | -0.434356 | 0.744500 |
---|
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) # 字典key值为列名
x
dict(x=9, y=99)
x.iloc[1]
x.iloc[1] = dict(x=9, y=99)
x
{'x': 9, 'y': 99}
x 2
y 4
Name: 1, dtype: int64
df = pd.DataFrame({'one': [1., 2., 3.]})
df
df.two = [4, 5, 6] # 错误,不能增加一列,利用属性的方式不能对没有的列赋值
df.two # 但是增加了一项属性,而且可以取得这项属性
df
df['two'] = [4, 5, 6] # 可以增加一列
df
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
This is separate from the ipykernel package so we can avoid doing imports until
[4, 5, 6]
# Slicing ranges
# 切片范围
# iloc 方法是最稳健和兼容的
# 下面介绍 方括号 [] 操作符 作为切片
# 对series操作
# 取值
s
s[:5]
s[::2]
s[::-1]
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
Freq: D, Name: A, dtype: float64
2000-01-01 0.324097
2000-01-03 -0.143643
2000-01-05 1.036607
2000-01-07 1.330468
Freq: 2D, Name: A, dtype: float64
2000-01-08 -0.476653
2000-01-07 1.330468
2000-01-06 0.362577
2000-01-05 1.036607
2000-01-04 -0.136663
2000-01-03 -0.143643
2000-01-02 -0.718630
2000-01-01 0.324097
Freq: -1D, Name: A, dtype: float64
# 赋值
s2 = s.copy()
s2
s2[:5] = 0
s2
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
2000-01-01 0.000000
2000-01-02 0.000000
2000-01-03 0.000000
2000-01-04 0.000000
2000-01-05 0.000000
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
# 对DataFrame操作
# [] 操作 选择的是行
df
df[:3]
df[::-1]
# Selection By Label
# 通过标签选择
# 这种方式可能依靠上下文关系,有时候会调用链式赋值,这应该避免。参考:
# Returning a view versus a copy 返回视图 对比 拷贝
# When setting values in a pandas object,
# care must be taken to avoid what is called chained indexing.
# Here is an example.
[list('abcd'),list('efgh'),list('ijkl'),list('mnop')]
[['one','two'],['first','second']]
dfmi = pd.DataFrame([list('abcd'),list('efgh'),list('ijkl'),list('mnop')], # 定义四列数值
columns=pd.MultiIndex.from_product(
[['one','two'],['first','second']])) # 定义多重索引,第一个list是第一层,以下类推
dfmi
[['a', 'b', 'c', 'd'],
['e', 'f', 'g', 'h'],
['i', 'j', 'k', 'l'],
['m', 'n', 'o', 'p']]
[['one', 'two'], ['first', 'second']]
| one | two |
---|
| first | second | first | second |
---|
0 | a | b | c | d |
---|
1 | e | f | g | h |
---|
2 | i | j | k | l |
---|
3 | m | n | o | p |
---|
# Compare these two access methods:
dfmi['one']['second'] # chained
dfmi.loc[:,('one','second')]
0 b
1 f
2 j
3 n
Name: second, dtype: object
0 b
1 f
2 j
3 n
Name: (one, second), dtype: object
# 两种方式分析
# 第一种方式:链式 使用两个 []
dfmi['one'] # 第一个[] 先生成了一个DataFrame
dfmi['one']['second'] # pandas 把两个[] 作为分开的事件,他们执行分开的两步调用 __getitem__
# 第二种方式:loc
# 通过嵌套的元组切片
df_s = (slice(None), ('one', 'second'))
dfmi.loc[df_s] # 只调用了一次 __getitem__
0 b
1 f
2 j
3 n
Name: (one, second), dtype: object
# Selection By Label
# 通过标签选择
# loc要注意索引的数据类型,必须与索引的数据类型一致才可以,
# 例如 datetimeIndex 中,使用loc[2:3] ,即整数型的slice 将会出现TypeError
dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'),
index=pd.date_range('20130101',periods=5))
dfl
# dfl.loc[2:3] # 错误的loc
dfl.loc['20130102':'20130104'] # 使用可转换为datetime的字符串
dfl.loc['20130202':'20130204'] # 不报错,返回为空DataFrame
# dfl.loc['20130202'] # 报错,错误信息是index无此值
dfl.loc['20130104':'20130114'] # 只返回存在的数据
| A | B | C | D |
---|
2013-01-01 | -0.887358 | 1.068362 | 0.551961 | -0.378400 |
---|
2013-01-02 | 1.239840 | -0.986149 | -0.880655 | 2.112731 |
---|
2013-01-03 | -0.785526 | 1.583703 | -0.871005 | -0.659880 |
---|
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
---|
2013-01-05 | -0.842107 | -0.921086 | 1.020196 | -0.055930 |
---|
| A | B | C | D |
---|
2013-01-02 | 1.239840 | -0.986149 | -0.880655 | 2.112731 |
---|
2013-01-03 | -0.785526 | 1.583703 | -0.871005 | -0.659880 |
---|
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
---|
| A | B | C | D |
---|
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
---|
2013-01-05 | -0.842107 | -0.921086 | 1.020196 | -0.055930 |
---|
# loc可以使用整数,但此时的整数不代表位置,而是label
# loc是基本的取值方法
# 给loc的输入,即在[]中的值,可以是:
# 1. 单独的label e.g. 5 or 'a
# 2. labels的list ['a', 'b', 'c']
# 3. slice对象 'a':'f' !! 注意:与python的切片不同,pandas的切片包括开始和结尾,而python不包括结尾
# 4. 布尔值
# 5. 调用函数 [lambda df: df.A > 0, :]
# Series
s1 = pd.Series(np.random.randn(6),index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
s1.loc['c':] = 0
s1
a 0.796911
b -1.341250
c 0.008152
d -0.745881
e 0.674385
f 1.108411
dtype: float64
c 0.008152
d -0.745881
e 0.674385
f 1.108411
dtype: float64
-1.3412499335785426
a 0.796911
b -1.341250
c 0.000000
d 0.000000
e 0.000000
f 0.000000
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4), # 6X4 阵列
index=list('abcdef'), # 索引
columns=list('ABCD')) # 行号
df1
| A | B | C | D |
---|
a | -1.912141 | -0.835589 | -0.188341 | -1.024797 |
---|
b | -0.977498 | -2.050214 | 0.355172 | -0.291794 |
---|
c | -0.183401 | -0.376330 | -0.188848 | -2.116438 |
---|
d | -1.008359 | 0.230593 | -0.099235 | -0.426229 |
---|
e | -0.027338 | 2.125459 | 0.066200 | -0.247813 |
---|
f | -1.132103 | 1.945235 | 1.891179 | 1.549750 |
---|
df1.loc[['a', 'b', 'd'], :] # 先是行label选择,再是列label选择
df1.loc['d':, 'A':'C']
# 取得 a cross section 截面,用单个的label,返回Series
# 以下三式等同
df1.loc['a']
df1.loc['a',:]
df1.xs('a')
type(df1.loc['a'])
| A | B | C | D |
---|
a | -1.912141 | -0.835589 | -0.188341 | -1.024797 |
---|
b | -0.977498 | -2.050214 | 0.355172 | -0.291794 |
---|
d | -1.008359 | 0.230593 | -0.099235 | -0.426229 |
---|
| A | B | C |
---|
d | -1.008359 | 0.230593 | -0.099235 |
---|
e | -0.027338 | 2.125459 | 0.066200 |
---|
f | -1.132103 | 1.945235 | 1.891179 |
---|
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
pandas.core.series.Series
# 通过布尔值数组取值
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
A False
B False
C False
D False
Name: a, dtype: bool
# 获取一个值
# this is also equivalent to ``df1.at['a','A']``
df1.loc['a', 'A']
df1.at['a','A']
-1.9121410098540752
-1.9121410098540752
# Slicing with labels¶
# 用labels切片
s = pd.Series(list('abcde'), index=[0,3,2,5,4])
s
s.loc[3:5] # 包含 5, 注意不是 3 4 5 ,而是从 标签3 到 标签5
0 a
3 b
2 c
5 d
4 e
dtype: object
3 b
2 c
5 d
dtype: object
# s.loc[3:6] # 2个错误,ValueError: index must be monotonic increasing or decreasing KeyError: 6
# 如果排序的话,可以超出范围
s.sort_index() # 不改变原值
s
s.sort_index().loc[1:6] # 可以超出范围
s.sort_index().loc[6:8] # 即使一个都没有
# s.sort_index().loc[8] # 但不能是单值,必须是切片
0 a
2 c
3 b
4 e
5 d
dtype: object
0 a
3 b
2 c
5 d
4 e
dtype: object
2 c
3 b
4 e
5 d
dtype: object
Series([], dtype: object)
# Selection By Position
# 通过位置选择,仅通过基于索引的整数,与python和numpy类似,从0开始,且不包括最后一个
# iloc的输入:
# 1. 整数
# 2. 整数的list
# 3. 整数的切片
# 4. 布尔值数组
# 5. 调用函数
# Series
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1
s1.iloc[:3]
s1.iloc[3]
s1.iloc[:3] = 0
s1
0 -0.312716
2 1.425936
4 1.716575
6 2.099666
8 0.262365
dtype: float64
0 -0.312716
2 1.425936
4 1.716575
dtype: float64
2.099665679869975
0 0.000000
2 0.000000
4 0.000000
6 2.099666
8 0.262365
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),
index=list(range(0,12,2)),
columns=list(range(0,8,2)))
df1
| 0 | 2 | 4 | 6 |
---|
0 | 1.635993 | -0.512450 | 1.786760 | -0.002533 |
---|
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
---|
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
---|
6 | 0.250428 | -0.102090 | 1.566787 | -1.708521 |
---|
8 | -2.111103 | -1.232141 | 0.863753 | -0.545229 |
---|
10 | -1.762999 | 1.009840 | 0.274013 | 0.786940 |
---|
df1.iloc[:3]
df1.iloc[1:5, 2:4]
df1.iloc[[1, 3, 5], [1, 3]]
df1.iloc[1:3, :]
df1.iloc[:, 1:3]
| 0 | 2 | 4 | 6 |
---|
0 | 1.635993 | -0.512450 | 1.786760 | -0.002533 |
---|
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
---|
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
---|
| 4 | 6 |
---|
2 | -0.044987 | -1.077772 |
---|
4 | 0.127853 | -1.003384 |
---|
6 | 1.566787 | -1.708521 |
---|
8 | 0.863753 | -0.545229 |
---|
| 2 | 6 |
---|
2 | 0.275977 | -1.077772 |
---|
6 | -0.102090 | -1.708521 |
---|
10 | 1.009840 | 0.786940 |
---|
| 0 | 2 | 4 | 6 |
---|
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
---|
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
---|
| 2 | 4 |
---|
0 | -0.512450 | 1.786760 |
---|
2 | 0.275977 | -0.044987 |
---|
4 | 1.604020 | 0.127853 |
---|
6 | -0.102090 | 1.566787 |
---|
8 | -1.232141 | 0.863753 |
---|
10 | 1.009840 | 0.274013 |
---|
# this is also equivalent to ``df1.iat[1,1]``
# 取单个的值
df1.iloc[1, 1]
df1.iat[1, 1]
0.2759774784621013
0.2759774784621013
# For getting a cross section using an integer position (equiv to df.xs(1))
# 取截面,得到Series
df1.iloc[1]
df1.iloc[:,1]
0 0.173188
2 0.275977
4 -0.044987
6 -1.077772
Name: 2, dtype: float64
0 -0.512450
2 0.275977
4 1.604020
6 -0.102090
8 -1.232141
10 1.009840
Name: 2, dtype: float64
# Out of range slice indexes are handled gracefully just as in Python/Numpy.
# 超过索引的切片处理,与python和numpy一样
# 注意:不能是单独索引,或列表中,有超过界限的值,只可以是slice,即带冒号的切片才不会提示错误
x = list('abcdef')
x[4:10]
x[8:10]
s = pd.Series(x)
s.iloc[4:10]
s.iloc[8:10] # 超过界限bound返回空
['e', 'f']
[]
4 e
5 f
dtype: object
Series([], dtype: object)
dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
dfl
dfl.iloc[:, 2:3]
dfl.iloc[:, 1:3]
dfl.iloc[4:6]
| A | B |
---|
0 | -0.069941 | 1.124145 |
---|
1 | -0.025781 | 0.940736 |
---|
2 | -0.117417 | 0.503736 |
---|
3 | 0.882286 | 0.302845 |
---|
4 | -0.136374 | 0.276822 |
---|
| B |
---|
0 | 1.124145 |
---|
1 | 0.940736 |
---|
2 | 0.503736 |
---|
3 | 0.302845 |
---|
4 | 0.276822 |
---|
# Selection By Callable
# 通过调用函数进行选择
df1 = pd.DataFrame(np.random.randn(6,4),
index=list("abcdef"),
columns=list("ABCD"))
df1
df1.A
| A | B | C | D |
---|
a | -1.947578 | 0.874286 | 1.139484 | -3.004564 |
---|
b | 0.565255 | 0.028440 | 0.685688 | 0.973264 |
---|
c | -1.275992 | 0.732339 | -0.324490 | 1.116887 |
---|
d | 0.433325 | 0.002567 | -1.310127 | 0.844756 |
---|
e | 0.341412 | -0.606646 | 0.034623 | 0.772968 |
---|
f | 1.518936 | -0.590351 | 0.604839 | -1.461750 |
---|
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
df1.loc[lambda df: df.A > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
df1[lambda df: df.columns[0]]
| A | B | C | D |
---|
b | 0.565255 | 0.028440 | 0.685688 | 0.973264 |
---|
d | 0.433325 | 0.002567 | -1.310127 | 0.844756 |
---|
e | 0.341412 | -0.606646 | 0.034623 | 0.772968 |
---|
f | 1.518936 | -0.590351 | 0.604839 | -1.461750 |
---|
| A | B |
---|
a | -1.947578 | 0.874286 |
---|
b | 0.565255 | 0.028440 |
---|
c | -1.275992 | 0.732339 |
---|
d | 0.433325 | 0.002567 |
---|
e | 0.341412 | -0.606646 |
---|
f | 1.518936 | -0.590351 |
---|
| A | B |
---|
a | -1.947578 | 0.874286 |
---|
b | 0.565255 | 0.028440 |
---|
c | -1.275992 | 0.732339 |
---|
d | 0.433325 | 0.002567 |
---|
e | 0.341412 | -0.606646 |
---|
f | 1.518936 | -0.590351 |
---|
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
df1.A
df1.A.loc[lambda s: s > 0]
df1.A.loc[df1.A > 0]
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
b 0.565255
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
b 0.565255
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
# 使用这些方法或索引,可以使用链式的选择方法,而不用中间的临时变量。链式方法,不是链式[]
bb = pd.read_csv('data/baseball.csv', index_col='id')
bb.groupby(['year', 'team']).sum().loc[lambda df: df.r > 100]
# IX Indexer is Deprecated
# 不推荐使用ix
# in favor of the more strict .iloc and .loc indexers.
# 使用.iloc和.loc代替.ix
dfd = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6]},index=list('abc'))
dfd
dfd.ix[[0, 2], 'A']
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
a 1
c 3
Name: A, dtype: int64
dfd.index[[0, 2]] # 取得索引的名称
dfd.loc[dfd.index[[0, 2]], 'A']
dfd.columns.get_loc('A') # 取得列的索引值
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
Index(['a', 'c'], dtype='object')
a 1
c 3
Name: A, dtype: int64
0
a 1
c 3
Name: A, dtype: int64
# Indexing with list with missing labels is Deprecated
# 不推荐用有缺失标签的list进行索引
# using .loc or [] with a list with one or more missing labels, is deprecated, in favor of .reindex.
# 推荐使用.reindex
s = pd.Series([1, 2, 3])
s
0 1
1 2
2 3
dtype: int64
s.loc[[1, 2]] # list中的keys都存在,则没有变化
1 2
2 3
dtype: int64
s.loc[[1, 2, 3]] # 当有缺失时,赋值为NaN
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
"""Entry point for launching an IPython kernel.
1 2.0
2 3.0
3 NaN
dtype: float64
# Reindexing
s.reindex([1, 2, 3])
1 2.0
2 3.0
3 NaN
dtype: float64
# 如果仅选择有效的keys
labels = [1, 2, 3]
s.index.intersection(labels) # index和labes的交集
s.loc[s.index.intersection(labels)]
Int64Index([1, 2], dtype='int64')
1 2
2 3
dtype: int64
# reindex 索引中不能有重复的项
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s
labels = ['c', 'd']
# s.reindex(labels) # 不能reindex
a 0
a 1
b 2
c 3
dtype: int32
# 可以先把交集切出来,再进行reindex
# 但是交集不能有重复的index
s.index.intersection(labels)
s.loc[s.index.intersection(labels)]
s.loc[s.index.intersection(labels)].reindex(labels)
Index(['c'], dtype='object')
c 3
dtype: int32
c 3.0
d NaN
dtype: float64
# Selecting Random Samples
# 随机取样选择
# 默认行取样
s = pd.Series([0,1,2,3,4,5])
s
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
s.sample() # 默认取1行
s.sample(n=3) # 取3行
s.sample(frac=0.5) # 小数,行数百分数
s.sample(frac=0.8) # 小数
s
1 1
dtype: int64
3 3
4 4
5 5
dtype: int64
2 2
3 3
4 4
dtype: int64
2 2
1 1
3 3
0 0
5 5
dtype: int64
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
s.sample(n=6, replace=False) # 默认
s.sample(n=6, replace=True) # replace,不改变本身
s
5 5
4 4
1 1
0 0
2 2
3 3
dtype: int64
0 0
4 4
0 0
2 2
0 0
0 0
dtype: int64
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
# 默认每行都有同样的概率被抽样到,也可以指定每行的概率比重
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)
example_weights2 = [0.5, 0, 0, 0, 0, 0] # 权重将会自动的归一化
s.sample(n=1, weights=example_weights2)
2 2
3 3
5 5
dtype: int64
0 0
dtype: int64
# 可以指定DataFrame的某列作为权重
df2 = pd.DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
df2.sample(n = 3, weights = 'weight_column')
| col1 | weight_column |
---|
1 | 8 | 0.4 |
---|
2 | 7 | 0.1 |
---|
0 | 9 | 0.5 |
---|
# 对列进行抽样
df3 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4], 'col3':[3,4,5]})
df3
df3.sample(n=2, axis=1) # 指定axis=1,对列抽样,抽取的是列的组合
df3.sample(n=2, axis=0)
# 可以指定random seed 或者 numpy 的 randomState 对象,作为sample 随机数生成器的种子
# 一旦seed确定,随机数不变
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=200)
df3.sample(n=2, random_state=200)
# Setting With Enlargement
# loc或[] 操作 可以通过赋值不存在的键扩大Series或Dataframe
# Series
se = pd.Series([1,2,3])
se
se[5] = 5 # append
se
0 1
1 2
2 3
dtype: int64
0 1
1 2
2 3
5 5
dtype: int64
# DataFrame
dfi = pd.DataFrame(np.arange(6).reshape(3,2),columns=['A','B'])
dfi
dfi.loc[:,'C'] = dfi.loc[:,'A'] # enlarge 增加列
dfi
dfi.loc[3] = 5 # append 增加行
dfi
# Fast scalar value getting and setting
# 快速取得或设置标量的值
# [] 可以进行很多操作,所以它为了知道你要进行那种操作,会有一点计算开销
# 最快的标量访问方式是使用 at 和 iat 方法,他们可以在所有的数据结构上使用
# at 类似于 loc 基于 label
# iat 类似于 iloc 基于 整数index
s = pd.Series([0,1,2,3,4,5])
s
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
| A | B | C | D |
---|
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 |
---|
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 |
---|
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 |
---|
2000-01-04 | -0.846971 | 0.189231 | -0.287885 | -0.561706 |
---|
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 |
---|
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 |
---|
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 |
---|
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 |
---|
# get value
s.iat[5]
df.at[dates[5], 'A']
df.iat[3, 0]
5
0.40643702489386246
-0.8469710793801154
# set value
df.at[dates[5], 'E'] = 7 # 没有列就增加一列,没有值的默认赋值为nan
df
| A | B | C | D | E |
---|
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN |
---|
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN |
---|
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN |
---|
2000-01-04 | -0.846971 | 0.189231 | -0.287885 | -0.561706 | NaN |
---|
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN |
---|
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 |
---|
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN |
---|
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN |
---|
df.iat[3, 0] = 7
df
| A | B | C | D | E |
---|
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN |
---|
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN |
---|
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN |
---|
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN |
---|
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN |
---|
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 |
---|
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN |
---|
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN |
---|
df.at[dates[-1]+1, 0] = 7 # 行和列都扩展了
df
| A | B | C | D | E | 0 |
---|
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN | NaN |
---|
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN | NaN |
---|
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN | NaN |
---|
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN | NaN |
---|
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN | NaN |
---|
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 | NaN |
---|
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN | NaN |
---|
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN | NaN |
---|
2000-01-09 | NaN | NaN | NaN | NaN | NaN | 7.0 |
---|
# Boolean indexing
# 使用布尔向量过滤数据
# 操作符包括:| for or, & for and, and ~ for not ,必须用括号进行分组
s = pd.Series(range(-3, 4)) # 不用把range先list,直接可以series
s
0 -3
1 -2
2 -1
3 0
4 1
5 2
6 3
dtype: int64
s[s >= 0] # 直接用series,不用取其值
s[~(s < 0)]
s[(s < -1) | (s > 0.5)]
3 0
4 1
5 2
6 3
dtype: int64
3 0
4 1
5 2
6 3
dtype: int64
0 -3
1 -2
4 1
5 2
6 3
dtype: int64
df[df['A'] > 0]
| A | B | C | D | E | 0 |
---|
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN | NaN |
---|
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN | NaN |
---|
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 | NaN |
---|
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN | NaN |
---|
# List comprehensions and map method of Series can also be used to produce more complex criteria:
# 列表生成式和map方法 也可以用来生成 更复杂的条件判断
df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
'c' : np.random.randn(7)})
df2
| a | b | c |
---|
0 | one | x | 1.706402 |
---|
1 | one | y | 0.491977 |
---|
2 | two | y | 1.357986 |
---|
3 | three | x | -1.023513 |
---|
4 | two | y | -0.653028 |
---|
5 | one | x | 0.041052 |
---|
6 | six | x | 1.021882 |
---|
# 判别式
criterion = df2['a'].map(lambda x: x.startswith('t')) # 选择 two 和 three
criterion
0 False
1 False
2 True
3 True
4 True
5 False
6 False
Name: a, dtype: bool
df2[criterion] # 根据 a 列的判别式 选择数据表的一部分,包含其他列
| a | b | c |
---|
2 | two | y | 1.357986 |
---|
3 | three | x | -1.023513 |
---|
4 | two | y | -0.653028 |
---|
# 等价的方式,但是速度慢一些
df2[[x.startswith('t') for x in df2['a']]] # 不适用map,而是用列表生成式
| a | b | c |
---|
2 | two | y | 1.357986 |
---|
3 | three | x | -1.023513 |
---|
4 | two | y | -0.653028 |
---|
# 复合判断
df2[criterion & (df2['b'] == 'x')] # a 列 和 b 列 均符合某类要求
# 布尔向量选择 可以与索引选择一并使用
df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] # 只选择 b 和 c 两列,不选择c列
# Indexing with isin
# 用isin索引
# series
# 对列数据进行判断
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
4 0
3 1
2 2
1 3
0 4
dtype: int64
s.isin([2, 4, 6]) # 列数据中是否存在列表中的值,返回布尔值
s[s.isin([2, 4, 6])] # 可以利用返回的布尔值进行选择
4 False
3 False
2 True
1 False
0 True
dtype: bool
2 2
0 4
dtype: int64
# 也可以对index obj 进行筛选
s[s.index.isin([2, 4, 6])]
s.reindex([2, 4, 6]) # reindex不同,列表中没有的值返回了nan,且原来的int64返回了float64数据类型
4 0
2 2
dtype: int64
2 2.0
4 0.0
6 NaN
dtype: float64
# 对应多重索引,可以单独选择索引级别
s_mi = pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
0 a 0
b 1
c 2
1 a 3
b 4
c 5
dtype: int32
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)] # 指定索引级别,在第二级索引中选择
0 c 2
1 a 3
dtype: int32
0 a 0
c 2
1 a 3
c 5
dtype: int32
# DataFrame
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],'ids2': ['a', 'n', 'c', 'n']})
df
| ids | ids2 | vals |
---|
0 | a | a | 1 |
---|
1 | b | n | 2 |
---|
2 | f | c | 3 |
---|
3 | n | n | 4 |
---|
values = ['a', 'b', 1, 3]
df.isin(values) # 匹配所有的值
| ids | ids2 | vals |
---|
0 | True | True | True |
---|
1 | True | False | False |
---|
2 | False | False | True |
---|
3 | False | False | False |
---|
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values) # 对不用的列,分别匹配某些值
row_mask
# ?row_mask.all # Return whether all elements are True over requested axis
row_mask = row_mask.all(1)
row_mask
df[row_mask] # 选择全是True的行
| ids | ids2 | vals |
---|
0 | True | True | True |
---|
1 | True | False | False |
---|
2 | False | True | True |
---|
3 | False | False | False |
---|
0 True
1 False
2 False
3 False
dtype: bool
# The where() Method and Masking
# To guarantee that selection output has the same shape as the original data
# 保证选集输出与原数据有同样的shape形态
# series
s
s[s > 0] # 只返回满足的项
s.where(s > 0) # 全部返回,不满足的项,赋值nan
4 0
3 1
2 2
1 3
0 4
dtype: int64
3 1
2 2
1 3
0 4
dtype: int64
4 NaN
3 1.0
2 2.0
1 3.0
0 4.0
dtype: float64
# DataFrame 使用布尔值选择时,返回值保留原数据结构
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
df[df < 0]
df.where(df < 0) # 等价于上式
df.where(df < 0, -df) # where 可以传入另一个参数,用于替换条件为 False 的项,返回copy数据拷贝,不修改原值
df
| A | B | C | D |
---|
2000-01-01 | NaN | -0.500453 | NaN | NaN |
---|
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
---|
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
---|
2000-01-05 | NaN | NaN | NaN | -0.089382 |
---|
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
---|
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
---|
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | NaN | -0.500453 | NaN | NaN |
---|
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
---|
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
---|
2000-01-05 | NaN | NaN | NaN | -0.089382 |
---|
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
---|
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
---|
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | -0.383691 | -0.500453 | -0.101632 | -0.848213 |
---|
2000-01-02 | -1.048619 | -0.856605 | -0.295425 | -1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | -0.278514 | -1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | -1.457230 |
---|
2000-01-05 | -0.656919 | -1.081810 | -1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | -0.533706 | -1.084442 | -1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | -0.565496 | -0.269414 |
---|
2000-01-08 | -0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
# You may wish to set values based on some boolean criteria. This can be done intuitively like so:
# 设置基于布尔值的项值
s2 = s.copy()
s2
s2[s2 < 3] = 0
s2
df2 = df.copy()
df2
df2[df2 < 0] = 0
df2
4 0
3 1
2 2
1 3
0 4
dtype: int64
4 0
3 0
2 0
1 3
0 4
dtype: int64
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | 0.000000 | 0.101632 | 0.848213 |
---|
2000-01-02 | 0.000000 | 0.856605 | 0.000000 | 1.060710 |
---|
2000-01-03 | 0.000000 | 0.000000 | 0.278514 | 1.088771 |
---|
2000-01-04 | 0.000000 | 0.000000 | 0.000000 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.000000 |
---|
2000-01-06 | 0.000000 | 0.533706 | 0.000000 | 1.824709 |
---|
2000-01-07 | 0.000000 | 0.000000 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | 0.000000 | 0.000000 | 0.000000 |
---|
# where 默认返回修改后的数据拷贝,原值不变;可以设置inplace参数,直接修改原值,而不是创建拷贝
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
| A | B | C | D |
---|
2000-01-01 | 0.383691 | 0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | 1.048619 | 0.856605 | 0.295425 | 1.060710 |
---|
2000-01-03 | 0.214918 | 1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | 2.326385 | 0.225754 | 1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.089382 |
---|
2000-01-06 | 1.041194 | 0.533706 | 1.084442 | 1.824709 |
---|
2000-01-07 | 1.040706 | 2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | 0.075381 | 0.951126 | 0.347865 |
---|
# 注意 :pandas 和 numpy 的where方法不一样
# 一般的,df1.where(m, df2) 相当于 np.where(m, df1, df2)
df.where(df < 0, -df) == np.where(df < 0, df, -df)
| A | B | C | D |
---|
2000-01-01 | True | True | True | True |
---|
2000-01-02 | True | True | True | True |
---|
2000-01-03 | True | True | True | True |
---|
2000-01-04 | True | True | True | True |
---|
2000-01-05 | True | True | True | True |
---|
2000-01-06 | True | True | True | True |
---|
2000-01-07 | True | True | True | True |
---|
2000-01-08 | True | True | True | True |
---|
# alignment 定位,对齐
# where 可以选择局部(部分区域)的布尔条件
# This is analogous to partial setting via .loc (but on the contents rather than the axis labels)
df2 = df.copy()
df2
df2[1:4] # 行选择
df2.where(df2[1:4] > 0, 3) # 对不符合项的值进行赋值!
df2[ df2[1:4] > 0 ] = 3 # 只定位部分区域,对符合项的值进行赋值
df2
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
| A | B | C | D |
---|
2000-01-01 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
---|
2000-01-02 | 3.0 | 0.856605 | 3.000000 | 1.060710 |
---|
2000-01-03 | 3.0 | 3.000000 | 0.278514 | 1.088771 |
---|
2000-01-04 | 3.0 | 3.000000 | 3.000000 | 1.457230 |
---|
2000-01-05 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
---|
2000-01-06 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
---|
2000-01-07 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
---|
2000-01-08 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 3.000000 | -0.295425 | 3.000000 |
---|
2000-01-03 | -0.214918 | -1.989228 | 3.000000 | 3.000000 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 3.000000 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
# Where can also accept axis and level parameters to align the input when performing the where.
# where 可以接受 轴参数axis 和 级别参数level
df2 = df.copy()
df2
df2.where(df2>0,df2['A'],axis='index') # 小于等于0的值,赋值为A列的值
df2
df2.apply(lambda x, y: x.where(x>0,y), y=df['A']) # 相当于上式,但此式较慢,同样不改变原值,而是生成一个拷贝copy
df2
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | 0.383691 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -1.048619 | 1.060710 |
---|
2000-01-03 | -0.214918 | -0.214918 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -2.326385 | -2.326385 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.656919 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.041194 | 1.824709 |
---|
2000-01-07 | -1.040706 | -1.040706 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | 0.166739 | 0.166739 | 0.166739 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | 0.383691 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -1.048619 | 1.060710 |
---|
2000-01-03 | -0.214918 | -0.214918 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -2.326385 | -2.326385 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.656919 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.041194 | 1.824709 |
---|
2000-01-07 | -1.040706 | -1.040706 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | 0.166739 | 0.166739 | 0.166739 |
---|
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
# where 可以接受一个函数调用,此函数只能有一个参数,且返回有效的布尔条件
df3 = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6],'C': [7, 8, 9]})
df3
df3.where(lambda x: x > 4, lambda x: x + 10) # x<=4 的值 赋值为 x+10
# mask 遮罩 mask is the inverse boolean operation of where. 反向的布尔值操作
s
s.mask(s >= 0) # 选择<0 的值, 不符合项 置为nan
df
df.mask(df >= 0)
4 0
3 1
2 2
1 3
0 4
dtype: int64
4 NaN
3 NaN
2 NaN
1 NaN
0 NaN
dtype: float64
| A | B | C | D |
---|
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
---|
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
---|
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
---|
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
---|
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
---|
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
---|
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
---|
| A | B | C | D |
---|
2000-01-01 | NaN | -0.500453 | NaN | NaN |
---|
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
---|
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
---|
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
---|
2000-01-05 | NaN | NaN | NaN | -0.089382 |
---|
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
---|
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
---|
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
---|
# The query() Method (Experimental) 对 DataFrame对象 使用表达式进行选择
# 例如 :选择b列中 在a列和c列两值中间的 行,a<b<c
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
| a | b | c |
---|
0 | 0.283094 | 0.051807 | 0.126487 |
---|
1 | 0.020097 | 0.373023 | 0.147193 |
---|
2 | 0.091921 | 0.830956 | 0.143214 |
---|
3 | 0.340304 | 0.527246 | 0.709769 |
---|
4 | 0.651722 | 0.344524 | 0.151233 |
---|
5 | 0.396685 | 0.524376 | 0.540237 |
---|
6 | 0.502751 | 0.627708 | 0.708038 |
---|
7 | 0.472338 | 0.269770 | 0.586165 |
---|
8 | 0.937522 | 0.239560 | 0.861873 |
---|
9 | 0.661879 | 0.465536 | 0.271580 |
---|
# pure python
df[(df.a < df.b) & (df.b < df.c)]
| a | b | c |
---|
3 | 0.340304 | 0.527246 | 0.709769 |
---|
5 | 0.396685 | 0.524376 | 0.540237 |
---|
6 | 0.502751 | 0.627708 | 0.708038 |
---|
# query,传入的是str表达式
df.query('(a < b) & (b < c)') # 比纯py慢?!
| a | b | c |
---|
3 | 0.340304 | 0.527246 | 0.709769 |
---|
5 | 0.396685 | 0.524376 | 0.540237 |
---|
6 | 0.502751 | 0.627708 | 0.708038 |
---|
# 对于命名的index索引是低效的,比利用列名称
# 而且如果索引的名称和列名同名,列名优先
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
# df
df.index.name = "a"
df
| b | c |
---|
a | | |
---|
0 | 0 | 4 |
---|
1 | 1 | 2 |
---|
2 | 3 | 0 |
---|
3 | 0 | 3 |
---|
4 | 4 | 3 |
---|
5 | 0 | 0 |
---|
6 | 0 | 4 |
---|
7 | 1 | 4 |
---|
8 | 3 | 0 |
---|
9 | 1 | 4 |
---|
df.query('a <= b and b <= c')
# 可以不使用索引的名称,而是直接用index,这样同时可以避免与列名重名
df.query('index <= b <= c')
# MultiIndex query() Syntax
# 对于多重索引
n = 10
colors = np.random.choice(['red', 'green'], size=n)
colors
foods = np.random.choice(['eggs', 'ham'], size=n)
foods
array(['green', 'green', 'green', 'green', 'green', 'green', 'green',
'red', 'green', 'green'], dtype='<U5')
array(['ham', 'eggs', 'eggs', 'eggs', 'eggs', 'ham', 'eggs', 'eggs',
'ham', 'ham'], dtype='<U4')
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
MultiIndex(levels=[['green', 'red'], ['eggs', 'ham']],
labels=[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 1, 1]],
names=['color', 'food'])
| | 0 | 1 |
---|
color | food | | |
---|
green | ham | -0.295761 | 1.399719 |
---|
eggs | 1.807185 | 0.498136 |
---|
eggs | -0.119640 | 2.279162 |
---|
eggs | -0.238709 | -0.650418 |
---|
eggs | 2.235827 | 1.066954 |
---|
ham | 1.156794 | 1.694717 |
---|
eggs | -0.037158 | -0.529213 |
---|
red | eggs | 0.046799 | 0.763592 |
---|
green | ham | 0.488240 | -0.455112 |
---|
ham | -0.169486 | -0.646891 |
---|
df.query('color == "red"')
| | 0 | 1 |
---|
color | food | | |
---|
red | eggs | 0.046799 | 0.763592 |
---|
# 如果多重索引没有命名 可以使用特殊的名字
df.index.names = [None, None]
df
df.query('ilevel_0 == "red"') # The convention is ilevel_0, which means “index level 0” for the 0th level of the index.
| | 0 | 1 |
---|
green | ham | -0.295761 | 1.399719 |
---|
eggs | 1.807185 | 0.498136 |
---|
eggs | -0.119640 | 2.279162 |
---|
eggs | -0.238709 | -0.650418 |
---|
eggs | 2.235827 | 1.066954 |
---|
ham | 1.156794 | 1.694717 |
---|
eggs | -0.037158 | -0.529213 |
---|
red | eggs | 0.046799 | 0.763592 |
---|
green | ham | 0.488240 | -0.455112 |
---|
ham | -0.169486 | -0.646891 |
---|
| | 0 | 1 |
---|
red | eggs | 0.046799 | 0.763592 |
---|
# query() Use Cases 使用示例
# A use case for query() is when you have a collection of DataFrame objects
# that have a subset of column names (or index levels/names) in common. ## 有公共列名称子集的df
# You can pass the same query to both frames
# without having to specify which frame you’re interested in querying. ## 传入同样的query
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df
df2
| a | b | c |
---|
0 | 0.096087 | 0.799309 | 0.112070 |
---|
1 | 0.083101 | 0.505151 | 0.830588 |
---|
2 | 0.501472 | 0.692792 | 0.663570 |
---|
3 | 0.872821 | 0.442541 | 0.904903 |
---|
4 | 0.513651 | 0.375617 | 0.786898 |
---|
5 | 0.184368 | 0.285290 | 0.055147 |
---|
6 | 0.284951 | 0.243556 | 0.732871 |
---|
7 | 0.431525 | 0.676385 | 0.458296 |
---|
8 | 0.743057 | 0.326079 | 0.434655 |
---|
9 | 0.610921 | 0.717639 | 0.580765 |
---|
| a | b | c |
---|
0 | 0.804932 | 0.533058 | 0.713190 |
---|
1 | 0.305893 | 0.460048 | 0.879321 |
---|
2 | 0.344438 | 0.970870 | 0.685098 |
---|
3 | 0.348010 | 0.839449 | 0.799309 |
---|
4 | 0.844713 | 0.267562 | 0.771202 |
---|
5 | 0.403534 | 0.088786 | 0.950782 |
---|
6 | 0.181616 | 0.567118 | 0.989711 |
---|
7 | 0.363736 | 0.852080 | 0.140771 |
---|
8 | 0.866127 | 0.285365 | 0.025491 |
---|
9 | 0.329751 | 0.121716 | 0.782729 |
---|
10 | 0.029253 | 0.419409 | 0.051255 |
---|
11 | 0.057406 | 0.106595 | 0.559687 |
---|
expr = '0.0 <= a <= c <= 0.5'
mp = map(lambda frame: frame.query(expr), [df, df2]) # 同一个表达式,作用在有同样列名的多个df上
for i in mp:
print(i)
a b c
0 0.096087 0.799309 0.112070
7 0.431525 0.676385 0.458296
a b c
10 0.029253 0.419409 0.051255
# query() Python versus pandas Syntax Comparison
# Full numpy-like syntax
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
| a | b | c |
---|
0 | 8 | 4 | 5 |
---|
1 | 7 | 4 | 7 |
---|
2 | 9 | 3 | 2 |
---|
3 | 4 | 8 | 7 |
---|
4 | 4 | 5 | 8 |
---|
5 | 7 | 8 | 8 |
---|
6 | 1 | 7 | 6 |
---|
7 | 5 | 9 | 1 |
---|
8 | 3 | 3 | 8 |
---|
9 | 7 | 1 | 5 |
---|
df.query('(a < b) & (b < c)')
df[(df.a < df.b) & (df.b < df.c)]
# 几种其他的不同的写法
df.query('a < b & b < c') # 去掉括号
df.query('a < b and b < c') # 使用英文and
df.query('a < b < c') # 连写,优雅的表达
# The in and not in operators
# get all rows where columns "a" and "b" have overlapping values
# 得到a列和b列有重叠值的行,a列中的值在 in b列的值中
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
'c': np.random.randint(5, size=12),
'd': np.random.randint(9, size=12)})
df
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
2 | b | a | 3 | 3 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
6 | d | b | 3 | 8 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
9 | e | c | 0 | 1 |
---|
10 | f | c | 0 | 1 |
---|
11 | f | c | 3 | 1 |
---|
df.query('a in b') # 第一次运行113ms 第二次明显加快 25ms
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
2 | b | a | 3 | 3 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
df[df.a.isin(df.b)] # How you'd do it in pure Python 仍然比query快 35ms
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
2 | b | a | 3 | 3 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
df[~df.a.isin(df.b)]
| a | b | c | d |
---|
6 | d | b | 3 | 8 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
9 | e | c | 0 | 1 |
---|
10 | f | c | 0 | 1 |
---|
11 | f | c | 3 | 1 |
---|
df.query('a not in b')
| a | b | c | d |
---|
6 | d | b | 3 | 8 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
9 | e | c | 0 | 1 |
---|
10 | f | c | 0 | 1 |
---|
11 | f | c | 3 | 1 |
---|
# rows where cols a and b have overlapping values and col c's values are less than col d's
df.query('a in b and c < d')
df[df.a.isin(df.b) & (df.c < df.d)]
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
# 注意:?????
# Note that in and not in are evaluated in Python, since numexpr has no equivalent of this operation.
# However, only the in/not in expression itself is evaluated in vanilla Python.
# For example, in the expression
df.query('a in b + c + d')
# (b + c + d) is evaluated by numexpr and then the in operation is evaluated in plain Python. In general, any operations that can be evaluated using numexpr will be.
# Special use of the == operator with list objects
# 特别的用法,== 用于list
# Comparing a list of values to a column using ==/!= works similarly to in/not in
# 用==/!=比较列表的值与列的值,类似于in/not in
df
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
2 | b | a | 3 | 3 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
6 | d | b | 3 | 8 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
9 | e | c | 0 | 1 |
---|
10 | f | c | 0 | 1 |
---|
11 | f | c | 3 | 1 |
---|
df.query('b == ["a", "b", "c"]') # b列中有列表中值的
| a | b | c | d |
---|
0 | a | a | 0 | 2 |
---|
1 | a | a | 0 | 6 |
---|
2 | b | a | 3 | 3 |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
6 | d | b | 3 | 8 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
9 | e | c | 0 | 1 |
---|
10 | f | c | 0 | 1 |
---|
11 | f | c | 3 | 1 |
---|
df.query('c == [1, 2]')
df.query('[1, 2] in c')
df[df.c.isin([1, 2])]
| a | b | c | d |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
| a | b | c | d |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
| a | b | c | d |
---|
3 | b | a | 1 | 2 |
---|
4 | c | b | 1 | 2 |
---|
5 | c | b | 1 | 2 |
---|
7 | d | b | 1 | 0 |
---|
8 | e | c | 2 | 7 |
---|
# Boolean Operators 布尔操作符
# ~ 或 not
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
df['bools'] = np.random.rand(len(df)) > 0.5
df
| a | b | c |
---|
0 | 0.528671 | 0.265870 | 0.932892 |
---|
1 | 0.520069 | 0.047895 | 0.478818 |
---|
2 | 0.648595 | 0.180744 | 0.838445 |
---|
3 | 0.145614 | 0.906190 | 0.762163 |
---|
4 | 0.612637 | 0.027232 | 0.778020 |
---|
5 | 0.640565 | 0.535538 | 0.018280 |
---|
6 | 0.633664 | 0.124654 | 0.185709 |
---|
7 | 0.635088 | 0.377300 | 0.914968 |
---|
8 | 0.702684 | 0.504459 | 0.107014 |
---|
9 | 0.635757 | 0.261144 | 0.665611 |
---|
| a | b | c | bools |
---|
0 | 0.528671 | 0.265870 | 0.932892 | False |
---|
1 | 0.520069 | 0.047895 | 0.478818 | True |
---|
2 | 0.648595 | 0.180744 | 0.838445 | True |
---|
3 | 0.145614 | 0.906190 | 0.762163 | False |
---|
4 | 0.612637 | 0.027232 | 0.778020 | True |
---|
5 | 0.640565 | 0.535538 | 0.018280 | True |
---|
6 | 0.633664 | 0.124654 | 0.185709 | True |
---|
7 | 0.635088 | 0.377300 | 0.914968 | True |
---|
8 | 0.702684 | 0.504459 | 0.107014 | False |
---|
9 | 0.635757 | 0.261144 | 0.665611 | False |
---|
df.query('~bools')
df.query('not bools')
| a | b | c | bools |
---|
0 | 0.528671 | 0.265870 | 0.932892 | False |
---|
3 | 0.145614 | 0.906190 | 0.762163 | False |
---|
8 | 0.702684 | 0.504459 | 0.107014 | False |
---|
9 | 0.635757 | 0.261144 | 0.665611 | False |
---|
| a | b | c | bools |
---|
0 | 0.528671 | 0.265870 | 0.932892 | False |
---|
3 | 0.145614 | 0.906190 | 0.762163 | False |
---|
8 | 0.702684 | 0.504459 | 0.107014 | False |
---|
9 | 0.635757 | 0.261144 | 0.665611 | False |
---|
# 任意组合布尔表达式
shorter = df.query('a < b < c and (not bools) or bools > 2') # short query syntax
longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] # equivalent in pure Python
shorter
longer
# Performance of query()
# DataFrame.query() using numexpr is slightly faster than Python for large frames
# Note : You will only see the performance benefits of using the numexpr engine with DataFrame.query()
# if your frame has more than approximately 200,000 rows
# query() 效能在大数据时高于数字表达式
# Duplicate Data
# 识别和剔除重复数据,两个方法:duplicated and drop_duplicates,他们的参数都是列
# duplicated 返回布尔向量,标示重复的行
# drop_duplicates 删除重复列
# 默认保留第一个找到的值,但可以通过keep参数指定保留的值
# keep='first' (default): mark / drop duplicates except for the first occurrence. 只保留第一个
# keep='last': mark / drop duplicates except for the last occurrence. 只保留最后一个重复值
# keep=False: mark / drop all duplicates. 剔除所有重复值
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
'c': np.random.randn(7)})
df2
| a | b | c |
---|
0 | one | x | 2.206877 |
---|
1 | one | y | -1.852209 |
---|
2 | two | x | -0.706555 |
---|
3 | two | y | -1.007860 |
---|
4 | two | x | -0.185356 |
---|
5 | three | x | -0.687592 |
---|
6 | four | x | -2.052029 |
---|
df2.duplicated('a')
df2.duplicated("a", keep="last")
df2.duplicated('a', keep=False)
0 False
1 True
2 False
3 True
4 True
5 False
6 False
dtype: bool
0 True
1 False
2 True
3 True
4 False
5 False
6 False
dtype: bool
0 True
1 True
2 True
3 True
4 True
5 False
6 False
dtype: bool
df2.drop_duplicates('a')
df2.drop_duplicates('a', keep="last")
df2.drop_duplicates('a', keep=False)
| a | b | c |
---|
0 | one | x | 2.206877 |
---|
2 | two | x | -0.706555 |
---|
5 | three | x | -0.687592 |
---|
6 | four | x | -2.052029 |
---|
| a | b | c |
---|
1 | one | y | -1.852209 |
---|
4 | two | x | -0.185356 |
---|
5 | three | x | -0.687592 |
---|
6 | four | x | -2.052029 |
---|
| a | b | c |
---|
5 | three | x | -0.687592 |
---|
6 | four | x | -2.052029 |
---|
# 传入一个列表作为参数
df2.duplicated(['a', 'b']) # a b 两列看做一个整体,标示重复值
df2.drop_duplicates(['a', 'b'])
0 False
1 False
2 False
3 False
4 True
5 False
6 False
dtype: bool
| a | b | c |
---|
0 | one | x | 2.206877 |
---|
1 | one | y | -1.852209 |
---|
2 | two | x | -0.706555 |
---|
3 | two | y | -1.007860 |
---|
5 | three | x | -0.687592 |
---|
6 | four | x | -2.052029 |
---|
# index索引去重 index.duplicated
df3 = pd.DataFrame({'a': np.arange(6),'b': np.random.randn(6)},index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
| a | b |
---|
a | 0 | 1.446937 |
---|
a | 1 | -1.740284 |
---|
b | 2 | -0.604590 |
---|
c | 3 | 0.096239 |
---|
b | 4 | 0.823314 |
---|
a | 5 | 1.990803 |
---|
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]
array([False, True, False, False, True, True])
| a | b |
---|
a | 0 | 1.446937 |
---|
b | 2 | -0.604590 |
---|
c | 3 | 0.096239 |
---|
| a | b |
---|
c | 3 | 0.096239 |
---|
b | 4 | 0.823314 |
---|
a | 5 | 1.990803 |
---|
# Dictionary-like get() method get方法
# Each of Series, DataFrame, and Panel have a get method which can return a default value.
s = pd.Series([1,2,3], index=['a','b','c'])
s.get("a") # 相当于s["a"]
s.get("x", default=-1) # 可以对不存在的index赋值
1
-1
# The lookup() Method
# 按一定的顺序取得行/列的值
dflookup = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
dflookup
| A | B | C | D |
---|
0 | 0.942679 | 0.727316 | 0.658345 | 0.465770 |
---|
1 | 0.437278 | 0.225158 | 0.436522 | 0.164805 |
---|
2 | 0.270947 | 0.280223 | 0.309800 | 0.015967 |
---|
3 | 0.559836 | 0.630962 | 0.673678 | 0.712503 |
---|
4 | 0.535372 | 0.989887 | 0.661567 | 0.361962 |
---|
5 | 0.726322 | 0.601192 | 0.547858 | 0.477509 |
---|
6 | 0.829411 | 0.583613 | 0.871647 | 0.460966 |
---|
7 | 0.365722 | 0.563660 | 0.164954 | 0.134314 |
---|
8 | 0.817334 | 0.523003 | 0.684492 | 0.369386 |
---|
9 | 0.710906 | 0.693633 | 0.618877 | 0.888263 |
---|
10 | 0.026953 | 0.185217 | 0.746235 | 0.568846 |
---|
11 | 0.192765 | 0.927200 | 0.431736 | 0.084300 |
---|
12 | 0.988460 | 0.784320 | 0.180145 | 0.039405 |
---|
13 | 0.233349 | 0.148678 | 0.888210 | 0.004917 |
---|
14 | 0.105130 | 0.146724 | 0.261370 | 0.936558 |
---|
15 | 0.701224 | 0.791860 | 0.420083 | 0.391538 |
---|
16 | 0.290186 | 0.307993 | 0.139429 | 0.618879 |
---|
17 | 0.182132 | 0.174420 | 0.845501 | 0.647986 |
---|
18 | 0.732009 | 0.919950 | 0.197361 | 0.582814 |
---|
19 | 0.568096 | 0.539125 | 0.269016 | 0.537584 |
---|
list(range(0,10,2))
dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D'])
[0, 2, 4, 6, 8]
array([0.72731646, 0.30979986, 0.53537223, 0.58361289, 0.36938598])
# index objets 索引对象
# 索引可以通过list或其他序列对象直接创建
index = pd.Index(['e', 'd', 'a', 'b'])
index
# 可以命名
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
index
Index(['e', 'd', 'a', 'b'], dtype='object')
'something'
Index(['e', 'd', 'a', 'b'], dtype='object', name='something')
index = pd.Index(list(range(5)), name='rows')
index
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns
Int64Index([0, 1, 2, 3, 4], dtype='int64', name='rows')
Index(['A', 'B', 'C'], dtype='object', name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns) # 使用索引给列命名,列名是一个索引对象
df
cols | A | B | C |
---|
rows | | | |
---|
0 | 1.868165 | 0.704614 | -2.049472 |
---|
1 | -0.878810 | 0.706876 | -0.741121 |
---|
2 | -2.649523 | -0.952211 | 0.806387 |
---|
3 | 0.369280 | -0.052788 | -0.995775 |
---|
4 | 0.218246 | -0.034493 | -0.198815 |
---|
# Setting metadata 设置元数据
# 索引“多半”是“不可变”,但是可以设置和改变其元数据,比如索引的名称,或者,多重索引的级别和标签
# 可以使用 rename, set_names, set_levels, and set_labels 设置这些属性。
# 默认返回一个拷贝(不修改原值),也可以就地修改in place
ind = pd.Index([1, 2, 3])
ind
Int64Index([1, 2, 3], dtype='int64')
ind.name = "ind"
ind
ind.rename("apple")
ind
ind.name = "apple"
ind
ind.set_names(["bob"], inplace=True)
ind
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='bob')
# set_names, set_levels, and set_labels also take an optional level` argument
# df.query('ilevel_0 == "red"') 在query中使用 ilevel_0 ……
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
MultiIndex(levels=[[0, 1, 2], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
index.levels[1]
index.levels[0]
index.set_levels(["a", "b"], level=1)
Index(['one', 'two'], dtype='object', name='second')
Int64Index([0, 1, 2], dtype='int64', name='first')
MultiIndex(levels=[[0, 1, 2], ['a', 'b']],
labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
# Set operations on Index objects 索引对象中的集合操作
# Note: The resulting index from a set operation will be sorted in ascending order.
# 注意: 索引的集合操作返回的是升序排序后的结果
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a | b # 并集
a & b # 交集
# a - b # 不支持
a.difference(b) # 差集,非对称
a.symmetric_difference(b) # 对称差集,相当于idx1.difference(idx2).union(idx2.difference(idx1))
a ^ b # 对称差集
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
Index(['c'], dtype='object')
Index(['a', 'b'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
# Missing values 缺失值
# 注意:索引能够容纳缺失值,但是应该避免这样的情况。因为可能出现不可预料的结果,例如有些操作默认排除缺失值。
idx1 = pd.Index([1, np.nan, 3, 4])
idx1
Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')
idx1.fillna(2) # 缺失值赋值为2,nan的index赋值为2
Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) # pd的缺失值是NaT,与np不同
idx2
DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None)
idx2.fillna(pd.Timestamp('2011-01-02'))
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq=None)
# Set / Reset Index 设置/重设索引
# DataFrame
data = pd.DataFrame({"a":["bar", "bar", "foo", "foo"],
"b":["one","two","one","two"],
"c":["z","y","x", "w"],
"d":range(1,5)}) # range 不用list
data
| a | b | c | d |
---|
0 | bar | one | z | 1 |
---|
1 | bar | two | y | 2 |
---|
2 | foo | one | x | 3 |
---|
3 | foo | two | w | 4 |
---|
indexed1 = data.set_index("c") # 把df的某列设置为index索引
indexed1
| a | b | d |
---|
c | | | |
---|
z | bar | one | 1 |
---|
y | bar | two | 2 |
---|
x | foo | one | 3 |
---|
w | foo | two | 4 |
---|
indexed2 = data.set_index(['a', 'b']) # 把df的多列设置为多重索引
indexed2
| | c | d |
---|
a | b | | |
---|
bar | one | z | 1 |
---|
two | y | 2 |
---|
foo | one | x | 3 |
---|
two | w | 4 |
---|
frame = data.set_index('c', drop=False) # drop参数,可以设置索引时不删除列,默认为删除列
frame
frame = frame.set_index(['a', 'b'], append=True) # append参数,在原有索引的基础上,增加索引,变成复合索引
frame
| a | b | c | d |
---|
c | | | | |
---|
z | bar | one | z | 1 |
---|
y | bar | two | y | 2 |
---|
x | foo | one | x | 3 |
---|
w | foo | two | w | 4 |
---|
| | | c | d |
---|
c | a | b | | |
---|
z | bar | one | z | 1 |
---|
y | bar | two | y | 2 |
---|
x | foo | one | x | 3 |
---|
w | foo | two | w | 4 |
---|
data.set_index('c', drop=False) # 把某列设为索引,但不删除
data
data.set_index(['a', 'b'], inplace=True) # 就地修改,而不是返回拷贝
data
| a | b | c | d |
---|
c | | | | |
---|
z | bar | one | z | 1 |
---|
y | bar | two | y | 2 |
---|
x | foo | one | x | 3 |
---|
w | foo | two | w | 4 |
---|
| a | b | c | d |
---|
0 | bar | one | z | 1 |
---|
1 | bar | two | y | 2 |
---|
2 | foo | one | x | 3 |
---|
3 | foo | two | w | 4 |
---|
| | c | d |
---|
a | b | | |
---|
bar | one | z | 1 |
---|
two | y | 2 |
---|
foo | one | x | 3 |
---|
two | w | 4 |
---|
# Reset the index 重置索引
# Note: The reset_index method used to be called delevel which is now deprecated. 不推荐使用delevel
data
data.index.name # 无返回值?
data.index.names # 索引列的名称
data.reset_index() # 重置索引,原索引变为普通列,原索引名称变为列名
| | c | d |
---|
a | b | | |
---|
bar | one | z | 1 |
---|
two | y | 2 |
---|
foo | one | x | 3 |
---|
two | w | 4 |
---|
FrozenList(['a', 'b'])
| a | b | c | d |
---|
0 | bar | one | z | 1 |
---|
1 | bar | two | y | 2 |
---|
2 | foo | one | x | 3 |
---|
3 | foo | two | w | 4 |
---|
frame
frame.reset_index(level=1) # 对于多重索引,可以指定重置哪一级索引,而不是全部重置
| | | c | d |
---|
c | a | b | | |
---|
z | bar | one | z | 1 |
---|
y | bar | two | y | 2 |
---|
x | foo | one | x | 3 |
---|
w | foo | two | w | 4 |
---|
| | a | c | d |
---|
c | b | | | |
---|
z | one | bar | z | 1 |
---|
y | two | bar | y | 2 |
---|
x | one | foo | x | 3 |
---|
w | two | foo | w | 4 |
---|
# reset_index方法可以使用drop参数,若其为true,则仅是把索引剔除,而不转换为df数据列
frame
frame.reset_index(level=2, drop=True)
| | | c | d |
---|
c | a | b | | |
---|
z | bar | one | z | 1 |
---|
y | bar | two | y | 2 |
---|
x | foo | one | x | 3 |
---|
w | foo | two | w | 4 |
---|
| | c | d |
---|
c | a | | |
---|
z | bar | z | 1 |
---|
y | bar | y | 2 |
---|
x | foo | x | 3 |
---|
w | foo | w | 4 |
---|
# Adding an ad hoc index
# If you create an index yourself, you can just assign it to the index field
# 创建index对象,并赋值给df
index = pd.MultiIndex.from_product([range(2), ['one', 'two']], names=['first', 'second'])
index
data
data.index = index # 注意:index的长度要与df向适应
data
MultiIndex(levels=[[0, 1], ['one', 'two']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['first', 'second'])
| a | b | c | d |
---|
0 | bar | one | z | 1 |
---|
1 | bar | two | y | 2 |
---|
2 | foo | one | x | 3 |
---|
3 | foo | two | w | 4 |
---|
| | a | b | c | d |
---|
first | second | | | | |
---|
0 | one | bar | one | z | 1 |
---|
two | bar | two | y | 2 |
---|
1 | one | foo | one | x | 3 |
---|
two | foo | two | w | 4 |
---|
# Returning a view versus a copy 返回视图 对比 返回拷贝
# 避免链式操作,即多个连接的[],具体见前面的说明
# Why does assignment fail when using chained indexing? 为什么链式索引赋值失败
# python解释器,对以下赋值的区别
# 第一种:非链式赋值
dfmi.loc[:,('one','second')] = value
# becomes, python解释为:
dfmi.loc.__setitem__((slice(None), ('one', 'second')), value) # 直接set
# Of course, dfmi.loc.__getitem__(idx) may be a view or a copy of dfmi.
# 第二种:链式赋值
dfmi['one']['second'] = value # pd会抛出异常SettingWithCopy,链式操作具有不确定性!
# becomes, python解释为:
dfmi.__getitem__('one').__setitem__('second', value) # 先get,再set,前面的get返回的是一个copy
# SettingWithCopy有时候会在没有明显的链式操作的情况下出现,例如:
def do_something(df):
foo = df[['bar', 'baz']] # Is foo a view? A copy? Nobody knows!
# ... many lines here ...
foo['quux'] = value # We don't know whether this will modify df or not! # “隐式”的链式操作
return foo
# Evaluation order matters 赋值命令事情
# 链式赋值操作引起的SettingWithCopyWarning,可以通过设置option mode取消或抑制。
dfb = pd.DataFrame({'a' : ['one', 'one', 'two','three', 'two', 'one', 'six'],'c' : np.arange(7)})
dfb
| a | c |
---|
0 | one | 0 |
---|
1 | one | 1 |
---|
2 | two | 2 |
---|
3 | three | 3 |
---|
4 | two | 4 |
---|
5 | one | 5 |
---|
6 | six | 6 |
---|
# This will show the SettingWithCopyWarning
# but the frame values will be set
dfb['c'][dfb.a.str.startswith('o')] = 42
dfb
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
| a | c |
---|
0 | one | 42 |
---|
1 | one | 42 |
---|
2 | two | 2 |
---|
3 | three | 3 |
---|
4 | two | 4 |
---|
5 | one | 42 |
---|
6 | six | 6 |
---|
# This however is operating on a copy and will not work.
# 把option的提示级别由warn改为raise ????
pd.set_option('mode.chained_assignment','warn') # This however is operating on a copy and will not work. # ??
# A chained assignment can also crop up in setting in a mixed dtype frame. 出现在
# Note : These setting rules apply to all of .loc/.iloc
# This is the correct access method 正确的存取方法
dfc = pd.DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
dfc
dfc.loc[0,'A'] = 11
dfc
dfc = dfc.copy()
dfc
dfc['A'][0] = 111 # This can work at times, but is not guaranteed, and so should be avoided 有时可以工作,但不保证,应避免使用
dfc
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
pd.set_option('mode.chained_assignment', 'raise') # 链式赋值将不会执行
# http://pandas.pydata.org/pandas-docs/stable/indexing.html 全文完
# 2018-02-19