DataFrame官网参考API资料
DataFrame
DataFrame 是一个表格型的数据结构,它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)。DataFrame 既有行索引也有列索引,它可以被看作由 Series 组成的字典(共用一个索引)。
创建 DataFrame
from pandas import DataFrame
data = { 'state' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] ,
'year' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] ,
'pop' : [ 1.5 , 1.7 , 3.6 , 2.4 , 2.9 ] }
frame = DataFrame( data)
frame
   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002
DataFrame的列按照指定顺序进行排序
DataFrame( data, columns= [ 'year' , 'state' , 'pop' ] )
year state pop 0 2000 Ohio 1.5 1 2001 Ohio 1.7 2 2002 Ohio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9
索引重命名
DataFrame( data, columns= [ 'year' , 'state' , 'pop' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
year state pop one 2000 Ohio 1.5 two 2001 Ohio 1.7 three 2002 Ohio 3.6 four 2001 Nevada 2.4 five 2002 Nevada 2.9
创建空列
传入的列在数据中找不到,会产生NaN值
DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN
列之间进行对比,创建 布尔值列
frame = DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
frame[ 'eastern' ] = frame. state == 'Ohio'
frame
year state pop debt eastern one 2000 Ohio 1.5 NaN True two 2001 Ohio 1.7 NaN True three 2002 Ohio 3.6 NaN True four 2001 Nevada 2.4 NaN False five 2002 Nevada 2.9 NaN False
通过字典嵌套(字典的字典) 进行创建
外层字典的键作为列,内层键作为行
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
frame
Nevada Ohio 2000 NaN 1.5 2001 2.4 1.7 2002 2.9 3.6
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop, index= [ 2001 , 2002 , 2003 ] )
frame
Nevada Ohio 2001 2.4 1.7 2002 2.9 3.6 2003 NaN NaN
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
pdata = { 'Ohio' : frame[ 'Ohio' ] [ : - 1 ] ,
'Nevada' : frame[ 'Nevada' ] [ : 2 ] }
DataFrame( pdata)
Nevada Ohio 2000 NaN 1.5 2001 2.4 1.7
给索引 赋名
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
frame. index. name = 'year'
frame
      Nevada  Ohio
year
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
给列 赋名
frame. columns. name = 'state'
frame
state  Nevada  Ohio
year
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
转置
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
frame. T
2000 2001 2002 Nevada NaN 2.4 2.9 Ohio 1.5 1.7 3.6
.values 属性以二维ndarray形式返回DataFrame中数据
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
frame. values
array([[nan, 1.5],
[2.4, 1.7],
[2.9, 3.6]])
删除列值 del
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
del frame[ 'Ohio' ]
frame
Nevada 2000 NaN 2001 2.4 2002 2.9
索取
获取列值
frame = DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
frame[ 'state' ]
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
Name: state, dtype: object
frame. year
one 2000
two 2001
three 2002
four 2001
five 2002
Name: year, dtype: int64
获取所有列名 .columns
frame. columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
获取所有索引名 .index
frame. index
Index(['one', 'two', 'three', 'four', 'five'], dtype='object')
获取行值(注意:.ix 已被弃用,按标签取行请改用 .loc,按位置取行请改用 .iloc)
frame = DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
frame. ix[ 'three' ]
/Users/wuyihong/anaconda2/envs/python35/lib/python3.5/site-packages/ipykernel/__main__.py:2: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
from ipykernel import kernelapp as app
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
赋值
frame = DataFrame( data, columns= [ 'year' , 'state' , 'pop' , 'debt' ] , index= [ 'one' , 'two' , 'three' , 'four' , 'five' ] )
frame[ 'debt' ] = 16.5
frame
year state pop debt one 2000 Ohio 1.5 16.5 two 2001 Ohio 1.7 16.5 three 2002 Ohio 3.6 16.5 four 2001 Nevada 2.4 16.5 five 2002 Nevada 2.9 16.5
import numpy as np
frame[ 'debt' ] = np. arange( 5 )
frame
year state pop debt one 2000 Ohio 1.5 0 two 2001 Ohio 1.7 1 three 2002 Ohio 3.6 2 four 2001 Nevada 2.4 3 five 2002 Nevada 2.9 4
将 Series 赋值给 DataFrame
如果赋值的是一个 Series,则会精确匹配 DataFrame 的索引,所有的空位都将被填上缺失值(NaN)
from pandas import Series
val = Series( [ - 1.2 , - 1.5 , - 1.7 ] , index= [ 'two' , 'four' , 'five' ] )
frame[ 'debt' ] = val
frame
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 -1.2 three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7
索引对象
Index 对象是不可修改的,这样才能使Index对象在多个数据结构之间安全共享
from pandas import Series
obj = Series( range ( 3 ) , index= [ 'a' , 'b' , 'c' ] )
obj
a 0
b 1
c 2
dtype: int64
index = obj. index
index
Index(['a', 'b', 'c'], dtype='object')
index[ 1 : ]
Index(['b', 'c'], dtype='object')
pd.Index()
最泛化的Index对象,将轴标签表示为一个由python对象组成的NumPy数组
import numpy as np
import pandas as pd
pd. Index( np. arange( 3 ) )
Int64Index([0, 1, 2], dtype='int64')
index = pd. Index( np. arange( 3 ) )
obj = Series( [ 1.5 , - 2.5 , 0 ] , index= index)
obj
0 1.5
1 -2.5
2 0.0
dtype: float64
obj. index is index
True
用 in 判断某个标签是否包含在索引(或列)中,返回布尔值
from pandas import DataFrame, Series
pop = { 'Nevada' : { 2001 : 2.4 , 2002 : 2.9 } ,
'Ohio' : { 2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 } }
frame = DataFrame( pop)
frame. index. name = 'year'
frame. columns. name = 'state'
frame
state  Nevada  Ohio
year
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
'Ohio' in frame. columns
True
2003 in frame. index
False
基本功能
.reindex
其作用是创建一个适应新索引的新对象。
参数:
index:用作索引的新序列。既可以是 Index 实例,也可以是其他序列型的 Python 数据结构。Index 会被完全使用,就像没有任何复制一样
method:插值(填充)方法
fill_value:在重新索引的过程中,需要引入缺失值时使用的替代值
limit:前向或后向填充时的最大填充量
level:在 MultiIndex 的指定级别上匹配简单索引,否则选取其子集
copy:默认为 True,无论如何都复制;如果为 False,则新旧相等就不复制
obj = Series( [ 4.5 , 7.2 , - 5.3 , 3.6 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
obj = obj. reindex( [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
obj
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
fill_value 参数
obj = Series( [ 4.5 , 7.2 , - 5.3 , 3.6 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
obj. reindex( [ 'a' , 'b' , 'c' , 'd' , 'e' ] , fill_value= 0 )
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
method方法
ffill或pad 前向填充(或搬运)值
bfill或backfill 后向填充(或搬运)值
obj = Series( [ 'blue' , 'purple' , 'yellow' ] , index= [ 0 , 2 , 4 ] )
obj
0 blue
2 purple
4 yellow
dtype: object
obj. reindex( range ( 6 ) , method= 'ffill' )
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = DataFrame( np. arange( 9 ) . reshape( ( 3 , 3 ) ) ,
columns= [ 'Ohio' , 'Texas' , 'California' ] , index= [ 'a' , 'c' , 'd' ] )
frame
Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8
frame. reindex( [ 'a' , 'b' , 'c' , 'd' ] )
Ohio Texas California a 0.0 1.0 2.0 b NaN NaN NaN c 3.0 4.0 5.0 d 6.0 7.0 8.0
states = [ 'Texas' , 'Utah' , 'California' ]
frame. reindex( columns= states)
Texas Utah California a 1 NaN 2 c 4 NaN 5 d 7 NaN 8
frame = DataFrame( np. arange( 9 ) . reshape( ( 3 , 3 ) ) ,
columns= [ 'Ohio' , 'Texas' , 'California' ] , index= [ 'a' , 'c' , 'd' ] )
states = [ 'Texas' , 'Utah' , 'California' ]
frame = frame. reindex( columns= states)
frame. reindex( index= [ 'a' , 'b' , 'c' , 'd' ] , method= 'ffill' , columns= states)
Texas Utah California a 1 NaN 2 b 1 NaN 2 c 4 NaN 5 d 7 NaN 8
.ix(已弃用;下面改用 reindex 同时对行和列重新索引)
frame = DataFrame( np. arange( 9 ) . reshape( ( 3 , 3 ) ) ,
columns= [ 'Ohio' , 'Texas' , 'California' ] , index= [ 'a' , 'c' , 'd' ] )
states = [ 'Texas' , 'Utah' , 'California' ]
frame = frame. reindex( columns= states)
frame. reindex( index= [ 'a' , 'b' , 'c' , 'd' ] , columns= states)
Texas Utah California a 1.0 NaN 2.0 b NaN NaN NaN c 4.0 NaN 5.0 d 7.0 NaN 8.0
丢弃指定轴上的项
drop 方法返回的是一个在指定轴上删除了指定值的新对象
obj = Series( np. arange( 5 ) , index= [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
obj
a 0
b 1
c 2
d 3
e 4
dtype: int64
obj. drop( 'c' )
a 0
b 1
d 3
e 4
dtype: int64
obj. drop( [ 'd' , 'c' ] )
a 0
b 1
e 4
dtype: int64
data = DataFrame( np. arange( 16 ) . reshape( ( 4 , 4 ) ) ,
index= [ 'Ohio' , 'Colorado' , 'Utah' , 'New York' ] , columns= [ 'one' , 'two' , 'three' , 'four' ] )
data
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
删除行
data. drop( [ 'Colorado' , 'Ohio' ] )
one two three four Utah 8 9 10 11 New York 12 13 14 15
删除列
data. drop( 'two' , axis= 1 )
one three four Ohio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15
data. drop( [ 'two' , 'four' ] , axis= 1 )
one three Ohio 0 2 Colorado 4 6 Utah 8 10 New York 12 14