pandas入门《利用Python进行数据分析》读书笔记第5章

最新推荐文章于 2022-03-07 13:52:26 发布

yisuoyanyv

最新推荐文章于 2022-03-07 13:52:26 发布

阅读量915

点赞数 1

分类专栏： python-pandas 文章标签： python 读书笔记数据分析 pandas

本文链接：https://blog.csdn.net/yisuoyanyv/article/details/71155980

版权

python-pandas 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

pandas入门

源码点这里

#引入pandas
from pandas import Series,DataFrame
import pandas as pd

pandas的两个主要数据结构：Series和DataFrame

Series 是一中类似于一维数组的对象，它由一组数据以及一组与之相关的数据标签（即索引）组成。

obj=Series([4,7,-5,3])

obj

0 4 1 7 2 -5 3 3 dtype: int64

obj.values

obj.index

RangeIndex(start=0, stop=4, step=1)

#对各个数据点进行标记的索引
obj2=Series([4,7,-5,3],index=['d','b','a','c'])
obj2

d 4 b 7 a -5 c 3 dtype: int64

obj2['a']

-5

obj2['d']

obj2[['c','a','d']]

c 3 a -5 d 4 dtype: int64

#Numpy数运算（如根据布尔型数组进行过滤、标量乘法、应用数学函数等）都会保留索引和值之间的连接
obj2[obj2>0]

d 4 b 7 c 3 dtype: int64

obj2*2

d 8 b 14 a -10 c 6 dtype: int64

import  numpy as np
np.exp(obj2)

d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64

#可以将Series看成是一个定长的有序字典，因为它是索引值到数据值的一个映射
#可用通过字典来创建Series
sdata={'0hio':35000,'Texta':79300,'Ohjsodf':16000,'Jsdf':5000}
obj3=Series(sdata)
obj3

0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64

states=['California','0hio','Texta','Ohjsodf']
obj4=Series(sdata,index=states)
obj4

California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64

#NaN在pandas中表示缺失或NA值
pd.isnull(obj4)#检测缺失值

California True 0hio False Texta False Ohjsodf False dtype: bool

pd.notnull(obj4)

California False 0hio True Texta True Ohjsodf True dtype: bool

#Series也有类似的实例方法
obj4.isnull()

California True 0hio False Texta False Ohjsodf False dtype: bool **下面重点关注如何处理缺失数据**

#Series一个重要的功能：它在算数运算中会自动对齐不同索引的数据
obj3

0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64

obj4

California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64

obj3+obj4

0hio 70000.0 California NaN Jsdf NaN Ohjsodf 32000.0 Texta 158600.0 dtype: float64

#Series对象本身及其索引都有一个name属性，该属性跟pandas其他的关键功能关系非常密切
obj4.name='population'
obj4.index.name='state'
obj4

state California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 Name: population, dtype: float64

obj

0 4 1 7 2 -5 3 3 dtype: int64

obj.index=['Bob','Steve','Jeff','Ryan']#通过直接赋值方式就地修改Series的索引

obj

Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64

DataFrame

####DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数值、字符串、布尔值等） ####DataFrame既又行索引也有列索引，它可以被看做Series组成的字典（共同用一个索引）

#DataFrame的构造
data={'state':['0hi0','0hio','0hio','Nevada','Nevada'],
      'year':[2000,2001,2002,2001,2002],
      'pop':[1.5,1.7,3.6,2.4,2.9]
      }
frame=DataFrame(data)#DataFrame会自动加上索引，且全部序列会被有序排列
frame

	pop	state	year
0	1.5	0hi0	2000
1	1.7	0hio	2001
2	3.6	0hio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002

#如果指定了序列，则DataFrame的列就会按照指定顺序进行排列
DataFrame(data,columns=['year','state','pop'])

	year	state	pop
0	2000	0hi0	1.5
1	2001	0hio	1.7
2	2002	0hio	3.6
3	2001	Nevada	2.4
4	2002	Nevada	2.9

#如果传入的列在数据中找不到，就会产生NA值
frame2=DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2

	year	state	pop	debt
one	2000	0hi0	1.5	NaN
two	2001	0hio	1.7	NaN
three	2002	0hio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN

frame2.columns

Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)

#可以将DataFrame的列获取为一个Series：
frame2['state']

one 0hi0 two 0hio three 0hio four Nevada five Nevada Name: state, dtype: object

frame2.year

one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64

#行可以通过位置或名称的方式进行获取，比如用索引字段ix
frame2.ix['three']

year 2002 state 0hio pop 3.6 debt NaN Name: three, dtype: object

#列可以通过赋值的方式进行修改
frame2['debt']=16.
frame2

frame2['debt']=np.arange(5.)
frame2

	year	state	pop	debt
one	2000	0hi0	1.5	0.0
two	2001	0hio	1.7	1.0
three	2002	0hio	3.6	2.0
four	2001	Nevada	2.4	3.0
five	2002	Nevada	2.9	4.0

#将列表或数组赋值给某个列时，其长度必须跟DataFrame的长度相匹配。
# 如果赋值的是一个Series，就会精确匹配DataFreme的索引，所有的空位将会被填上缺失值
val=Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt']=val
frame2

	year	state	pop	debt
one	2000	0hi0	1.5	NaN
two	2001	0hio	1.7	-1.2
three	2002	0hio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7

#为不存在的列赋值会创建出一个新列。关键字del用于删除列
frame2['eastern']=frame2.state=='0hi0'
frame2

	year	state	pop	debt	eastern
one	2000	0hi0	1.5	NaN	True
two	2001	0hio	1.7	-1.2	False
three	2002	0hio	3.6	NaN	False
four	2001	Nevada	2.4	-1.5	False
five	2002	Nevada	2.9	-1.7	False

del frame2['eastern']
frame2.columns

Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)

###通过索引方式返回的列知识相应数据的视图而已，并不是副本。因此对返回的Series所作的任何修改
###全都会反映到源DataFrame上。通过Series的copy方法即可以显式地复制列
#另一种常见的数据形式是嵌套字典（也就是字典的字典）
pop={'Nevada':{2001:2.4,2002:2.9},'0hio':{2000:1.5,2001:1.7,2002:3.6}}
#外层字典的键作为列，内层字典的键作为行索引
frame3=DataFrame(pop)
frame3

	0hio	Nevada
2000	1.5	NaN
2001	1.7	2.4
2002	3.6	2.9

frame3.T

	2000	2001	2002
0hio	1.5	1.7	3.6
Nevada	NaN	2.4	2.9

DataFrame(pop,index=[2001,2002,2003])

	0hio	Nevada
2001	1.7	2.4
2002	3.6	2.9
2003	NaN	NaN

#如果设置了DataFrame的index和columns的name属性，则这些信息也会被显示出来
frame3.index.name='year'
frame3.columns.name='state'
frame3

state	0hio	Nevada
year
2000	1.5	NaN
2001	1.7	2.4
2002	3.6	2.9

#跟Series一样，values属性也会以二维ndarray的形式返回DataFrame中的数据
frame3.values

array([[ 1.5, nan], [ 1.7, 2.4], [ 3.6, 2.9]])

#如果DataFrame各列的数据类型不同，则值数组的数据类型就会选用能兼容所有列的数据类型
frame2.values

array([[2000, ‘0hi0’, 1.5, nan], [2001, ‘0hio’, 1.7, -1.2], [2002, ‘0hio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7]], dtype=object)

索引对象

obj=Series(range(3),index=['a','b','c'])
index=obj.index
index

Index([‘a’, ‘b’, ‘c’], dtype=’object’)

index[1:]

Index([‘b’, ‘c’], dtype=’object’)

#Index对象是不能修改的（immutable），因此用户不能对其进行修改
index[1]='d'

————————————————————————— TypeError Traceback (most recent call last) in () 1 #Index对象是不能修改的（immutable），因此用户不能对其进行修改 —-> 2 index[1]=’d’ C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\pandas\indexes\base.py in __setitem__(self, key, value) 1402 1403 def __setitem__(self, key, value): -> 1404 raise TypeError(“Index does not support mutable operations”) 1405 1406 def __getitem__(self, key): TypeError: Index does not support mutable operations

#不可修改行保证了Index对象在多个数据结构之间安全共享
index=pd.Index(np.arange(3))
obj2=Series([1.5,-2.5,0],index=index)

obj2.index is index

True

frame3

'0hio' in frame3.columns

True

2002in frame3.index

True ##重新索引

#pandas对象的一个重要方法是reindex,其作用是创建一个适应新索引的新对象。
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj

d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64

obj2=obj.reindex(['a','b','c','d','e'])#索引值不存在引入缺失值
obj2

a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64

obj.reindex(['a','b','c','d','e'],fill_value=0)

a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64

#对于时间序列这样的有序数据，重新索引时可能需要做一些插值处理。method选项可达到此目的
obj3=Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')

0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object

#reindex的（插值） method选项  ffill或pad 前向填充（或搬运）值
#bfill或backfill 后向填充（或搬运）值
#如果仅传入一个序列，则会重新索引行
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','c'],columns=['0hio','Texas','California'])
frame

	0hio	Texas	California
a	0	1	2
b	3	4	5
c	6	7	8

frame2=frame.reindex(['a','b','c','d'])
frame2

	0hio	Texas	California
a	0.0	1.0	2.0
b	3.0	4.0	5.0
c	6.0	7.0	8.0
d	NaN	NaN	NaN

states=['Texas','Utah','California']
frame.reindex(columns=states)

	Texas	Utah	California
a	1	NaN	2
b	4	NaN	5
c	7	NaN	8

#也可以同时对行和列进行重新索引，而插值只能按行应用（即轴0）
frame.reindex(index=['a','b','c','d'],method='ffill',columns=states)

	Texas	Utah	California
a	1	NaN	2
b	4	NaN	5
c	7	NaN	8
d	7	NaN	8

#利用ix的标签索引功能，重新索引任务可以变得更简洁
frame.ix[['a','b','c','d'],states]

	Texas	Utah	California
a	1.0	NaN	2.0
b	4.0	NaN	5.0
c	7.0	NaN	8.0
d	NaN	NaN	NaN

丢弃指定轴上的项

drop方法返回一个在指定轴上删除了指定值的新对象

obj=Series(np.arange(5.0),index=['a','b','c','d','e'])
new_obj=obj.drop('c')
new_obj

a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64

obj.drop(['d','c'])

a 0.0 b 1.0 e 4.0 dtype: float64

#对于DataFrame，可以删除任意轴上的索引值
data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data

	one	two	three	four
0hio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data.drop(['Colorado','0hio'])

data.drop('two',axis=1)

	one	three	four
0hio	0	2	3
Colorado	4	6	7
Utah	8	10	11
New York	12	14	15

data.drop(['two','four'],axis=1)

	one	three
0hio	0	2
Colorado	4	6
Utah	8	10
New York	12	14

索引、选取和过滤

obj=Series(np.arange(4.),index=['a','b','c','d'])
obj['b']

1.0

obj[1]

1.0

obj[2:4]

c 2.0 d 3.0 dtype: float64

obj[['b','a','d']]

b 1.0 a 0.0 d 3.0 dtype: float64

obj[[1,3]]

b 1.0 d 3.0 dtype: float64

obj[obj<2]

a 0.0 b 1.0 dtype: float64

#利用标签的切片运算与普通的Python切片运算不同，其末端是包含的
obj['b':'c']

b 1.0 c 2.0 dtype: float64

obj['b':'c']=5
obj

a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64

#对DataFrame进行索引其实就是获取一个或多个列
data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data

	one	two	three	four
0hio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data['two']

0hio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32

data[['two','three']]

	two	three
0hio	1	2
Colorado	5	6
Utah	9	10
New York	13	14

#这种索引有几个特殊的情况。首先通过切片或布尔型数组选取行
data[:2]

	one	two	three	four
0hio	0	1	2	3
Colorado	4	5	6	7

data[data['three']>5]

	one	two	three	four
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data<5

	one	two	three	four
0hio	True	True	True	True
Colorado	True	False	False	False
Utah	False	False	False	False
New York	False	False	False	False

data[data<5]=0
data

	one	two	three	four
0hio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

#引入专门的索引字段ix选取行和列的子集
data.ix['Colorado',['two','three']]

two 5 three 6 Name: Colorado, dtype: int32

data.ix[['Colorado','Utah'],[3,0,1]]

	four	one	two
Colorado	7	0	5
Utah	11	8	9

data.ix[data.three>5,:3]

	one	two	three
Colorado	0	5	6
Utah	8	9	10
New York	12	13	14

算术运算和数据对齐

#将对象相加时，如果存在不同的索引，则结果的索引就是该索引对的并集
s1=Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2=Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1

a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64

s2

a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64

s1+s2

a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64

#对于DataFrame,对齐操作会同时发生在行和列上：
df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['0hio','Texas','Colorado'])
df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
df1

	b	c	d
0hio	0.0	1.0	2.0
Texas	3.0	4.0	5.0
Colorado	6.0	7.0	8.0

df2

	b	d	e
Utah	0.0	1.0	2.0
0hio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

df1+df2 #没有重叠的部分会产生NA值

	b	c	d	e
0hio	3.0	NaN	6.0	NaN
Colorado	NaN	NaN	NaN	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9.0	NaN	12.0	NaN
Utah	NaN	NaN	NaN	NaN

df1=DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2=DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
df1

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

df2

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	6.0	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	19.0

df1+df2

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	11.0	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

df1.add(df2,fill_value=0)#空缺填充值  add sub div mul

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	11.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	19.0

#与此类似，在对Series或DataFrame重新索引时，也可以指定一个填充值
df1.reindex(columns=df2.columns,fill_value=0)

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

DataFrame和Series之间的运算

#计算一个二维数组与其某行之间的差：
arr=np.arange(12.).reshape((3,4))
arr

array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]])

arr[0]

array([ 0., 1., 2., 3.])

arr-arr[0]

array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])

#这就叫做广播  DataFrame和Series之间的运算差不多也是如此

frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
frame

	b	d	e
Utah	0.0	1.0	2.0
0hio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series=frame.ix[0]
series

b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64

frame-series

	b	d	e
Utah	0.0	0.0	0.0
0hio	3.0	3.0	3.0
Texas	6.0	6.0	6.0
Oregon	9.0	9.0	9.0

#如果某个索引值在DataFrame的列或Series的索引中找不到，则参与运算的两个对象就会被重新索引以形成并集
series2=Series(range(3),index=['b','e','f'])
frame+series2

	b	d	e	f
Utah	0.0	NaN	3.0	NaN
0hio	3.0	NaN	6.0	NaN
Texas	6.0	NaN	9.0	NaN
Oregon	9.0	NaN	12.0	NaN

#如果希望匹配行且在列上广播，则必须使用算术运算方法
series3=frame['d']
frame

	b	d	e
Utah	0.0	1.0	2.0
0hio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series3

Utah 1.0 0hio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64

frame.sub(series3,axis=0)#传入的轴号就是希望匹配的轴，这里目的是匹配DataFrame的行索引进行广播

	b	e
Utah	-1.0	1.0
0hio	-1.0	1.0
Texas	-1.0	1.0
Oregon	-1.0	1.0

函数应用和映射

#Numpy的ufuncs(元素级数组方法）也可以用于操作pandas对象
frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
frame

	b	d	e
Utah	-0.191031	-0.004688	-0.329970
0hio	0.708249	0.265398	-2.346897
Texas	1.064349	-1.811846	-0.899921
Oregon	0.334061	-1.058506	0.655632

np.abs(frame)

	b	d	e
Utah	0.191031	0.004688	0.329970
0hio	0.708249	0.265398	2.346897
Texas	1.064349	1.811846	0.899921
Oregon	0.334061	1.058506	0.655632

#将函数应用到由各列或行所形成的一维数组上。DataFrame的apply方法即可实现此功能：
f=lambda x:x.max()-x.min()
frame.apply(f)

b 1.255380 d 2.077245 e 3.002529 dtype: float64

frame.apply(f,axis=1)#按列运算

Utah 0.325281 0hio 3.055145 Texas 2.876195 Oregon 1.714138 dtype: float64

#传给apply的函数还可以返回由多个值组成的Series:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)

	b	d	e
min	-0.191031	-1.811846	-2.346897
max	1.064349	0.265398	0.655632

#frame中各个浮点值IDE格式化字符串，使用applymap(format)
format=lambda x:'%.2f' %x
frame.applymap(format)

	b	d	e
Utah	-0.19	-0.00	-0.33
0hio	0.71	0.27	-2.35
Texas	1.06	-1.81	-0.90
Oregon	0.33	-1.06	0.66

#之所以叫applymap,是因为Series有一个用于应用元素级函数的map方法
frame['e'].map(format)

Utah -0.33 0hio -2.35 Texas -0.90 Oregon 0.66 Name: e, dtype: object ###排序和排名

#sort_index方法返回一个已排序的新对象
obj=Series(range(4),index=['d','a','b','c'])
obj.sort_index()

a 1 b 2 c 3 d 0 dtype: int32

obj=Series(range(4),index=list('bacd'))
obj.sort_index()

a 1 b 0 c 2 d 3 dtype: int32

#对于DataFrame,则可以根据任意一个轴上的索引进行排序
frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

frame.sort_index(axis=1)

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

frame.sort_index(axis=1,ascending=False)#数据默认是按升序排列，

	d	c	b	a
three	0	3	2	1
one	4	7	6	5

#按值对Series进行排序，可以使用其sort_value方法
obj=Series([4,7,-3,2])
obj.sort_values()

2 -3 3 2 0 4 1 7 dtype: int64

#在排序时，任何缺失值默认都会被放到Series的末尾
obj=Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()

4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64

#DataFrame，可以根据一个或多个列的值进行排序
frame=DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame

	a	b
0	0	4
1	1	7
2	0	-3
3	1	2

frame.sort_values(by='b')#sort_index方法过时

	a	b
2	0	-3
3	1	2
0	0	4
1	1	7

#根据多个列进行排序，传入名称的列表
frame.sort_values(by=['a','b'])

	a	b
2	0	-3
0	0	4
3	1	2
1	1	7

#排名（ranking) 增设一个排名值 默认为各组分配一个平均排名
obj=Series([7,-5,7,4,2,0,4])
obj.rank()

0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64

#根据值在原数据中出现的顺序给出排名
obj.rank(method='first')

0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64

#按降序进行排名
obj.rank(ascending=False,method='max')

0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64

#DataFrame可以在行或列上计算排名
frame=DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame

	a	b	c
0	0	4.3	-2.0
1	1	7.0	5.0
2	0	-3.0	8.0
3	1	2.0	-2.5

frame.rank(axis=1)

	a	b	c
0	2.0	3.0	1.0
1	1.0	3.0	2.0
2	2.0	1.0	3.0
3	2.0	3.0	1.0

带有重复值的轴索引

obj=Series(range(5),index=['a','a','b','b','c'])
obj

a 0 a 1 b 2 b 3 c 4 dtype: int32

#索引的is_unique值会告诉你它的值是否是唯一的
obj.index.is_unique

False

obj['a']

a 0 a 1 dtype: int32

obj['c']

df=DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df

	0	1	2
a	-0.524361	-0.145395	-1.322196
a	-0.666326	-0.496612	1.486401
b	-0.395841	-0.921194	0.260437
b	-0.187285	-0.456014	1.434571

df.ix['b']

	0	1	2
b	-0.395841	-0.921194	0.260437
b	-0.187285	-0.456014	1.434571

汇总和计算描述统计

df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

#调用DataFrame的sum方法将会返回一个含有列小计的Series
df.sum()

one 9.25 two -5.80 dtype: float64

#传入axis=1将会按行进行求和运算
df.sum(axis=1)

a 1.40 b 2.60 c NaN d -0.55 dtype: float64

#NA值会被自动排除，通过skipna选项可以禁用该功能
df.mean(axis=1,skipna=False)

a NaN b 1.300 c NaN d -0.275 dtype: float64

#有些方法（如idxmin和idxmax返回的是间接统计（比如达到最小值或最大值的索引）
df.idxmax()

one b two d dtype: object

#另一些方法则是累计型的
df.cumsum()

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

#还有方法既不是约简型也不是累计型，describe就是一个例子，它用于一次产生多个汇总统计
df.describe()

	one	two
count	3.000000	2.000000
mean	3.083333	-2.900000
std	3.493685	2.262742
min	0.750000	-4.500000
25%	1.075000	-3.700000
50%	1.400000	-2.900000
75%	4.250000	-2.100000
max	7.100000	-1.300000

#对于非数值型数据，describe会产生另外一种汇总统计
obj=Series(['a','a','b','c']*4)
obj.describe()

count 16 unique 3 top a freq 8 dtype: object ##相关系数与协方差

#数据来自Yahoo！Finance的股票价格和成交量
# import pandas.io.data as web #版本更新改动了
import pandas_datareader.data as web
all_data={}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.items()})    
volume=DataFrame({tic:data['Volume'] for tic,data in all_data.items()})

#计算价格的百分数编号
returns=price.pct_change()
returns.tail()

	AAPL	GOOG	IBM	MSFT
Date
2009-12-24	0.034339	0.011117	0.004385	0.002587
2009-12-28	0.012294	0.007098	0.013326	0.005484
2009-12-29	-0.011861	-0.005571	-0.003477	0.007058
2009-12-30	0.012147	0.005376	0.005461	-0.013699
2009-12-31	-0.004300	-0.004416	-0.012597	-0.015504

#Series的corr方法用于计算两个Series中重叠的，非NA的，按索引对其的相关系数。与此类似，cov用于计算协方差
returns.MSFT.corr(returns.IBM)

0.49597963862836764

#DataFrame的corr和cov方法将以DataFrame的形式返回完整的相关系数或协方差矩阵
returns.corr()

	AAPL	GOOG	IBM	MSFT
AAPL	1.000000	0.470676	0.410011	0.424305
GOOG	0.470676	1.000000	0.390689	0.443587
IBM	0.410011	0.390689	1.000000	0.495980
MSFT	0.424305	0.443587	0.495980	1.000000

returns.cov()

	AAPL	GOOG	IBM	MSFT
AAPL	0.001027	0.000303	0.000252	0.000309
GOOG	0.000303	0.000580	0.000142	0.000205
IBM	0.000252	0.000142	0.000367	0.000216
MSFT	0.000309	0.000205	0.000216	0.000516

#DataFrame的corrwith方法，可以计算其列或行跟另一个Series或DataFrame之间的相关系数。传入一个Series将会返回一个相关系数值Series
#（针对各列进行计算）
returns.corrwith(returns.IBM)

AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64

#传入一个DataFrame则会计算按列名配对的相关系数。这里，计算百分比变化与成交量的相关系数
returns.corrwith(volume)

————————————————————————— NameError Traceback (most recent call last) in () 1 #传入一个DataFrame则会计算按列名配对的相关系数。这里，计算百分比变化与成交量的相关系数 —-> 2 returns.corrwith(volume) NameError: name ‘returns’ is not defined

#传入axis=1可按行进行计算，无论如何，在计算相关系数之前，所有的数据项都会按标签对齐

唯一值、值计数以及成员资格

obj=Series(['c','a','d','a','a','b','b','c','c'])

uniques=obj.unique()
uniques

array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)

obj.value_counts()

c 3 a 3 b 2 d 1 dtype: int64

#可以用于任何数组或序列
pd.value_counts(obj.values,sort=False)

d 1 b 2 a 3 c 3 dtype: int64

#isin用于判断矢量化集合的成员资格，选取子集
mask=obj.isin(['b','c'])
mask

0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool

obj[mask]

0 c 5 b 6 b 7 c 8 c dtype: object

#得到DataFrame中多个相关列的一张柱状图
data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
data

	Qu1	Qu2	Qu3
0	1	2	1
1	3	3	5
2	4	1	2
3	3	2	4
4	4	3	4

result=data.apply(pd.value_counts).fillna(0)
result

	Qu1	Qu2	Qu3
1	1.0	1.0	1.0
2	0.0	2.0	1.0
3	2.0	2.0	0.0
4	2.0	0.0	2.0
5	0.0	0.0	1.0

处理缺失数据

string_data=Series(['aaedvark','artichoke',np.nan,'avocado'])
string_data

0 aaedvark 1 artichoke 2 NaN 3 avocado dtype: object

string_data.isnull()

0 False 1 False 2 True 3 False dtype: bool

string_data[0]=None
string_data.isnull()

0 True 1 False 2 True 3 False dtype: bool ###滤除缺失数据

#对于一个Series,dropna返回一个仅含非空数据和索引值的Series
from numpy import nan as NA
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data=Series([1,NA,3.5,NA,7])
data.dropna()

0 1.0 2 3.5 4 7.0 dtype: float64

data[data.notnull()]

0 1.0 2 3.5 4 7.0 dtype: float64

#对于DataFrame对象，dropna默认丢弃任何含有缺失值的行
data=DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
clearned=data.dropna()
data

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

clearned

	0	1	2
0	1.0	6.5	3.0

data.dropna(how='all')#丢弃全为NA的行

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
3	NaN	6.5	3.0

data[4]=NA
data

	0	1	2	4
0	1.0	6.5	3.0	NaN
1	1.0	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN
3	NaN	6.5	3.0	NaN

data.dropna(axis=1,how='all')

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

df=DataFrame(np.random.randn(7,3))
df.ix[:4,1]=NA
df.ix[:2,2]=NA
df

	0	1	2
0	-1.637463	NaN	NaN
1	-1.259674	NaN	NaN
2	-0.284635	NaN	NaN
3	0.818905	NaN	-1.878244
4	-2.402401	NaN	-0.533942
5	-0.623351	-1.472599	-0.860614
6	-0.194565	-1.757851	-1.251312

df.dropna(thresh=3)

	0	1	2
5	-0.623351	-1.472599	-0.860614
6	-0.194565	-1.757851	-1.251312

填充缺失数据

df.fillna(0)

	0	1	2
0	-1.637463	0.000000	0.000000
1	-1.259674	0.000000	0.000000
2	-0.284635	0.000000	0.000000
3	0.818905	0.000000	-1.878244
4	-2.402401	0.000000	-0.533942
5	-0.623351	-1.472599	-0.860614
6	-0.194565	-1.757851	-1.251312

df.fillna({1:0.5,2:-1})

	0	1	2
0	-1.637463	0.500000	-1.000000
1	-1.259674	0.500000	-1.000000
2	-0.284635	0.500000	-1.000000
3	0.818905	0.500000	-1.878244
4	-2.402401	0.500000	-0.533942
5	-0.623351	-1.472599	-0.860614
6	-0.194565	-1.757851	-1.251312

#fillna默认会返回新对象，但也可以对现有对象进行就地修改
_=df.fillna(0,inplace=True)
df

	0	1	2
0	-1.637463	0.000000	0.000000
1	-1.259674	0.000000	0.000000
2	-0.284635	0.000000	0.000000
3	0.818905	0.000000	-1.878244
4	-2.402401	0.000000	-0.533942
5	-0.623351	-1.472599	-0.860614
6	-0.194565	-1.757851	-1.251312

df=DataFrame(np.random.randn(6,3))
df.ix[2:,1]=NA
df.ix[4:,2]=NA
df

	0	1	2
0	0.173799	0.267422	0.480141
1	1.303258	-0.429756	-0.790661
2	-0.110613	NaN	0.878062
3	1.188953	NaN	-0.125561
4	-0.512800	NaN	NaN
5	-0.383978	NaN	NaN

df.fillna(method='ffill')

	0	1	2
0	0.173799	0.267422	0.480141
1	1.303258	-0.429756	-0.790661
2	-0.110613	-0.429756	0.878062
3	1.188953	-0.429756	-0.125561
4	-0.512800	-0.429756	-0.125561
5	-0.383978	-0.429756	-0.125561

df.fillna(method='ffill',limit=2)

	0	1	2
0	0.173799	0.267422	0.480141
1	1.303258	-0.429756	-0.790661
2	-0.110613	-0.429756	0.878062
3	1.188953	-0.429756	-0.125561
4	-0.512800	NaN	-0.125561
5	-0.383978	NaN	-0.125561

data=Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())

0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64

层次化索引

#能在一个轴上拥有多个（两个以上）索引级别。能以低维度形式处理高纬度数据
data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data

a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64

data.index

MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

data['b']

1 0.501535 2 1.547647 3 -0.038850 dtype: float64

data['b':'c']

b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64

data.ix[['b','c']]

b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64

data[:,2]

a 0.276982 b 1.547647 c -0.905470 d -1.697117 dtype: float64

data.unstack()

	1	2	3
a	-2.059265	0.276982	-1.771092
b	0.501535	1.547647	-0.038850
c	1.963156	-0.905470	NaN
d	NaN	-1.697117	-0.659792

data.unstack().stack()

a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64

#对于一个DataFrame，每条轴都可以有分层索引：
frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],
                columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame

		Ohio		Colorado
		Green	Red	Green
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame['Ohio']

	color	Green	Red
key1	key2
a	1	0	1
a	2	3	4
b	1	6	7
b	2	9	10

重排分级顺序

frame.swaplevel('key1','key2')

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

frame.sortlevel(1)

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
b	1	6	7	8
a	2	3	4	5
b	2	9	10	11

frame.swaplevel(0,1).sortlevel(0)

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
1	b	6	7	8
2	a	3	4	5
2	b	9	10	11

根据级别汇总统计

frame.sum(level='key2')

state	Ohio		Colorado
color	Green	Red	Green
key2
1	6	8	10
2	12	14	16

frame.sum(level='color',axis=1)

	color	Green	Red
key1	key2
a	1	2	1
a	2	8	4
b	1	14	7
b	2	20	10

使用DataFrame的列

#DataFrame中的列与行索引之间的变化
frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],
                 'd':[0,1,2,0,1,2,3]})
frame

	a	b	c	d
0	0	7	one	0
1	1	6	one	1
2	2	5	one	2
3	3	4	two	0
4	4	3	two	1
5	5	2	two	2
6	6	1	two	3

frame2=frame.set_index(['c','d'])
frame2

		a	b
c	d
one	0	0	7
	1	1	6
	2	2	5
two	0	3	4
	1	4	3
	2	5	2
	3	6	1

frame.set_index(['c','d'],drop=False)

		a	b	c	d
c	d
one	0	0	7	one	0
	1	1	6	one	1
	2	2	5	one	2
two	0	3	4	two	0
	1	4	3	two	1
	2	5	2	two	2
	3	6	1	two	3

frame2.reset_index()

	c	d	a	b
0	one	0	0	7
1	one	1	1	6
2	one	2	2	5
3	two	0	3	4
4	two	1	4	3
5	two	2	5	2
6	two	3	6	1

其他有关pandas的话题

整数索引

ser=Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

ser[1]

1.0

ser#这里会报错

ser2=Series(np.arange(3.),index=['a','b','c'])#非整数索引没有歧义
ser2[-1]

2.0

ser.ix[:1]

0    0.0
1    1.0
dtype: float64

ser3=Series(range(3),index=[-5,1,3])
ser3.iloc[2]#基于位置的索引 iget_value方法过时

frame=DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
frame.iloc[0]#irow和icol方法过时

0    0
1    1
Name: 2, dtype: int32

面板数据

#pandas中有一个Panel数据结构，可以用一个DataFrame对象组成的字典或一个三维ndarray来创建Panel对象
from pandas_datareader import data as web
pdata=pd.Panel(dict((stk,web.get_data_yahoo(stk,'1/1/2009','6/1/2012') ) for stk in ['AAPL','GOOG','MSFT','DELL']))
pdata#Panel中的每一项（类似于DataFrame中的列）都是一个DataFrame