Pandas入门
pandas的数据结构介绍
Series
# 创建Series
import pandas as pd
obj = pd.Series([4,7,-5,3])
obj
Out[3]:
0 4
1 7
2 -5
3 3
dtype: int64
# 获取values和index
obj.values
Out[5]:
array([ 4, 7, -5, 3])
obj.index
Out[6]:
RangeIndex(start=0, stop=4, step=1)
# 指定索引
obj2 = pd.Series([4,7,-5,3],index = ['a','b','c','d'])
obj2
Out[8]:
a 4
b 7
c -5
d 3
dtype: int64
# 通过索引选取值
obj2['b']
Out[9]:
7
obj2[['c','a','b']]
Out[10]:
c -5
a 4
b 7
dtype: int64
obj2[obj2 >0]
Out[11]:
a 4
b 7
d 3
dtype: int64
obj2*2
Out[12]:
a 8
b 14
c -10
d 6
dtype: int64
'b' in obj2
Out[14]:
True
# 通过字典创建Series
sdata = {'ohio':3500,'texas':71000,'oregon':16000,'utah':5000}
obj3 = pd.Series(sdata)
obj3
Out[15]:
ohio 3500
texas 71000
oregon 16000
utah 5000
dtype: int64
# 传入排好序的字典的键以改变顺序
states = ['californis','ohio','oregon','texas']
obj4 = pd.Series(sdata,index = states)
obj4
Out[16]:
californis NaN
ohio 3500.0
oregon 16000.0
texas 71000.0
dtype: float64
# 检测缺失值
pd.isnull(obj4)
Out[17]:
californis True
ohio False
oregon False
texas False
dtype: bool
obj4.isnull()
Out[18]:
californis True
ohio False
oregon False
texas False
dtype: bool
# Series 的name属性
obj4.name = 'population'
obj4.index.name = 'stats'
obj4
Out[19]:
stats
californis NaN
ohio 3500.0
oregon 16000.0
texas 71000.0
Name: population, dtype: float64
# 直接修改索引
obj4.index = [1,2,3,4]
obj4
Out[20]:
1 NaN
2 3500.0
3 16000.0
4 71000.0
Name: population, dtype: float64
DataFrame
# 创建DataFrame
data = {'state':['ohio','ohio','ohio','nevada','nevada','nevada'],
'year':[2000,2001,2002,2001,2002,2003],
'pop':[1.5,1.7,3.6,2.4,2.9,3.2]}
frame = pd.DataFrame(data)
frame
Out[21]:
state year pop
0 ohio 2000 1.5
1 ohio 2001 1.7
2 ohio 2002 3.6
3 nevada 2001 2.4
4 nevada 2002 2.9
5 nevada 2003 3.2
# 选取前五行
frame.head()
Out[22]:
state year pop
0 ohio 2000 1.5
1 ohio 2001 1.7
2 ohio 2002 3.6
3 nevada 2001 2.4
4 nevada 2002 2.9
# 通过columns参数按指定顺序排列各列
pd.DataFrame(data,columns = ['year','state','pop'])
Out[23]:
year state pop
0 2000 ohio 1.5
1 2001 ohio 1.7
2 2002 ohio 3.6
3 2001 nevada 2.4
4 2002 nevada 2.9
5 2003 nevada 3.2
# 将DataFrame的列获取为一个Series
frame['state']
Out[24]:
0 ohio
1 ohio
2 ohio
3 nevada
4 nevada
5 nevada
Name: state, dtype: object
frame.year
Out[25]:
0 2000
1 2001
2 2002
3 2001
4 2002
5 2003
Name: year, dtype: int64
frame.state
Out[26]:
0 ohio
1 ohio
2 ohio
3 nevada
4 nevada
5 nevada
Name: state, dtype: object
# 注:frame2 的创建代码在此处缺失;从下方输出推测,它是由前面的 data 构造并指定了自定义行索引的 DataFrame(待确认)
frame2
>>>
state year pop
0 ohio 2000 1.5
一 ohio 2001 1.7
二 ohio 2002 3.6
三 nevada 2001 2.4
四 nevada 2002 2.9
五 nevada 2003 3.2
# 获取行
frame2.loc['三']
Out[40]:
state nevada
year 2001
pop 2.4
Name: 三, dtype: object
# 直接赋值修改列
import numpy as np
frame2['pop'] = np.arange(6)
frame2
Out[46]:
state year pop
0 ohio 2000 0
一 ohio 2001 1
二 ohio 2002 2
三 nevada 2001 3
四 nevada 2002 4
五 nevada 2003 5
# 删除列
del frame2['pop']
frame2
Out[47]:
state year
0 ohio 2000
一 ohio 2001
二 ohio 2002
三 nevada 2001
四 nevada 2002
五 nevada 2003
基本功能
重新索引
obj = pd.Series([4.5,7.2,-5.3,3.6],index = ['d','b','a','c'])
obj
>>>
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
# reindex是创建一个新对象,会根据新索引进行重排,当索引值不存在就引入缺失值
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
Out[49]:
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj3 = pd.Series(['blue','purple','yellow'],index = [0,2,4])
obj3
Out[50]:
0 blue
2 purple
4 yellow
dtype: object
# method插值处理
obj3.reindex(range(6),method = 'ffill') # ffill:前向填充,用前一个有效值填充缺失位置
Out[55]:
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
# 列重新索引
frame
Out[59]:
state year pop
0 ohio 2000 1.5
1 ohio 2001 1.7
2 ohio 2002 3.6
3 nevada 2001 2.4
4 nevada 2002 2.9
5 nevada 2003 3.2
frame.reindex(columns = ['state_a','year','pop'])
Out[61]:
state_a year pop
0 NaN 2000 1.5
1 NaN 2001 1.7
2 NaN 2002 3.6
3 NaN 2001 2.4
4 NaN 2002 2.9
5 NaN 2003 3.2
reindex函数的参数
丢弃指定轴上的项
drop方法返回的是一个在指定轴上删除了指定值的新对象
obj = pd.Series(np.arange(5.),index = ['a','b','c','d','e'])
obj
>>>
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
new_obj = obj.drop('c')
new_obj
Out[65]:
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
obj.drop(['d','c'])
Out[67]:
a 0.0
b 1.0
e 4.0
dtype: float64
# 对于DataFrame可以删除任意轴上的索引值
data = pd.DataFrame(np.arange(16).reshape(4,4),
index = ['ohio','colorado','utah','new york'],
columns = ['one','two','three','four'])
data
Out[71]:
one two three four
ohio 0 1 2 3
colorado 4 5 6 7
utah 8 9 10 11
new york 12 13 14 15
# 用标签序列调用drop删除值
data.drop(['colorado','ohio'])
Out[72]:
one two three four
utah 8 9 10 11
new york 12 13 14 15
# 通过传递axis = 1或axis = 'columns'删除列的值
data.drop('two',axis = 1)
Out[73]:
one three four
ohio 0 2 3
colorado 4 6 7
utah 8 10 11
new york 12 14 15
data.drop(['two','four'],axis = 'columns')
Out[74]:
one three
ohio 0 2
colorado 4 6
utah 8 10
new york 12 14
索引、选取和过滤
obj = pd.Series(np.arange(4.),index = ['a','b','c','d'])
obj
>>>
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
obj[2:4]
Out[76]:
c 2.0
d 3.0
dtype: float64
obj[['b','d']]
Out[77]:
b 1.0
d 3.0
dtype: float64
obj[obj < 2]
Out[78]:
a 0.0
b 1.0
dtype: float64
obj['b':'c'] # 利用标签进行切片时,末端是包含在内的
Out[80]:
b 1.0
c 2.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape(4,4),
index = ['ohio','colorado','utah','new york'],
columns = ['one','two','three','four'])
data
Out[82]:
one two three four
ohio 0 1 2 3
colorado 4 5 6 7
utah 8 9 10 11
new york 12 13 14 15
data[['three','one']]
Out[84]:
three one
ohio 2 0
colorado 6 4
utah 10 8
new york 14 12
data[:2]
Out[85]:
one two three four
ohio 0 1 2 3
colorado 4 5 6 7
data[data['three'] > 5]
Out[87]:
one two three four
colorado 4 5 6 7
utah 8 9 10 11
new york 12 13 14 15
data < 5
Out[88]:
one two three four
ohio True True True True
colorado True False False False
utah False False False False
new york False False False False
data[data <5 ] = 0
data
Out[90]:
one two three four
ohio 0 0 0 0
colorado 0 5 6 7
utah 8 9 10 11
new york 12 13 14 15
用loc和iloc进行选取
data
>>>
one two three four
ohio 0 0 0 0
colorado 0 5 6 7
utah 8 9 10 11
new york 12 13 14 15
# loc使用轴标签索引
data.loc['colorado',['two','three']]
Out[93]:
two 5
three 6
Name: colorado, dtype: int64
# iloc使用整数索引
data.iloc[2,[3,0,1]]
Out[94]:
four 11
one 8
two 9
Name: utah, dtype: int64
data.loc[:'utah','two']
Out[96]:
ohio 0
colorado 5
utah 9
Name: two, dtype: int64
data.iloc[:,:3][data.three > 5]
Out[97]:
one two three
colorado 0 5 6
utah 8 9 10
new york 12 13 14
算术运算和数据对齐
s1 = pd.Series([7.3,-2.5,3.4,1.5],index = ['a','c','d','e'])
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],index = ['a','c','e','f','g'])
s1
>>>
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2
Out[99]:
a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
s1 + s2 # 在不重叠的索引处引入了缺失值,缺失值在算术运算过程中传播
Out[100]:
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1 = pd.DataFrame(np.arange(9.).reshape(3,3),columns = list('bcd'),index = ['ohio','texas','colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape(4,3),columns = list('bde'),index = ['utah','ohio','texas','oregon'])
df1
Out[103]:
b c d
ohio 0.0 1.0 2.0
texas 3.0 4.0 5.0
colorado 6.0 7.0 8.0
df2
Out[104]:
b d e
utah 0.0 1.0 2.0
ohio 3.0 4.0 5.0
texas 6.0 7.0 8.0
oregon 9.0 10.0 11.0
# 相加后会返回一个新的DataFrame,其索引和列为原来两个的并集
df1 + df2
Out[105]:
b c d e
colorado NaN NaN NaN NaN
ohio 3.0 NaN 6.0 NaN
oregon NaN NaN NaN NaN
texas 9.0 NaN 12.0 NaN
utah NaN NaN NaN NaN
df1 - df2
Out[106]:
b c d e
colorado NaN NaN NaN NaN
ohio -3.0 NaN -2.0 NaN
oregon NaN NaN NaN NaN
texas -3.0 NaN -2.0 NaN
utah NaN NaN NaN NaN
在算术方法中填充值
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),columns = list('abcde'))
df2.loc[1,'b'] = np.nan
df1
>>>
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0
df2
Out[108]:
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 NaN 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
df1 + df2 # 没有重叠的位置就会产生NA值
Out[109]:
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
# 使用df1的add方法,传入df2以及fill_value参数
df1.add(df2,fill_value = 0)
Out[110]:
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 5.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1/df1
Out[111]:
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250 0.200000 0.166667 0.142857
2 0.125 0.111111 0.100000 0.090909
df1.rdiv(1) # 等价于1/df1
Out[112]:
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250 0.200000 0.166667 0.142857
2 0.125 0.111111 0.100000 0.090909
DataFrame和Series之间的运算
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),columns =list('bde'),index = ['utah','ohio','texas','orgon'])
series = frame.iloc[0]
frame
>>>
b d e
utah 0.0 1.0 2.0
ohio 3.0 4.0 5.0
texas 6.0 7.0 8.0
orgon 9.0 10.0 11.0
series
Out[114]:
b 0.0
d 1.0
e 2.0
Name: utah, dtype: float64
# DataFrame和Series之间的算术运算会将Series的索引匹配到DataFrame的列,然后沿着行一直向下
frame - series
Out[115]:
b d e
utah 0.0 0.0 0.0
ohio 3.0 3.0 3.0
texas 6.0 6.0 6.0
orgon 9.0 9.0 9.0
# 如果索引值在DataFrame的列或Series的索引中找不到,则参与运算的两个对象会被重新索引形成并集
series2 = pd.Series(range(3),index = list('bef'))
series2
Out[116]:
b 0
e 1
f 2
dtype: int64
frame + series2
Out[117]:
b d e f
utah 0.0 NaN 3.0 NaN
ohio 3.0 NaN 6.0 NaN
texas 6.0 NaN 9.0 NaN
orgon 9.0 NaN 12.0 NaN
# 匹配行索引(axis = 'index'或 axis = 0)
series3 = frame['d']
frame
Out[119]:
b d e
utah 0.0 1.0 2.0
ohio 3.0 4.0 5.0
texas 6.0 7.0 8.0
orgon 9.0 10.0 11.0
series3
Out[120]:
utah 1.0
ohio 4.0
texas 7.0
orgon 10.0
Name: d, dtype: float64
frame.sub(series3,axis = 'index')
Out[121]:
b d e
utah -1.0 0.0 1.0
ohio -1.0 0.0 1.0
texas -1.0 0.0 1.0
orgon -1.0 0.0 1.0
函数应用和映射
frame = pd.DataFrame(np.random.randn(4,3),columns = list('bde'),index = ['utah','ohio','texas','oregon'])
frame
>>>
b d e
utah -1.631864 -2.479669 0.125731
ohio -1.724878 0.811747 -1.425411
texas -0.372374 -0.851630 -1.613718
oregon -1.371739 -0.793874 1.523269
np.abs(frame)
Out[123]:
b d e
utah 1.631864 2.479669 0.125731
ohio 1.724878 0.811747 1.425411
texas 0.372374 0.851630 1.613718
oregon 1.371739 0.793874 1.523269
# 将函数应用到各列或行所形成的一维数组上
f = lambda x : x.max() - x.min()
frame.apply(f)
Out[124]:
b 1.352503
d 3.291416
e 3.136987
dtype: float64
frame.apply(f,axis = 'columns')
Out[126]:
utah 2.605400
ohio 2.536625
texas 1.241344
oregon 2.895008
dtype: float64
def f(x):
    """Return the minimum and maximum of ``x`` as a labelled Series.

    Meant to be passed to ``DataFrame.apply``: because a Series is
    returned rather than a scalar, the result of ``apply`` is a DataFrame
    with one ``'min'`` row and one ``'max'`` row.

    Args:
        x: An object exposing ``min()`` and ``max()`` (here, one column
            or row of a DataFrame).

    Returns:
        A ``pd.Series`` indexed by ``['min', 'max']``.
    """
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
Out[127]:
b d e
min -1.724878 -2.479669 -1.613718
max -0.372374 0.811747 1.523269
format = lambda x: '%.2f' % x
frame.applymap(format)
Out[128]:
b d e
utah -1.63 -2.48 0.13
ohio -1.72 0.81 -1.43
texas -0.37 -0.85 -1.61
oregon -1.37 -0.79 1.52
frame['e'].map(format)
Out[129]:
utah 0.13
ohio -1.43
texas -1.61
oregon 1.52
Name: e, dtype: object
排序和排名
sort_index对行或列索引进行排序(按字典顺序)
obj = pd.Series(range(4),index = ['d','a','b','c'])
obj.sort_index()
>>>
a 1
b 2
c 3
d 0
dtype: int64
frame = pd.DataFrame(np.arange(8).reshape(2,4),index = ['three','two'],columns = ['d','a','b','c'])
frame.sort_index()
Out[131]:
d a b c
three 0 1 2 3
two 4 5 6 7
frame.sort_index(axis = 1) # 按列索引(列标签)排序
Out[132]:
a b c d
three 1 2 3 0
two 5 6 7 4
frame.sort_index(axis = 1,ascending = False) #降序
Out[133]:
d c b a
three 0 3 2 1
two 4 7 6 5
sort_values按值排序
obj = pd.Series([4,7,-3,2])
obj.sort_values()
>>>
2 -3
3 2
0 4
1 7
dtype: int64
# 排序时,任何缺失值默认放在末尾
obj = pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
Out[136]:
4 -3.0
5 2.0
0 4.0
2 7.0
1 NaN
3 NaN
dtype: float64
obj = pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values(ascending = False)
Out[138]:
2 7.0
0 4.0
5 2.0
4 -3.0
1 NaN
3 NaN
dtype: float64
# 根据一个或多列中的值进行排序
frame = pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame
Out[139]:
b a
0 4 0
1 7 1
2 -3 0
3 2 1
frame.sort_values(by = ['a','b'])
Out[140]:
b a
2 -3 0
0 4 0
3 2 1
1 7 1
rank方法
obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()
>>>
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
# 根据值在原数据中出现的顺序给出排名
obj.rank(method = 'first')
Out[142]:
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
obj.rank(method = 'first',ascending = False)
Out[143]:
0 1.0
1 7.0
2 2.0
3 3.0
4 5.0
5 6.0
6 4.0
dtype: float64
frame = pd.DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame
Out[144]:
b a c
0 4.3 0 -2.0
1 7.0 1 5.0
2 -3.0 0 8.0
3 2.0 1 -2.5
frame.rank(axis = 'columns')
Out[145]:
b a c
0 3.0 2.0 1.0
1 3.0 1.0 2.0
2 1.0 2.0 3.0
3 3.0 2.0 1.0
带有重复标签的轴索引
obj = pd.Series(range(5),index = ['a','a','b','b','c'])
obj
>>>
a 0
a 1
b 2
b 3
c 4
dtype: int64
# 确认唯一性
obj.index.is_unique
Out[148]:
False
obj['a']
Out[149]:
a 0
a 1
dtype: int64
obj['c']
Out[150]:
4
df = pd.DataFrame(np.random.randn(4,3),index = ['a','a','b','b'])
df
Out[151]:
0 1 2
a -0.122680 -0.271023 0.792136
a 0.774621 0.111136 -0.964323
b -0.941466 1.843027 0.581256
b 0.358818 -1.205702 0.537188
df.loc['b']
Out[152]:
0 1 2
b -0.941466 1.843027 0.581256
b 0.358818 -1.205702 0.537188
汇总和计算描述统计
df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
index = list('abcd'),
columns = ['one','two'])
df
Out[154]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum()
Out[155]:
one 9.25
two -5.80
dtype: float64
# 按行求和运算
df.sum(axis = 1)
Out[156]:
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
df.mean(axis = 'columns',skipna = False)
Out[157]:
a NaN
b 1.300
c NaN
d -0.275
dtype: float64