pandas
- series
- dataframe
##Series
3. 由一组数据和一组与之相关的数据标签(即索引)组成
obj = Series([4, 7, -5, 3])
obj
0 4
1 7
2 -5
3 3
#表现形式即为索引在左,值在右边
#如果需要获取数组的表现形式和索引对象
#用values获取数值
obj.values
array([4, 7, -5, 3])
#用index获取索引对象
obj.index
Int64Index([0, 1, 2, 3])
#series对每一个数据点标记索引
obj = Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])
obj
a 4
b 7
c -5
d 3
#通过索引方式选取series中的单个或一组值
obj['a']
4
obj['d'] = 6
obj[['a', 'b', 'c', 'd']]
a 4
b 7
c -5
d 6
#通过a索引到值4, 通过d 更改数值为 6
#布尔型数组进行过滤,标量乘法,应用数学函数不改变其索引和值之间的连接
#过滤
obj[obj > 0]
a 4
b 7
d 6
#标量乘法
obj*2
a 8
b 14
c -10
d 12
#函数
np.exp(obj) # a 4 b 7 c -5
a 54.598150
b 1096.633158
c 0.006738
d 403.428793
#可以用做判断,将series视为定长的有序字典
'b' in obj
True
#数据存放在字典里
sdata = {
'ohio': 3500, 'texas': 71000, 'oregon': 16000, 'utah': 5000}
obj = Series(sdata)
obj
ohio 3500
texas 71000
oregon 16000
utah 5000
#若只传入一个字典,则结果series中的索引就是原字典的键(有序排列)
states = ['california', 'ohio', 'oregon', 'texas']
obj = Series(sdata, index=states)
obj
california NaN#not a number 表示缺失值 针对缺失值,可以使用isnull和 notnull 来检测
ohio 3500
oregon 16000
texas 71000
#pandas-isnull
pd.isnull(obj)
california True
ohio False
oregon False
texas False
#pandas-notnull
pd.notnull(obj)
california False
ohio True
oregon True
texas True
#series-isnull
obj.isnull()
california True
ohio False
oregon False
texas False
#series 最重要的功能,在算数运算时会自动对齐
#之前用abcd对应的数字 4 7 -5 3 现在使用index原地替换
obj.index = ['bob', 'steven', 'jeff', 'ryan']
obj
bob 4
steven 7
jeff -5
ryan 3
##dataframe是表格型的数据结构
import pandas as pd
data = {
'state': ['ohio', 'ohio', 'ohio', 'nevada', 'nevada'], 'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
frame
#这个命名错了好几次了,注意前后统一,少了个pd死活过不去啊
state year pop
0 ohio 2000 1.5
1 ohio 2001 1.7
2 ohio 2002 3.6
3 nevada 2001 2.4
4 nevada 2002 2.9
#如果指定了顺序
pd.DataFrame(data, columns=['year', 'state', 'pop'])
year state pop
0 2000 ohio 1.5
1 2001 ohio 1.7
2 2002 ohio 3.6
3 2001 nevada 2.4
4 2002 nevada 2.9
#若出现数据找不到,则表现NA
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five'])
frame2
year state pop debt
one 2000 ohio 1.5 NaN
two 2001 ohio 1.7 NaN
three 2002 ohio 3.6 NaN
four 2001 nevada 2.4 NaN
five 2002 nevada 2.9 NaN
#类字典标记,获取一个series
frame2['state']
one ohio
two ohio
three ohio
four nevada
five nevada
Name: state, dtype: object
#通过赋值的方式进行修改
frame2['debt']=16.5
frame2
year state pop debt
one 2000 ohio 1.5 16.5
two 2001 ohio 1.7 16.5
three 2002 ohio 3.6 16.5
four 2001 nevada 2.4 16.5
five 2002 nevada 2.9 16.5
#精准赋值
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt']=val
frame2
year state pop debt
one 2000 ohio 1.5 NaN
two 2001 ohio 1.7 -1.2
three 2002 ohio 3.6 NaN
four 2001 nevada 2.4 -1.5
five 2002 nevada 2.9 -1.7
#为不存在的列赋值会创建出新的列
frame2['eastern'] = frame2.state == 'ohio'
frame2
year state pop debt eastern
one 2000 ohio 1.5 NaN True
two 2001 ohio 1.7 -1.2 True
three 2002 ohio 3.6 NaN True
four 2001 nevada 2.4 -1.5 False
five 2002 nevada 2.9 -1.7 False
#关键字用del删除即可
del frame2['eastern']
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
#嵌套字典-外层字典的键作为列,内层键作为行
pop = {
'nevada': {
2001: 2.4, 2002: 2.9}, 'ohio': {
2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3
nevada ohio
2001 2.4 1.7
2002 2.9 3.6
2000 NaN 1.5
#转置-横纵坐标对调
frame3.T
2001 2002 2000
nevada 2.4 2.9 NaN
ohio 1.7 3.6 1.5
#内层字典的键会被合并,排序以形成最终的索引
pd.DataFrame(pop, index=[2001, 2002, 2003])
nevada ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN
#还可以给行和列设置属性
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3
state nevada ohio
year
2001 2.4 1.7
2002 2.9 3.6
2000 NaN 1.5
#values 属性也会以ndarray的形式返回
frame3.values
array([[2.4, 1.7],
[2.9, 3.6],
[nan, 1.5]])
#各列的数据类型不同,则值数组的数据类型就会选用能兼容所有列的数据类型
frame2.values
array([[2000, 'ohio', 1.5, nan],
[2001, 'ohio', 1.7, -1.2],
[2002, 'ohio', 3.6, nan],
[2001, 'nevada', 2.4, -1.5],
[2002, 'nevada', 2.9, -1.7]], dtype=object)
重新索引
#reindex---创建一个适应新索引的对象
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
#reindex将会根据新索引进行重排
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
#因为存在空值,令它等于0
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
#对于有序数据,索引时需要插值处理
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
0 blue
2 purple
4 yellow
dtype: object
obj3.reindex(range(6), method='ffill')#前补后
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
obj3.reindex(range(6), method='bfill')#后补前
0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object
#reindex可以修改行或列索引,只传递一个序列时,会重新索引结果的行
import numpy as np
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['ohio', 'texas', 'california'])
frame
ohio texas california
a 0 1 2
c 3 4 5
d 6 7 8
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
ohio texas california
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
#列可以用columns重新索引
states = ['texas', 'utah', 'california']
frame.reindex(columns=states)
texas utah california
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
#丢弃指定值上面的项
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
#drop返回的是一个在指定轴上删除了指定值的新对象
#arange返回的是一个数组
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
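#用drop会从行标签axis 0删除
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['ohio', 'colorado', 'utah', 'newyork'], columns=['one', 'two', 'three', 'four'])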
data.drop(['colorado', 'ohio'])
one two three four
utah 8 9 10 11
newyork 12 13 14 15
#axis=1 或axis= 'columns'可以删除列的值
data.drop('two', axis=1)
one three four
ohio 0