《利用Python 进行数据分析》第五章:Pandas入门

       本文对《利用Python 进行数据分析》(Wes McKinney 著)第五章“pandas 入门”中的代码进行了实验。原书采用的是 Python 2.7,而我采用 Python 3.7 并在 PyCharm 中调试,因此对书中源代码做了一定的修改;每步的打印结果(除与“随机”相关的以外)均与原文对照校验一致(输出结果写在注释中,简单的输出没有写结果)。代码全部手工敲写,供参考。

       Pdf文档和数据集参见:《利用Python 进行数据分析》第二章:引言中的分析代码(含pdf和数据集下载链接)

       因为代码过长,放在一个代码段中显得冗长,因此进行了拆分,如下的库引入每个代码段中均可能有必要。

# -*- coding:utf-8 -*-

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

1、pandas 的数据结构介绍

1.1 Series

obj = Series([4, 7, -5, 3])
print(obj)
'''
0    4
1    7
2   -5
3    3
dtype: int64
'''
# 获取Series的值(values数组), [ 4  7 -5  3]
print(obj.values) 

# 获取Series的索引, RangeIndex(start=0, stop=4, step=1)
print(obj.index)
obj2 = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c']) # 创建Series
print(obj2)
'''
d    4
b    7
a   -5
c    3
dtype: int64
'''
print(obj2.index) # Index(['d', 'b', 'a', 'c'], dtype='object')
print(obj2['a'])  # -5
obj2['d'] = 6
print(obj2[['c','a','d']])
'''
c    3
a   -5
d    6
dtype: int64
'''
print(obj2)
'''
d    6
b    7
a   -5
c    3
dtype: int64
'''
# Numpy数组运算都会保留索引和值之间的链接
print(obj2[obj2 > 0]) 
'''
d    6
b    7
c    3
dtype: int64
'''
print(obj2 * 2)
'''
d    12
b    14
a   -10
c     6
dtype: int64
'''
print(np.exp(obj2))
'''
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
'''
# Series 可以看成一个定长的有序字典
print('b' in obj2) # True
print('e' in obj2) # False

sdata = {'Ohio':35000, 'Texas': 71000, 'Oregon': 16000, 'Utah':5000}
obj3 = Series(sdata)
print(obj3)
'''
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
'''
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index = states)
print(obj4)
'''
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
'''
print(pd.isnull(obj4)) # 判断是否为null
'''
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
'''
print(pd.notnull(obj4)) # 判断是否非空
'''
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
'''
# Series中也有类似Numpy的方法
print(obj4.isnull()) 
'''
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
'''
# 两个Series求和,自动对齐不同索引
print(obj3 + obj4) 
'''
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
'''
# Series对象本身及其索引都有name属性
obj4.name = 'population' 
obj4.index.name = 'state'
print(obj4)
'''
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
'''
print(obj.index) # RangeIndex(start=0, stop=4, step=1)

#Series的索引可以通过赋值的方式修改
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan'] 
print(obj.index) # Index(['Bob', 'Steve', 'Jeff', 'Ryan'], dtype='object')

1.2 DataFrame

# 构建Dataframe,传入等长列表或Numpy数组组成字典
data = {'state' : ['California', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}  
# Dataframe 会自动添加索引
frame = DataFrame(data) 
print(frame)
'''
        state  year  pop
0  California  2000  1.5
1        Ohio  2001  1.7
2        Ohio  2002  3.6
3      Nevada  2001  2.4
4      Nevada  2002  2.9
'''
# 指定列顺序,则dataframe按给定顺序排列
frame1 = DataFrame(data, columns=['year', 'state', 'pop']) 
print(frame1)
'''
   year       state  pop
0  2000  California  1.5
1  2001        Ohio  1.7
2  2002        Ohio  3.6
3  2001      Nevada  2.4
4  2002      Nevada  2.9
'''
# 如果传入的列在数据中找不到,就会产生NA值
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index = ['one','two','three','four','five']) 
print(frame2)
'''
       year       state  pop debt
one    2000  California  1.5  NaN
two    2001        Ohio  1.7  NaN
three  2002        Ohio  3.6  NaN
four   2001      Nevada  2.4  NaN
five   2002      Nevada  2.9  NaN
'''
print(frame2.columns) # Index(['year', 'state', 'pop', 'debt'], dtype='object')

# 可通过字典或者属性的方式获取为一个Series
print(frame2['state']) 
'''
one      California
two            Ohio
three          Ohio
four         Nevada
five         Nevada
Name: state, dtype: object
'''
# 如果取单个字段有两个中括号,则得到一个DataFrame,这个有时会用到,注意与单个中括号区分
print(frame2[['state']])
'''
            state
one    California
two          Ohio
three        Ohio
four       Nevada
five       Nevada
'''
print(frame2.year)
'''
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
'''
# 按标签取行,与下面的 frame2.iloc[2](按位置取行)结果相同;ix 已在 pandas 1.0 中移除(与Python版本无关)
print(frame2.loc['three']) 
print(frame2.iloc[2])
'''
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
'''
frame2['debt'] = 16.5 # 给某个列赋值
print(frame2)
'''
       year       state  pop  debt
one    2000  California  1.5  16.5
two    2001        Ohio  1.7  16.5
three  2002        Ohio  3.6  16.5
four   2001      Nevada  2.4  16.5
five   2002      Nevada  2.9  16.5
'''
frame2['debt'] = np.arange(5.)
print(frame2)
'''
       year       state  pop  debt
one    2000  California  1.5   0.0
two    2001        Ohio  1.7   1.0
three  2002        Ohio  3.6   2.0
four   2001      Nevada  2.4   3.0
five   2002      Nevada  2.9   4.0
'''

val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
# 赋值是一个Series,就会精确匹配Dataframe索引
frame2['debt'] = val 
print(frame2)
'''
       year       state  pop  debt
one    2000  California  1.5   NaN
two    2001        Ohio  1.7  -1.2
three  2002        Ohio  3.6   NaN
four   2001      Nevada  2.4  -1.5
five   2002      Nevada  2.9  -1.7
'''
# 为不存在的列赋值,会增加新的列
frame2['eastern'] = frame2.state == 'Ohio' 
print(frame2)
'''
       year       state  pop  debt  eastern
one    2000  California  1.5   NaN    False
two    2001        Ohio  1.7  -1.2     True
three  2002        Ohio  3.6   NaN     True
four   2001      Nevada  2.4  -1.5    False
five   2002      Nevada  2.9  -1.7    False
'''

del frame2['eastern'] # 删除某列
print(frame2.columns) # Index(['year', 'state', 'pop', 'debt'], dtype='object')

# 通过嵌套字典创建Dataframe,外层字典键作为列,内层键作为行索引
pop = {'Nevada':{2001:2.4, 2002:2.9},
       'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}} 
frame3 = DataFrame(pop, index = [2000,2001,2002]) # df.sort_index 可以将dataframe按索引排序
print(frame3)
'''
       Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
'''
# 对Dataframe求转置
print(frame3.T) 
'''
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6
'''
# 显式指定索引
frame33 = DataFrame(pop, index = [2001, 2002, 2003]) 
print(frame33)
'''
      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2003     NaN   NaN
'''
pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada':frame3['Nevada'][:2]}
frame4 = DataFrame(pdata)
print(frame4)
print(frame3['Ohio'][:-1])
print(frame3['Nevada'][:2])
'''
      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4
'''
# 设置dataframe的index和columns的name属性后,这些信息也会被显示出来
frame3.index.name = 'year';frame3.columns.name = 'state' 
print(frame3)
'''
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
'''
print(frame3.values)
'''
[[nan 1.5]
 [2.4 1.7]
 [2.9 3.6]]
'''

1.3 索引对象

# 构建Series或Dataframe时,用到的数组或序列标签都会被转成Index
obj = Series(range(3), index = ['a', 'b', 'c']) 
index = obj.index
print(index) # Index(['a', 'b', 'c'], dtype='object')
print(index[1:]) # Index(['b', 'c'], dtype='object')
# index[1] = 'd' # 不允许, Index对象是不可修改的
index = pd.Index(np.arange(3))
obj2 = Series([-1.5, -2.5, 0], index = index)
print(obj2.index is index) # True

print(frame3)
'''
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6'''

# 除了长得像数组,Index的功能也类似一个固定大小的集合
print('Ohio' in frame3.columns) # True
print(2003 in frame3.index) # False

2、基本功能

2.1 重新索引

# Series的reindex将会根据新索引进行重排,如果某个索引值当前不存在,则引入缺失值(NaN)
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d','b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2) 
'''
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
'''
obj_reindex = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value= 0) # 不会影响obj的值
print(obj_reindex)
'''
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
'''
print(obj)
'''
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
'''
# 对于时间序列这类有序数据,重新索引时可能需要做插值(填充),用method选项完成;ffill表示向前填充
obj3 = Series(["blue", "purple", "yellow"], index = [0,2,4]) 
obj3 = obj3.reindex(range(6), method = 'ffill')
print(obj3)
'''
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
'''
frame = DataFrame(np.arange(9).reshape((3,3)), index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'California'])
print(frame)
'''
    Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
'''
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
'''
    Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
'''
states = ['Texas', 'Utah', 'California']
frame_reindex = frame.reindex(columns = states)
print(frame_reindex)
'''
    Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8
'''
frame_reindex_both = frame.reindex(index = ['a', 'b', 'c', 'd'],columns = states).ffill() # 与原文有改动
print(frame_reindex_both)
'''
    Texas  Utah  California
a    1.0   NaN         2.0
b    1.0   NaN         2.0
c    4.0   NaN         5.0
d    7.0   NaN         8.0
'''

2.2 丢弃指定轴上的项

# Build a float Series labelled 'a'..'e' to demonstrate drop().
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
# Print the starting Series so the output listed below is actually produced.
print(obj)
'''
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
'''
# drop() returns a new Series without the given label; obj is left untouched.
new_obj = obj.drop('c')
print(new_obj)
'''
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
'''
# Passing a list of labels drops several entries at once.
new_obj2 = obj.drop(['d', 'c'])
print(new_obj2)
'''
a    0.0
b    1.0
e    4.0
dtype: float64
'''
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
# Without an axis argument, drop() removes row labels.
new_data = data.drop(['Colorado', 'Ohio'])
print(new_data)
'''
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
'''
# axis=1 removes a column; equivalent to data.drop(columns=['two']).
new_data2 = data.drop('two', axis=1)
print(new_data2)

'''
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
'''

2.3 索引、选取和过滤

# Series indexing: by position, by label, by boolean mask and by label slice.
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
'''
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
'''
# Positional slice; .iloc makes it explicit that 2 and 4 are positions,
# not labels (the end position is excluded).
print(obj.iloc[2:4])
'''
c    2.0
d    3.0
dtype: float64'''
# A list of labels selects (and reorders) by label.
print(obj[['b', 'a', 'd']])
'''
b    1.0
a    0.0
d    3.0
dtype: float64
'''
# Integer positions must go through .iloc: treating integer keys as positions
# in plain obj[...] on a label-indexed Series is deprecated in recent pandas.
print(obj.iloc[[1, 3]])
'''
b    1.0
d    3.0
dtype: float64
'''
# Boolean mask selection.
print(obj[obj < 2])
'''
a    0.0
b    1.0
dtype: float64
'''
# Unlike NumPy/positional slicing, a label slice includes its end point.
print(obj['b':'d'])
'''
b    1.0
c    2.0
d    3.0
dtype: float64
'''
# Assigning to a label slice updates every element in the (inclusive) range.
obj['b':'c'] = 5
print(obj)
'''
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64
'''
data = DataFrame(np.arange(16).reshape((4,4)),
                 index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns = ['one', 'two', 'three', 'four'])
print(data)
'''
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
'''
print(data['two'])
'''
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
'''
print(data[['three', 'one']])
'''
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
'''
# 通过切片或者布尔型数组选取行
print(data[:2]) 
'''
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
'''
print(data[data['three'] > 5])
'''
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
'''
print(data < 5)
'''
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
'''
data[data < 5] = 0
print(data)
'''
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
'''

# 通过NumPy式的标记法以及轴标签选取df行和列的子集
print(data.loc['Colorado', ['two', 'three']]) 
'''
two      5
three    6
Name: Colorado, dtype: int32
'''
# 利用行、列的整数位置选取(iloc);ix 已在 pandas 1.0 中移除(与Python版本无关)
print(data.iloc[[1, 2], [3, 0, 1]])  
# print(data.loc[['Colorade', 'Utah'], [3, 0, 1]]) #运行出错
'''
          four  one  two
Colorado     7    0    5
Utah        11    8    9
'''
print(data.iloc[2])
'''
one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
'''
print(data.loc[:'Utah', 'two'])
'''
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32
'''
print(data.loc[data.three > 5, "one":"three"]) # python3中做了修改
'''
          one  two  three
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14
'''

2.4 算术运算和数据对齐

s1 = Series([-7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'd'])
# 自动的数据对齐操作在不重叠的索引处引入了NA值
print(s1 + s2) 
'''
a   -9.4
c    1.1
d    6.5
e    0.0
f    NaN
dtype: float64
'''
df1 = DataFrame(np.arange(9).reshape((3,3)), columns = list('bcd'),
                index = ['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
'''
          b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
'''
print(df2)
'''
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
'''
print(df1 + df2)
'''
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN
'''

2.5 在算术方法中填充值

df1 = DataFrame(np.arange(12).reshape((3,4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4,5)), columns = list('abcde'))
print(df1)
'''
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
'''
print(df2)
'''
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
'''
# 相加时, 没有重叠的位置会产生NA值
print(df1 + df2) 
'''
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0  11.0  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
'''
# 将df2用0填充后再相加
df_add  = df1.add(df2, fill_value = 0) 
print(df_add)
'''
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0  11.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
'''
# 对Series或Dataframe重新索引时,可以指定一个填充值
df1_new = df1.reindex(columns = df2.columns, fill_value= 0)
print(df1_new) 
'''
   a  b   c   d  e
0  0  1   2   3  0
1  4  5   6   7  0
2  8  9  10  11  0
'''

2.6 DataFrame和Series之间的运算

arr = np.arange(12.).reshape((3,4))
print(arr)
print(arr[0]) # [0. 1. 2. 3.]
print(arr - arr[0])
'''
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]
'''
frame = DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
'''
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
'''
series = frame.iloc[0]
print(series)
'''
b    0
d    1
e    2
'''
# 根据广播机制向下运算
print(frame - series) 
'''
        b  d  e
Utah    0  0  0
Ohio    3  3  3
Texas   6  6  6
Oregon  9  9  9
'''
series2 = Series(range(3), index = ['b', 'e', 'f'])
print(series2)
'''
b    0
e    1
f    2
'''
print(frame + series2)
'''
          b   d     e   f
Utah    0.0 NaN   3.0 NaN
Ohio    3.0 NaN   6.0 NaN
Texas   6.0 NaN   9.0 NaN
Oregon  9.0 NaN  12.0 NaN
'''
series3 = frame['d']
print(series3)
'''
Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int32
'''
frame_sub = frame.sub(series3, axis = 0)
print(frame_sub)
'''
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1
'''

2.7 函数应用和映射

# NumPy element-wise functions (ufuncs) also work on pandas objects.
# The frame holds random values, so the outputs listed below are sample
# results from one run and will differ from run to run.
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
'''
               b         d         e
Utah   -1.203000 -0.755058 -0.964232
Ohio   -0.751273 -0.108269  1.673861
Texas   1.396867  1.603542  0.345961
Oregon  0.592793 -1.212553  0.005406
'''
print(np.abs(frame))
'''
               b         d         e
Utah    1.203000  0.755058  0.964232
Ohio    0.751273  0.108269  1.673861
Texas   1.396867  1.603542  0.345961
Oregon  0.592793  1.212553  0.005406
'''
f = lambda x: x.max() - x.min()
# Default axis=0: f receives each column, giving one result per column.
print(frame.apply(f))
'''
b    2.599867
d    2.816095
e    2.638093
'''
# axis=1: f receives each row, giving one result per row label.
print(frame.apply(f, axis=1))
'''
Utah      0.447942
Ohio      2.425134
Texas     1.257581
Oregon    1.805346
'''


def f(x):
    """Return the minimum and maximum of *x* as a Series labelled 'min'/'max'.

    A function passed to DataFrame.apply may return a Series; the results
    are then assembled into a DataFrame.
    """
    return Series([x.min(), x.max()], index=['min', 'max'])


print(frame.apply(f))
'''
            b         d         e
min -1.203000 -1.212553 -0.964232
max  1.396867  1.603542  1.673861
'''


def format_2f(x):
    """Format a number as a string with two decimal places."""
    # Named format_2f rather than format so the builtin is not shadowed.
    return '%.2f' % x


# Format every float in the frame. Mapping column by column avoids
# DataFrame.applymap, which is deprecated in recent pandas releases.
print(frame.apply(lambda col: col.map(format_2f)))
'''
            b      d      e
Utah    -1.20  -0.76  -0.96
Ohio    -0.75  -0.11   1.67
Texas    1.40   1.60   0.35
Oregon   0.59  -1.21   0.01
'''

# Series.map applies an element-wise function to a single column.
print(frame['e'].map(format_2f))
'''
Utah      -0.96
Ohio       1.67
Texas      0.35
Oregon     0.01
Name: e, dtype: object
'''

2.8 排序和排名

obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
# 不会改变原有obj,加上inplace = True才会改变原来的obj
obj_sort = obj.sort_index() 
print(obj_sort)
'''
a    1
b    2
c    3
d    0
'''
frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three', 'one'],
                  columns = ['d', 'a', 'b', 'c'])
# 按行名排列
frame_sort_index= frame.sort_index() 
print(frame_sort_index)
'''
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
'''
# 按列名排列
frame_sort_columns = frame.sort_index(axis = 1) 
print(frame_sort_columns)
'''
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
'''
frame_sort_descend = frame.sort_index(axis = 1, ascending= False) # 降序排列
print(frame_sort_descend)
'''
       d  c  b  a
three  0  3  2  1
one    4  7  6  5
'''

# 对Series按值排序,使用sort_values方法(原书使用的是order方法)
obj = Series([4, 7, -3, 2]) 

# 新版pandas已移除order方法(与Python版本无关),改用sort_values()
obj_order = obj.sort_values() 
print(obj_order)
'''
2   -3
3    2
0    4
1    7
dtype: int64
'''
# 排序时,任何缺失值(NaN)默认都会被放到Series的末尾
obj = Series([4, np.nan, 7, np.nan, -3, 2]) 
print(obj.sort_values())
'''
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
'''
frame = DataFrame({'b':[4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
'''
   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
'''
# 按某列的值进行排序
frame_sort = frame.sort_values(by = ['b'])  
print(frame_sort)
'''
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
'''
frame_sort_multi = frame.sort_values(by = ['a', 'b'])
print(frame_sort_multi)
'''
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1
'''

obj = Series([7, -5, 7, 4, 2, 0, 4])
# rank会增设一个排名值,从1开始,一直到数组中所有有效数据的数量
print(obj.rank()) 
'''
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
'''
# 可以根据值在原数据中出现的顺序给出排名
print(obj.rank(method = 'first')) 
'''
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
'''
print(obj.rank(ascending = False, method = 'max'))
'''
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
'''
frame = DataFrame({'b':[4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                  'c':[-2, 5, 8, -2.5]})
# axis=1:在每一行内部对各列的值计算排名
print(frame.rank(axis = 1)) 
'''
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0
'''

2.9 带有重复值的轴索引

obj = Series(range(5), index = ['a', 'a', 'b' ,'b' ,'c'])
print(obj)
'''
a    0
a    1
b    2
b    3
c    4
dtype: int64
'''
# 判断索引值是否是唯一的
print(obj.index.is_unique) # False 
# 带有重复值的索引,返回一个Series
print(obj['a'])  
'''
a    0
a    1
dtype: int64
'''
# 对应单个值的,则返回一个标量值
print(obj['c'])  # 4

df = DataFrame(np.random.randn(4,3), index = ['a', 'a', 'b', 'b'])
print(df)
'''
          0         1         2
a  0.618408 -1.495024 -1.569955
a  0.959323  0.444813 -0.070000
b  0.306775 -0.155840  1.806228
b -0.601667  0.029473  0.114349
'''
# 对于DataFrame的行进行索引时也与上述类似
print(df.loc['b']) 
'''
          0         1         2
b  0.306775 -0.155840  1.806228
b -0.601667  0.029473  0.114349
'''

3、汇总和计算描述统计

3.1 和、均值等计算

df = DataFrame([[1.4, np.nan], [7.1, -4.5],
               [np.nan, np.nan], [0.75, -1.3]],
               index = ['a','b','c','d'],
               columns = ['one', 'two'])
print(df)
'''
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
'''
# 对每一列求和(axis=0,沿行方向自上而下累加),返回以列名为索引的Series
print(df.sum()) 
'''
one    9.25
two   -5.80
dtype: float64
'''
# 对每一行求和(axis=1,横向跨列累加),NA值会自动被排除;与原文有点不同,原文c行的值为NaN
print(df.sum(axis = 1)) 
'''
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
'''
# skipna=False 可禁用“自动排除NA值”的功能,此时只要一行中含有NA,结果即为NaN
print(df.mean(axis=1, skipna=False)) 
'''
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
'''
# 返回每列中的最大值索引
print(df.idxmax()) 
'''
one    b
two    d
dtype: object
'''
# 沿行方向(自上而下)对每一列求累计和
print(df.cumsum()) 
'''
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
'''
# 返回汇总统计
print(df.describe()) #
'''
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
'''

# 对于非数值型,describe会产生另外一种汇总统计
obj = Series(['a', 'a', 'b', 'c'] * 4) 
print(obj.describe())
'''
count     16
unique     3
top        a
freq       8
dtype: object
'''

3.2 相关系数和协方差

import pandas_datareader.data as web # 相比原文有修改
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
returns = price.pct_change()
print(returns.tail())
'''
                AAPL       IBM      MSFT      GOOG
Date                                              
2009-12-24  0.034339  0.004385  0.002587  0.011117
2009-12-28  0.012294  0.013326  0.005484  0.007098
2009-12-29 -0.011861 -0.003477  0.007058 -0.005571
2009-12-30  0.012147  0.005461 -0.013699  0.005376
2009-12-31 -0.004299 -0.012597 -0.015504 -0.004416
'''
# 计算两个Series相关系数
print(returns.MSFT.corr(returns.IBM)) # 0.4943579912733758
# 计算协方差
print(returns.MSFT.cov(returns.IBM))  # 0.0002158213155918
# corr()返回完整的相关系数
print(returns.corr()) # 
'''
          AAPL       IBM      MSFT      GOOG
AAPL  1.000000  0.412392  0.423598  0.470676
IBM   0.412392  1.000000  0.494358  0.390688
MSFT  0.423598  0.494358  1.000000  0.443586
GOOG  0.470676  0.390688  0.443586  1.000000
'''
# cov()返回协方差矩阵
print(returns.cov())  
'''
          AAPL       IBM      MSFT      GOOG
AAPL  0.001030  0.000254  0.000309  0.000303
IBM   0.000254  0.000369  0.000216  0.000142
MSFT  0.000309  0.000216  0.000516  0.000205
GOOG  0.000303  0.000142  0.000205  0.000580
'''
# 计算列或者行与另一Series或DataFrame之间的相关系数
print(returns.corrwith(returns.IBM)) 
'''
AAPL    0.412392
IBM     1.000000
MSFT    0.494358
GOOG    0.390688
dtype: float64
'''
# 传入DataFrame会计算按列名匹配的相关系数
print(returns.corrwith(volume)) 
'''
AAPL   -0.057665
IBM    -0.006592
MSFT   -0.014228
GOOG    0.062647
dtype: float64
'''

3.3 唯一值、值计算以及成员资格

obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
# unique() returns the array of distinct values, in order of first appearance.
uniques = obj.unique()
print(uniques)  # ['c' 'a' 'd' 'b']

# value_counts() returns how often each value occurs, sorted by count
# in descending order.
print(obj.value_counts())
'''
a    3
c    3
b    2
d    1
dtype: int64
'''
# sort=False skips the sort by count. The Series method is used instead of
# the top-level pd.value_counts, which is deprecated in recent pandas.
# NOTE: the row order listed below is from the author's run; with sort=False
# it may differ between pandas versions.
print(obj.value_counts(sort=False))
'''
b    2
a    3
d    1
c    3
dtype: int64
'''
# isin() performs a vectorised membership test; the resulting boolean mask
# can be used to select a subset of a Series or of a DataFrame column.
mask = obj.isin(['b', 'c'])
print(mask)
'''
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
'''
print(obj[mask])
'''
0    c
5    b
6    b
7    c
8    c
dtype: object
'''
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
'''
   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4
'''
# Build a histogram table over several columns at once: each row label is a
# distinct value found in data, each column is an original column, and each
# cell is how many times that value occurs in that column. A value that
# never occurs in a column comes out as NaN, so fillna(0) turns it into a
# count of zero (this is what yields the 0.0 entries shown below).
result = data.apply(pd.Series.value_counts).fillna(0)
print(result)
'''
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0
'''

4、处理缺失数据

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
'''
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
'''
print(string_data.isnull())
'''
0    False
1    False
2     True
3    False
dtype: bool
'''
string_data[0] = None
# Python 内置的None也会被当做NA处理
print(string_data.isnull())  
'''
0     True
1    False
2     True
3    False
dtype: bool
'''

4.1 滤除缺失数据

# 如果增加inplace = True 则会对原始的data更改
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data_no_nan =data.dropna() 
print(data_no_nan)
'''
0    1.0
2    3.5
4    7.0
dtype: float64
'''
#通过布尔型索引也可以实现
print(data[data.notnull()]) 
'''
0    1.0
2    3.5
4    7.0
dtype: float64
'''
data = DataFrame([[1., 6.5, 3.],[1., NA, NA],
                 [NA, NA, NA],[NA, 6.5, 3.]])
#丢弃任何含有缺失值的行
cleaned = data.dropna() 
print(data)
'''
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''
print(cleaned)
'''
     0    1    2
0  1.0  6.5  3.0
'''
# 只丢弃全为NA的那些行
data_new = data.dropna(how = 'all') 
print(data_new)
'''
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
'''
data[4] = NA
print(data)
'''
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
'''
# 当某列全为NA时丢弃
data_new2 = data.dropna(axis = 1, how = 'all') 
print(data_new2)
'''
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
'''
df = DataFrame(np.random.randn(7,3))
df.iloc[:5, 1] = NA
df.iloc[:3, 2] = NA
print(df)
'''
          0         1         2
0 -0.776768       NaN       NaN
1 -0.432202       NaN       NaN
2  0.633459       NaN       NaN
3 -0.671694       NaN  1.433947
4  1.109140       NaN -0.646371
5  1.097845  0.128552 -0.392997
6  0.212458 -0.700758  1.566229
'''
# thresh=3:只保留至少含有3个非NA值的行,其余的行整行删除
df_drop = df.dropna(thresh = 3) 
print(df_drop)
'''
          0         1         2
5  1.097845  0.128552 -0.392997
6  0.212458 -0.700758  1.566229
'''

4.2 填充缺失数据

# 用0填充缺失值
df_fill_0 = df.fillna(0)  
print(df_fill_0)
'''
          0         1         2
0 -0.776768  0.000000  0.000000
1 -0.432202  0.000000  0.000000
2  0.633459  0.000000  0.000000
3 -0.671694  0.000000  1.433947
4  1.109140  0.000000 -0.646371
5  1.097845  0.128552 -0.392997
6  0.212458 -0.700758  1.566229
'''
# 用dict填充缺失值
df_fill_dict = df.fillna({1:0.5, 3:-1}) 
print(df_fill_dict)
'''
          0         1         2
0 -0.776768  0.500000       NaN
1 -0.432202  0.500000       NaN
2  0.633459  0.500000       NaN
3 -0.671694  0.500000  1.433947
4  1.109140  0.500000 -0.646371
5  1.097845  0.128552 -0.392997
6  0.212458 -0.700758  1.566229
'''
# 在原数据上填充
_ = df.fillna(0, inplace= True) 
print(df)
'''
          0         1         2
0 -0.776768  0.000000  0.000000
1 -0.432202  0.000000  0.000000
2  0.633459  0.000000  0.000000
3 -0.671694  0.000000  1.433947
4  1.109140  0.000000 -0.646371
5  1.097845  0.128552 -0.392997
6  0.212458 -0.700758  1.566229
'''
# Random 6x3 frame with NA planted at the bottom of columns 1 and 2.
# The values are random, so the outputs listed below are sample results.
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)
'''
          0         1         2
0 -1.107271  0.148246 -0.623761
1 -0.805279 -1.708179  0.804398
2  0.241048       NaN  0.439521
3 -0.811178       NaN  0.360617
4  0.145371       NaN       NaN
5  0.450989       NaN       NaN
'''
# Forward fill: each missing value takes the last valid value above it in
# the same column. ffill() is used instead of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
df_fill = df.ffill()
print(df_fill)
'''
          0         1         2
0 -1.107271  0.148246 -0.623761
1 -0.805279 -1.708179  0.804398
2  0.241048 -1.708179  0.439521
3 -0.811178 -1.708179  0.360617
4  0.145371 -1.708179  0.360617
5  0.450989 -1.708179  0.360617
'''
# limit caps how many consecutive missing values are filled (here at most 2),
# so the last two entries of column 1 stay NaN.
df_fill_limit2 = df.ffill(limit=2)
print(df_fill_limit2)
'''
          0         1         2
0 -1.107271  0.148246 -0.623761
1 -0.805279 -1.708179  0.804398
2  0.241048 -1.708179  0.439521
3 -0.811178 -1.708179  0.360617
4  0.145371       NaN  0.360617
5  0.450989       NaN  0.360617
'''

data = Series([1., NA, 3.5, NA, 7])
# 用平均值进行缺失值的填充
data_fill_mean = data.fillna(data.mean()) 
print(data_fill_mean)
'''
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64
'''

5、层次化索引

# 层次化索引使你能在一个轴上拥有多个(两个以上)索引级别
data = Series(np.random.randn(10),
              index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                       [1,2,3,1,2,3,1,2,2,3]]) 
print(data)
'''
a  1    0.948250
   2    0.186262
   3    0.167854
b  1   -0.349104
   2   -0.911277
   3    0.790769
c  1   -0.348525
   2   -0.047111
d  2   -1.302505
   3    0.392822
dtype: float64
'''
print(data.index)
'''
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
'''
print(data['b'])
'''
1   -0.349104
2   -0.911277
3    0.790769
dtype: float64
'''
print(data['b':'c'])
'''
b  1   -0.349104
   2   -0.911277
   3    0.790769
c  1   -0.348525
   2   -0.047111
dtype: float64
'''
print(data.loc[['b', 'd']])
'''
b  1   -0.349104
   2   -0.911277
   3    0.790769
d  2   -1.302505
   3    0.392822
dtype: float64
'''
# "内层”中进行选取
print(data[:, 2])  
'''
a    0.186262
b   -0.911277
c   -0.047111
d   -1.302505
'''
# 重新安排到一个DataFrame中
print(data.unstack()) 
'''
          1         2         3
a  0.948250  0.186262  0.167854
b -0.349104 -0.911277  0.790769
c -0.348525 -0.047111       NaN
d       NaN -1.302505  0.392822
'''
# unstack 是stack逆运算
print(data.unstack().stack()) 
'''
a  1    0.948250
   2    0.186262
   3    0.167854
b  1   -0.349104
   2   -0.911277
   3    0.790769
c  1   -0.348525
   2   -0.047111
d  2   -1.302505
   3    0.392822
dtype: float64
'''
# 对于DataFrame, 每条轴都可以有分层索引
frame = DataFrame(np.arange(12).reshape((4,3)),
                  index = [['a', 'a', 'b', 'b'], [1,2,1,2]],
                  columns = [['Ohio', 'Ohio', 'Colorado'],
                             ['Green', 'Red', 'Green']]) 
print(frame)
'''
     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
'''
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)
'''
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
'''
# 通过部分的列索引(只给出外层标签),可以轻松选取列分组
print(frame['Ohio']) 
'''
color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
'''

5.1 重排分级顺序

# frame由"层次化索引一节"得到; 互换两个级别
frame_swap_lev = frame.swaplevel('key1', 'key2') 
print(frame_swap_lev)
'''
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
'''

5.2 根据级别汇总统计

# `frame` is the hierarchically indexed DataFrame built in the
# "hierarchical indexing" section (row levels key1/key2, column levels
# state/color). Sum over one level of the rows or of the columns.
# DataFrame.sum(level=...) was removed in pandas 2.0; grouping by the index
# level and summing is the replacement.
print(frame.groupby(level='key2').sum())
'''
state  Ohio     Colorado
color Green Red    Green
key2                    
1         6   8       10
2        12  14       16
'''

# Sum by the 'color' level of the columns: transpose so the column levels
# become row levels, group and sum on 'color', then transpose back.
print(frame.T.groupby(level='color').sum().T)
'''
color      Green  Red
key1 key2            
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10
'''

5.3 使用DataFrame的列

frame = DataFrame({'a':range(7),'b':range(7,0,-1),
                   'c':['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd':[0, 1, 2, 0, 1, 2, 3]})
print(frame)
'''
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
'''
# 将一个或多个列转化为行索引,并创建一个新的DataFrame
frame2 = frame.set_index(['c', 'd']) 
print(frame2)
'''
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
'''
# 将被转换的列保留
frame3 = frame.set_index(['c', 'd'], drop = False) 
print(frame3)
'''
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
'''
# 与set_index()刚好相反,层次化索引的级别会被转移到里面
frame2_reset = frame2.reset_index() 
print(frame2_reset)
'''
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1
'''
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

南洲.

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值