包含全部示例的代码仓库见GitHub
1 导入库
import pandas as pd
import numpy as np
2 新建DataFrame
2.1 示例1
新建DataFrame
data = pd.DataFrame({'a':[1,2,3,4],
'b':list('abcd'),
'c':9,
'd':pd.Series([5,6,7,8])})
data # abcd列索引
# output
a b c d
0 1 a 9 5
1 2 b 9 6
2 3 c 9 7
3 4 d 9 8
列索引
data.columns # 列索引
# output
Index(['a', 'b', 'c', 'd'], dtype='object')
行索引
data.index # 行索引
# output
RangeIndex(start=0, stop=4, step=1)
2.2 示例2
新建DataFrame
xuhao = ['one','two','three','four']
df = pd.DataFrame(np.random.randn(4,3), index=xuhao, columns=['a','b','c'])
df
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
four -1.125733 -0.765275 1.492720
行索引和列索引
df.index
# output
Index(['one', 'two', 'three', 'four'], dtype='object')
df.columns
# output
Index(['a', 'b', 'c'], dtype='object')
返回单列
df['a']
# output
one -0.327567
two 0.577296
three 0.297807
four -1.125733
Name: a, dtype: float64
df.b
# output
one -0.190464
two 0.846016
three 1.177698
four -0.765275
Name: b, dtype: float64
返回多列
df[['a', 'c']] #返回多列
# output
a c
one -0.327567 0.489604
two 0.577296 -1.272735
three 0.297807 -0.182066
four -1.125733 1.492720
单列是Series,多列是DataFrame
type(df.b)
# output
pandas.core.series.Series
type(df[['a', 'c']])
# output
pandas.core.frame.DataFrame
返回前3行
df.head(3)
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
返回最后2行
df.tail(2)
# output
a b c
three 0.297807 1.177698 -0.182066
four -1.125733 -0.765275 1.492720
返回array
df.values #返回array
# output
array([[-0.3275668 , -0.19046443, 0.48960352],
[ 0.57729616, 0.8460159 , -1.27273538],
[ 0.29780732, 1.17769787, -0.18206624],
[-1.1257328 , -0.76527496, 1.49272042]])
3 loc按索引取值
df
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
four -1.125733 -0.765275 1.492720
返回单行
df.loc['one']
# output
a -0.327567
b -0.190464
c 0.489604
Name: one, dtype: float64
返回多行
df.loc[['one','four']] #取多个索引,需要输入一个列表
# output
a b c
one -0.327567 -0.190464 0.489604
four -1.125733 -0.765275 1.492720
切片返回多行
df.loc['one':'three'] # 包含最后一行(标签切片前后都包含)
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
切片返回多行多列
df.loc['one':'three',['a','b']]
# output
a b
one -0.327567 -0.190464
two 0.577296 0.846016
three 0.297807 1.177698
返回单列
df.a
# output
one -0.327567
two 0.577296
three 0.297807
four -1.125733
Name: a, dtype: float64
切片返回多行多列
df.loc['one':'three','a':'c']
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
用列表选取多行,用切片选取多列
df.loc[['one','four'], 'a':'c']
# output
a b c
one -0.327567 -0.190464 0.489604
four -1.125733 -0.765275 1.492720
返回单个值,前面是行,后面是列
df.loc['two','b'] # 前面是行,后面是列
# output
0.8460159027597443
切片返回多行多列,前后都包含
df.loc[:'two','b':]
# output
b c
one -0.190464 0.489604
two 0.846016 -1.272735
4 iloc
df
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
four -1.125733 -0.765275 1.492720
按位取值,返回行
df.iloc[1] # 按位取值
# output
a 0.577296
b 0.846016
c -1.272735
Name: two, dtype: float64
切片返回多行多列
df.iloc[1:]
# output
a b c
two 0.577296 0.846016 -1.272735
three 0.297807 1.177698 -0.182066
four -1.125733 -0.765275 1.492720
切片返回多行单列(结果是Series)
df.iloc[1:, 0]
# output
two 0.577296
three 0.297807
four -1.125733
Name: a, dtype: float64
切片返回多行多列,前包后不包
df.iloc[:2, 1:] # 按位取值,不包含最后一位
# output
b c
one -0.190464 0.489604
two 0.846016 -1.272735
切片返回多行多列
df.iloc[:2, [0,2]]
# output
a c
one -0.327567 0.489604
two 0.577296 -1.272735
返回单个值
df.iloc[2,1]
# output
1.1776978729086762
单个值赋值
df.iloc[2,1]=1000
df
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1000.000000 -0.182066
four -1.125733 -0.765275 1.492720
5 添加和删除行和列
df
# output
a b c
one -0.327567 -0.190464 0.489604
two 0.577296 0.846016 -1.272735
three 0.297807 1000.000000 -0.182066
four -1.125733 -0.765275 1.492720
筛选,返回boolean值
df>0
# output
a b c
one False False True
two True True False
three True True False
four False False True
筛选
df[df>0]
# output
a b c
one NaN NaN 0.489604
two 0.577296 0.846016 NaN
three 0.297807 1000.000000 NaN
four NaN NaN 1.492720
筛选赋值
df[df<0] = 0
df
# output
a b c
one 0.000000 0.000000 0.489604
two 0.577296 0.846016 0.000000
three 0.297807 1000.000000 0.000000
four 0.000000 0.000000 1.492720
按列筛选
df[df.b > 1]
# output
a b c
three 0.297807 1000.0 0.0
添加列
df
# output
a b c
one 0.000000 0.000000 0.489604
two 0.577296 0.846016 0.000000
three 0.297807 1000.000000 0.000000
four 0.000000 0.000000 1.492720
df['d'] = 4 # 添加列
df['e'] = np.arange(4)
df
# output
a b c d e
one 0.000000 0.000000 0.489604 4 0
two 0.577296 0.846016 0.000000 4 1
three 0.297807 1000.000000 0.000000 4 2
four 0.000000 0.000000 1.492720 4 3
添加列
df['f'] = pd.Series([2,3,4,5], index=['one', 'three', 'four', 'five'])
df
# output
a b c d e f
one 0.000000 0.000000 0.489604 4 0 2.0
two 0.577296 0.846016 0.000000 4 1 NaN
three 0.297807 1000.000000 0.000000 4 2 3.0
four 0.000000 0.000000 1.492720 4 3 4.0
删除列
del df['e'] # 删除列
df
# output
a b c d f
one 0.000000 0.000000 0.489604 4 2.0
two 0.577296 0.846016 0.000000 4 NaN
three 0.297807 1000.000000 0.000000 4 3.0
four 0.000000 0.000000 1.492720 4 4.0
这样不会添加
df.h = 5 # 并不会添加,要使用中括号,正式的列名
df
# output
a b c d f
one 0.000000 0.000000 0.489604 4 2.0
two 0.577296 0.846016 0.000000 4 NaN
three 0.297807 1000.000000 0.000000 4 3.0
four 0.000000 0.000000 1.492720 4 4.0
添加行
df.loc['five'] = 10 # 添加行
df
# output
a b c d f
one 0.000000 0.000000 0.489604 4 2.0
two 0.577296 0.846016 0.000000 4 NaN
three 0.297807 1000.000000 0.000000 4 3.0
four 0.000000 0.000000 1.492720 4 4.0
five 10.000000 10.000000 10.000000 10 10.0
删除行
df.drop('one') # 删除行
# output
a b c d f
two 0.577296 0.846016 0.00000 4 NaN
three 0.297807 1000.000000 0.00000 4 3.0
four 0.000000 0.000000 1.49272 4 4.0
five 10.000000 10.000000 10.00000 10 10.0
删除要赋值回去
df = df.drop('one')
df
# output
a b c d f
two 0.577296 0.846016 0.00000 4 NaN
three 0.297807 1000.000000 0.00000 4 3.0
four 0.000000 0.000000 1.49272 4 4.0
five 10.000000 10.000000 10.00000 10 10.0
删除多行
df = df.drop(['four','five'])
取多列,效果和删除多列相同
df[['a','b']]
# output
a b
two 0.577296 0.846016
three 0.297807 1000.000000
删除多列
df.drop(columns=['c','f']) # 删除列
# output
a b d
two 0.577296 0.846016 4
three 0.297807 1000.000000 4
但是不赋值回去不会删除成功
df
# output
a b c d f
two 0.577296 0.846016 0.0 4 NaN
three 0.297807 1000.000000 0.0 4 3.0
6 排序与数据对齐
num = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
data = pd.DataFrame(np.random.randn(7,4), index=num, columns=list('abcd'))
data
# output
a b c d
one 0.695007 0.712372 0.739108 -0.932024
two -1.524550 0.945240 1.424037 -0.262905
three 0.110267 0.872907 -1.592445 -0.063497
four -1.507138 0.020152 -0.145890 0.673170
five -0.398889 1.113829 0.503900 0.288276
six -0.852355 0.654533 1.220746 2.075212
seven -0.551279 1.460979 -1.770640 -0.379884
data.shape
# output
(7, 4)
np.shape(data)
# output
(7, 4)
转置
data.T
# output
one two three four five six seven
a 0.695007 -1.524550 0.110267 -1.507138 -0.398889 -0.852355 -0.551279
b 0.712372 0.945240 0.872907 0.020152 1.113829 0.654533 1.460979
c 0.739108 1.424037 -1.592445 -0.145890 0.503900 1.220746 -1.770640
d -0.932024 -0.262905 -0.063497 0.673170 0.288276 2.075212 -0.379884
data.T.shape
# output
(4, 7)
排序
data.sort_index(axis=1, ascending=False) # 默认升序,ascending=False 降序
# output
d c b a
one -0.932024 0.739108 0.712372 0.695007
two -0.262905 1.424037 0.945240 -1.524550
three -0.063497 -1.592445 0.872907 0.110267
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
six 2.075212 1.220746 0.654533 -0.852355
seven -0.379884 -1.770640 1.460979 -0.551279
不赋值回去不会覆盖原dataframe
data
# output
a b c d
one 0.695007 0.712372 0.739108 -0.932024
two -1.524550 0.945240 1.424037 -0.262905
three 0.110267 0.872907 -1.592445 -0.063497
four -1.507138 0.020152 -0.145890 0.673170
five -0.398889 1.113829 0.503900 0.288276
six -0.852355 0.654533 1.220746 2.075212
seven -0.551279 1.460979 -1.770640 -0.379884
使用 inplace=True 立即替换原DataFrame,完成排序
data.sort_index(axis=1, ascending=False, inplace=True)
data
# output
d c b a
one -0.932024 0.739108 0.712372 0.695007
two -0.262905 1.424037 0.945240 -1.524550
three -0.063497 -1.592445 0.872907 0.110267
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
six 2.075212 1.220746 0.654533 -0.852355
seven -0.379884 -1.770640 1.460979 -0.551279
使用 axis=0 按行索引排序
data.sort_index(axis=0, ascending=False)
# output
d c b a
two -0.262905 1.424037 0.945240 -1.524550
three -0.063497 -1.592445 0.872907 0.110267
six 2.075212 1.220746 0.654533 -0.852355
seven -0.379884 -1.770640 1.460979 -0.551279
one -0.932024 0.739108 0.712372 0.695007
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
根据'c'列,按axis=0排序
data.sort_values(by='c')
# output
d c b a
seven -0.379884 -1.770640 1.460979 -0.551279
three -0.063497 -1.592445 0.872907 0.110267
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
one -0.932024 0.739108 0.712372 0.695007
six 2.075212 1.220746 0.654533 -0.852355
two -0.262905 1.424037 0.945240 -1.524550
7 索引重新排序
data
# output
d c b a
one -0.932024 0.739108 0.712372 0.695007
two -0.262905 1.424037 0.945240 -1.524550
three -0.063497 -1.592445 0.872907 0.110267
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
six 2.075212 1.220746 0.654533 -0.852355
seven -0.379884 -1.770640 1.460979 -0.551279
按列重新排序索引
data.reindex(columns=['b','c','a','e','d'])
# output
b c a e d
one 0.712372 0.739108 0.695007 NaN -0.932024
two 0.945240 1.424037 -1.524550 NaN -0.262905
three 0.872907 -1.592445 0.110267 NaN -0.063497
four 0.020152 -0.145890 -1.507138 NaN 0.673170
five 1.113829 0.503900 -0.398889 NaN 0.288276
six 0.654533 1.220746 -0.852355 NaN 2.075212
seven 1.460979 -1.770640 -0.551279 NaN -0.379884
data.reindex(columns=['b','c','e','d'])
# output
b c e d
one 0.712372 0.739108 NaN -0.932024
two 0.945240 1.424037 NaN -0.262905
three 0.872907 -1.592445 NaN -0.063497
four 0.020152 -0.145890 NaN 0.673170
five 1.113829 0.503900 NaN 0.288276
six 0.654533 1.220746 NaN 2.075212
seven 1.460979 -1.770640 NaN -0.379884
8 同时行索引和列索引对齐
data1 = data.reindex(columns=['b','c','e','d'], index=['one', 'two', 'three', 'four', 'six', 'seven'])
data1
# output
b c e d
one 0.712372 0.739108 NaN -0.932024
two 0.945240 1.424037 NaN -0.262905
three 0.872907 -1.592445 NaN -0.063497
four 0.020152 -0.145890 NaN 0.673170
six 0.654533 1.220746 NaN 2.075212
seven 1.460979 -1.770640 NaN -0.379884
data
# output
d c b a
one -0.932024 0.739108 0.712372 0.695007
two -0.262905 1.424037 0.945240 -1.524550
three -0.063497 -1.592445 0.872907 0.110267
four 0.673170 -0.145890 0.020152 -1.507138
five 0.288276 0.503900 1.113829 -0.398889
six 2.075212 1.220746 0.654533 -0.852355
seven -0.379884 -1.770640 1.460979 -0.551279
dataframe相加
data2 = data + data1
data2
# output
a b c d e
five NaN NaN NaN NaN NaN
four NaN 0.040303 -0.291781 1.346339 NaN
one NaN 1.424744 1.478216 -1.864048 NaN
seven NaN 2.921959 -3.541281 -0.759769 NaN
six NaN 1.309065 2.441493 4.150424 NaN
three NaN 1.745814 -3.184889 -0.126993 NaN
two NaN 1.890480 2.848073 -0.525810 NaN
求平均值,对axis=1
np.mean(data2, axis=1)
# output
five NaN
four 0.364954
one 0.346304
seven -0.459697
six 2.633661
three -0.522023
two 1.404248
dtype: float64
data2.mean(1)
# output
five NaN
four 0.364954
one 0.346304
seven -0.459697
six 2.633661
three -0.522023
two 1.404248
dtype: float64
求和,对axis=1
np.sum(data2, axis=1)
# output
five 0.000000
four 1.094862
one 1.038912
seven -1.379091
six 7.900982
three -1.566068
two 4.212743
dtype: float64
data2.sum(1)
# output
five 0.000000
four 1.094862
one 1.038912
seven -1.379091
six 7.900982
three -1.566068
two 4.212743
dtype: float64
求最大值
data2.max(1)
# output
five NaN
four 1.346339
one 1.478216
seven 2.921959
six 4.150424
three 1.745814
two 2.848073
dtype: float64