目录
pandas主要提供了3种数据结构:1)Series,带标签的一维数组;2)DataFrame,带标签且大小可变的二维表格结构;3)Panel,带标签且大小可变的三维数组。
生成一维数组
python中关于数据结构Series的讲解_C.DLording的博客-CSDN博客_series在python
>>> import numpy as np
>>> import pandas as pd
>>> x = pd.Series([1, 3, 5, np.nan]) # np.nan表示空值、缺失值
>>> x
0 1.0
1 3.0
2 5.0
3 NaN
dtype: float64
>>> dates = pd.date_range(start='20200101', end='20201231', freq='M') # 间隔为月,每月最后一天
生成DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
dates = pd.date_range(start='20200101', end='20201231', freq='M')
dataframe=pd.DataFrame(np.random.randn(12,4), index=dates, columns=list('ABCD'))
print(dataframe)
>>> pd.DataFrame([np.random.randint(1, 100, 4) for i in range(12)],
index=dates, columns=list('ABCD')) # 4列随机数
二维数据查看
>>> df.head() # 默认显示前5行
A B C D E F
zhang 20 2020-01-01 1.0 3 test foo
li 26 2020-01-02 2.0 3 train foo
zhou 63 2020-01-03 3.0 3 test foo
wang 69 2020-01-04 4.0 3 train foo
>>> df.head(3) # 查看前3行
A B C D E F
zhang 20 2020-01-01 1.0 3 test foo
li 26 2020-01-02 2.0 3 train foo
zhou 63 2020-01-03 3.0 3 test foo
>>> df.tail(2) # 查看最后2行
A B C D E F
zhou 63 2020-01-03 3.0 3 test foo
wang 69 2020-01-04 4.0 3 train foo
查看二维数据的索引、列名和数据
print(df.index)
print(df.columns)
print(df.values)
查看数据的统计信息
>>> df.describe() # 平均值、标准差、最小值、最大值等信息
二维数据转置
>>> df.T
排序
数据选择
注:这个地方选的是一个字段名 不是表格内的具体数值
>>> df['A'] # 选择列
zhang 20
li 26
zhou 63
wang 69
Name: A, dtype: int32
>>> 69 in df['A'] # df['A']是一个Series对象
False
>>> 69 in df['A'].values
True
>>> df[0:2] # 使用切片选择多行
A B C D E F
zhang 20 2020-01-01 1.0 3 test foo
li 26 2020-01-02 2.0 3 train foo
>>> df.loc[:, ['A', 'C']] # 选择多列,等价于df[['A', 'C']]
A C
zhang 20 1.0
li 26 2.0
zhou 63 3.0
wang 69 4.0
>>> df.loc[['zhang', 'zhou'], ['A', 'D', 'E']]
# 同时指定多行与多列进行选择
A D E
zhang 20 3 test
zhou 63 3 test
>>> df.loc['zhang', ['A', 'D', 'E']]
A 20
D 3
E test
Name: zhang, dtype: object
>>> df.at['zhang', 'A'] # 查询指定行、列位置的数据值
20
>>> df.at['zhang', 'D']
3
>>> df.iloc[3] # 查询第3行数据
A 69
B 2020-01-04 00:00:00
C 4
D 3
E train
F foo
Name: wang, dtype: object
>>> df.iloc[0:3, 0:4] # 查询前3行、前4列数据
A B C D
zhang 20 2020-01-01 1.0 3
li 26 2020-01-02 2.0 3
zhou 63 2020-01-03 3.0 3
>>> df.iloc[[0, 2, 3], [0, 4]] # 查询指定的多行、多列数据
A E
zhang 20 test
zhou 63 test
wang 69 train
>>> df.iloc[0,1] # 查询第0行第1列位置的数据值
Timestamp('2020-01-01 00:00:00')
>>> df.iloc[2,2] # 查询第2行第2列位置的数据值
3.0
>>> df[df.A>50] # 按给定条件进行查询
A B C D E F
zhou 63 2020-01-03 3.0 3 test foo
wang 69 2020-01-04 4.0 3 train foo
>>> df[df['E']=='test'] # 按给定条件进行查询
A B C D E F
zhang 20 2020-01-01 1.0 3 test foo
zhou 63 2020-01-03 3.0 3 test foo
>>> df[df['A'].isin([20,69])]
A B C D E F
zhang 20 2020-01-01 1.0 3 test foo
wang 69 2020-01-04 4.0 3 train foo
>>> df.nlargest(3, ['C']) # 返回指定列最大的前3行
A B C D E F
wang 69 2020-01-04 4.0 3 train foo
zhou 63 2020-01-03 3.0 3 test foo
li 26 2020-01-02 2.0 3 train foo
>>> df.nlargest(3, ['A'])
A B C D E F
wang 69 2020-01-04 4.0 3 train foo
zhou 63 2020-01-03 3.0 3 test foo
li 26 2020-01-02 2.0 3 train foo
所有求和等于特定值的行
>>> dff = pd.DataFrame({'A':[1,2,3,4], 'B':[10,20,8,40]})
>>> dff
A B
0 1 10
1 2 20
2 3 8
3 4 40
>>> dff[dff.sum(axis=1)==11]
A B
0 1 10
2 3 8
数据修改
>>> df.iat[0, 2] = 3 # 修改指定行、列位置的数据值
>>> df.loc[:, 'D'] = np.random.randint(50, 60, 4)
# 修改某列的值
>>> df['C'] = -df['C'] # 对指定列数据取反
>>> df # 查看修改结果
A B C D E F
zhang 20 2020-01-01 -3.0 53 test foo
li 26 2020-01-02 -2.0 59 train foo
zhou 63 2020-01-03 -3.0 59 test foo
wang 69 2020-01-04 -4.0 50 train foo
>>> from copy import deepcopy
>>> dff = deepcopy(df)
>>> dff
A B C D E F
zhang 20 2020-01-01 -3.0 53 test foo
li 26 2020-01-02 -2.0 59 train foo
zhou 63 2020-01-03 -3.0 59 test foo
wang 69 2020-01-04 -4.0 50 train foo
>>> dff['C'] = dff['C'] ** 2 # 替换列数据
>>> dff
A B C D E F
zhang 20 2020-01-01 9.0 53 test foo
li 26 2020-01-02 4.0 59 train foo
zhou 63 2020-01-03 9.0 59 test foo
wang 69 2020-01-04 16.0 50 train foo
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data.replace(1, 5) # 把所有1替换为5
k1 k2
0 one 5
1 one 5
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.replace([1,2],[5,6]) # 1->5,2->6
k1 k2
0 one 5
1 one 5
2 one 6
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.replace({1:5, 'one':'ONE'}) # 使用字典指定替换关系
k1 k2
0 ONE 5
1 ONE 5
2 ONE 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.drop(5, axis=0) # 删除指定行
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
6 two 4
>>> data.drop(3, inplace=True) # 原地删除
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
4 two 3
5 two 4
6 two 4
>>> data.drop('k1', axis=1) # 删除指定列
k2
0 1
1 1
2 2
4 3
5 4
6 4
>>> data = pd.DataFrame({'age':np.random.randint(20,50,5)})
>>> data
age
0 31
1 27
2 26
3 33
4 37
>>> data['rank'] = data['age'].rank() # 增加一列位次序号
>>> data
age rank
0 31 3.0
1 27 2.0
2 26 1.0
3 33 4.0
4 37 5.0
>>> data = pd.DataFrame({'姓名':['张三','李四','王五','赵六','刘七','孙八'],
'成绩':[86,92,86,60,78,78]})
>>> data
姓名 成绩
0 张三 86
1 李四 92
2 王五 86
3 赵六 60
4 刘七 78
5 孙八 78
>>> data['排名'] = data['成绩'].rank(method='min') # 倒数名次,并列的取最小值
>>> data
姓名 成绩 排名
0 张三 86 4.0
1 李四 92 6.0
2 王五 86 4.0
3 赵六 60 1.0
4 刘七 78 2.0
5 孙八 78 2.0
>>> data['排名'] = data['成绩'].rank(method='min', ascending=False)
>>> data # 正数名次,并列的名次取最小值
姓名 成绩 排名
0 张三 86 2.0
1 李四 92 1.0
2 王五 86 2.0
3 赵六 60 6.0
4 刘七 78 4.0
5 孙八 78 4.0
>>> data['排名'] = data['成绩'].rank(method='max', ascending=False)
>>> data # 正数名次,并列的名次取最大值
姓名 成绩 排名
0 张三 86 3.0
1 李四 92 1.0
2 王五 86 3.0
3 赵六 60 6.0
4 刘七 78 5.0
5 孙八 78 5.0
>>> data['排名'] = data['成绩'].rank(method='max')
>>> data # 倒数名次,并列的名次取最大值
姓名 成绩 排名
0 张三 86 5.0
1 李四 92 6.0
2 王五 86 5.0
3 赵六 60 1.0
4 刘七 78 3.0
5 孙八 78 3.0
>>> data['排名'] = data['成绩'].rank(method='average')
>>> data # 倒数名次,并列的名次取平均值
姓名 成绩 排名
0 张三 86 4.5
1 李四 92 6.0
2 王五 86 4.5
3 赵六 60 1.0
4 刘七 78 2.5
5 孙八 78 2.5
对行求和,增加一列;对列求和,增加一行
>>> dff = pd.DataFrame({'A':[1,2,3,4], 'B':[10,20,8,40]})
>>> dff
A B
0 1 10
1 2 20
2 3 8
3 4 40
>>> dff['ColSum'] = dff.apply(sum, axis=1) # 对行求和,增加1列
>>> dff.loc['RowSum'] = dff.apply(sum, axis=0) # 对列求和,增加1行
>>> dff
A B ColSum
0 1 10 11
1 2 20 22
2 3 8 11
3 4 40 44
RowSum 10 78 88
缺失值处理
>>> df
A B C D E F
zhang 20 2020-01-01 9.0 53 test foo
li 26 2020-01-02 4.0 59 train foo
zhou 63 2020-01-03 9.0 59 test foo
wang 69 2020-01-04 16.0 50 train foo
>>> df1 = df.reindex(columns=list(df.columns) + ['G'])
>>> df1
A B C D E F G
zhang 20 2020-01-01 9.0 53 test foo NaN
li 26 2020-01-02 4.0 59 train foo NaN
zhou 63 2020-01-03 9.0 59 test foo NaN
wang 69 2020-01-04 16.0 50 train foo NaN
>>> df1.iat[0, 6] = 3 # 修改指定位置元素值,该列其他元素为缺失值NaN
>>> df1
A B C D E F G
zhang 20 2020-01-01 9.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo NaN
zhou 63 2020-01-03 9.0 59 test foo NaN
wang 69 2020-01-04 16.0 50 train foo NaN
>>> pd.isnull(df1) # 测试缺失值,返回值为True/False阵列
A B C D E F G
zhang False False False False False False False
li False False False False False False True
zhou False False False False False False True
wang False False False False False False True
>>> df1.dropna() # 返回不包含缺失值的行
A B C D E F G
zhang 20 2020-01-01 9.0 53 test foo 3.0
>>> from copy import deepcopy
>>> df2 = deepcopy(df1)
>>> df1['G'].fillna(5, inplace=True) # 使用指定值填充缺失值
>>> df1
A B C D E F G
zhang 20 2020-01-01 9.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo 5.0
zhou 63 2020-01-03 9.0 59 test foo 5.0
wang 69 2020-01-04 16.0 50 train foo 5.0
>>> df2.iat[2, 5] = np.NaN
>>> df2
A B C D E F G
zhang 20 2020-01-01 1.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo NaN
zhou 63 2020-01-03 9.0 59 test NaN NaN
wang 69 2020-01-04 16.0 50 train foo NaN
>>> df2.dropna(thresh=6) # 返回包含6个有效值以上的数据
A B C D E F G
zhang 20 2020-01-01 1.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo NaN
wang 69 2020-01-04 16.0 50 train foo NaN
>>> df2.iat[3, 6] = 8
>>> df2
A B C D E F G
zhang 20 2020-01-01 1.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo NaN
zhou 63 2020-01-03 9.0 59 test NaN NaN
wang 69 2020-01-04 16.0 50 train foo 8.0
>>> df2.fillna({'F':'foo', 'G':df2['G'].mean()}) # 填充缺失值
A B C D E F G
zhang 20 2020-01-01 1.0 53 test foo 3.0
li 26 2020-01-02 4.0 59 train foo 5.5
zhou 63 2020-01-03 9.0 59 test foo 5.5
wang 69 2020-01-04 16.0 50 train foo 8.0
>>> import numpy as np
>>> import pandas as pd
>>> dft = pd.DataFrame({'a':[1,np.NaN, np.NaN,3]})
>>> dft.fillna(method='pad') # 使用缺失值前最后一个有效值进行填充
a
0 1.0
1 1.0
2 1.0
3 3.0
>>> dft.fillna(method='bfill') # 使用缺失值后第一个有效值往回填充
a
0 1.0
1 3.0
2 3.0
3 3.0
>>> dft.fillna(method='bfill', limit=1) # 只填充一个缺失值
a
0 1.0
1 NaN
2 3.0
3 3.0
重复值处理
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.duplicated() # 检查重复行
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
>>> data.drop_duplicates() # 返回新数组,删除重复行
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
>>> data.drop_duplicates(['k1']) # 删除k1列的重复数据,只保留第一项
k1 k2
0 one 1
3 two 3
>>> data.drop_duplicates(['k1'], keep='last') # 保留最后一项
k1 k2
2 one 2
6 two 4
>>> data = pd.Series([3,3,3,2,1,1,1,0])
>>> data
0 3
1 3
2 3
3 2
4 1
5 1
6 1
7 0
dtype: int64
>>> data.drop_duplicates(keep=False) # 只保留出现一次的数字
3 2
7 0
dtype: int64
异常值处理
拆分与合并/连接
>>> df2 = pd.DataFrame(np.random.randn(10, 4))
>>> df2
0 1 2 3
0 2.064867 -0.888018 0.586441 -0.660901
1 -0.465664 -0.496101 0.249952 0.627771
2 1.974986 1.304449 -0.168889 -0.334622
3 0.715677 2.017427 1.750627 -0.787901
4 -0.370020 -0.878282 0.499584 0.269102
5 0.184308 0.653620 0.117899 -1.186588
6 -0.364170 1.652270 0.234833 0.362925
7 -0.329063 0.356276 1.158202 -1.063800
8 -0.778828 -0.156918 -0.760394 -0.040323
9 -0.391045 -0.374825 -1.016456 0.767481
>>> p1 = df2[:3] # 数据行拆分
>>> p1
0 1 2 3
0 2.064867 -0.888018 0.586441 -0.660901
1 -0.465664 -0.496101 0.249952 0.627771
2 1.974986 1.304449 -0.168889 -0.334622
>>> p2 = df2[3:7]
>>> p3 = df2[7:]
>>> df3 = pd.concat([p1, p2, p3]) # 数据行合并
分组计算
>>> df4 = pd.DataFrame({'A':np.random.randint(1,5,8),
'B':np.random.randint(10,15,8),
'C':np.random.randint(20,30,8),
'D':np.random.randint(80,100,8)})
>>> df4
A B C D
0 1 13 26 81
1 3 14 29 88
2 1 13 28 88
3 2 10 21 90
4 4 14 28 83
5 4 11 24 81
6 2 11 26 99
7 3 13 25 91
>>> df4.groupby('A').sum() # 数据分组计算
B C D
A
1 26 54 169
2 21 47 189
3 27 54 179
4 25 52 164
>>> df4.groupby(by=['A', 'B']).mean()
C D
A B
1 13 27.0 84.5
2 10 21.0 90.0
11 26.0 99.0
3 13 25.0 91.0
14 29.0 88.0
4 11 24.0 81.0
14 28.0 83.0
>>> df4.groupby(by=['A', 'B'], as_index=False).mean()
A B C D
0 1 13 27.0 84.5
1 2 10 21.0 90.0
2 2 11 26.0 99.0
3 3 13 25.0 91.0
4 3 14 29.0 88.0
5 4 11 24.0 81.0
6 4 14 28.0 83.0
>>> df4.groupby(by=['A', 'B']).aggregate({'C':np.mean, 'D':np.min})
# 分组后,C列使用平均值,D列使用最小值
C D
A B
1 13 27 81
2 10 21 90
11 26 99
3 13 25 91
14 29 88
4 11 24 81
14 28 83