包含全部示例的代码仓库见GitHub
1 导入库
import pandas as pd
import numpy as np
2 统计方法
2.1 示例1
新建DataFrame
data = pd.DataFrame(np.random.randn(9,6), columns=list('abcdef'))
data
# output
a b c d e f
0 -1.275374 -1.716434 0.378963 -0.570430 -0.838515 -0.580085
1 -0.379262 -0.194995 1.712992 0.647108 0.716119 1.556584
2 -0.560927 -1.532025 -0.477870 0.543167 1.043725 -0.748711
3 1.628004 -0.179523 1.089616 1.102608 0.333088 -0.942939
4 -1.625205 -0.379636 1.757878 0.963458 0.576844 1.185927
5 -1.506126 -0.228909 0.136160 -0.638295 -0.210192 0.556669
6 -0.612531 -1.607560 0.165254 -1.736242 -1.267426 0.397543
7 -0.051339 -0.155817 1.433560 -0.135979 -0.486893 0.890941
8 0.964277 1.297130 -0.521509 -0.057722 -0.240441 -0.036027
显示信息
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 9 non-null float64
1 b 9 non-null float64
2 c 9 non-null float64
3 d 9 non-null float64
4 e 9 non-null float64
5 f 9 non-null float64
dtypes: float64(6)
memory usage: 560.0 bytes
将第3行、c列(位置[2,2])的值修改为np.nan
data.iloc[2,2] = np.nan
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 9 non-null float64
1 b 9 non-null float64
2 c 8 non-null float64
3 d 9 non-null float64
4 e 9 non-null float64
5 f 9 non-null float64
dtypes: float64(6)
memory usage: 560.0 bytes
输出一些统计指标
data.describe()
# output
a b c d e f
count 9.000000 9.000000 8.000000 9.000000 9.000000 9.000000
mean -0.379832 -0.521974 0.769114 0.013075 -0.041521 0.253322
std 1.098543 0.961703 0.844814 0.910042 0.764763 0.887484
min -1.625205 -1.716434 -0.521509 -1.736242 -1.267426 -0.942939
25% -1.275374 -1.532025 0.157980 -0.570430 -0.486893 -0.580085
50% -0.560927 -0.228909 0.734290 -0.057722 -0.210192 0.397543
75% -0.051339 -0.179523 1.503418 0.647108 0.576844 0.890941
max 1.628004 1.297130 1.757878 1.102608 1.043725 1.556584
按列求和
data.sum(0)
# output
a -3.418484
b -4.697769
c 6.152914
d 0.117672
e -0.373692
f 2.279902
dtype: float64
返回每列最大值所在的行索引
data.idxmax(0) # 返回最大值索引
# output
a 3
b 8
c 4
d 3
e 2
f 1
dtype: int64
data.a.idxmax()
# output
3
返回a列的最大值
data.a[data.a.idxmax()] # 返回最大值
# output
1.6280042571252895
2.2 示例2
新建DataFrame
data = pd.DataFrame(np.random.randint(1,10,size=(5,7))) # integers from 1 to 9; randint's upper bound (10) is exclusive
data
# output
0 1 2 3 4 5 6
0 4 1 7 8 7 3 6
1 1 5 4 5 6 6 9
2 4 3 7 5 3 8 3
3 9 7 5 5 6 1 6
4 7 7 1 8 1 7 8
返回包含的唯一值(去重后)
np.unique(data)
# output
array([1, 3, 4, 5, 6, 7, 8, 9])
data.iloc[:,-1]
# output
0 6
1 9
2 3
3 6
4 8
Name: 6, dtype: int32
data.iloc[:,-1].unique()
# output
array([6, 9, 3, 8])
data.iloc[2].unique()
# output
array([4, 3, 7, 5, 8])
统计每个值出现的次数
data.iloc[:,-2].value_counts() # 统计每个值出现的次数
# output
3 1
6 1
8 1
1 1
7 1
Name: 5, dtype: int64
s = pd.Series(['a', 'b', 'b', 'b', 'b', 'b', 'a', 'c'])
s.value_counts()
# output
b 5
a 2
c 1
dtype: int64
3 应用函数 apply
3.1 成员关系判断
s = pd.Series(['a', 'b', 'b', 'b', 'b', 'b', 'a', 'c'])
s.isin(['a','c'])
# output
0 True
1 False
2 False
3 False
4 False
5 False
6 True
7 True
dtype: bool
apply函数,求每列最大值减最小值
data = pd.DataFrame(np.random.randint(1,10,size=(5,7))) # integers from 1 to 9; randint's upper bound (10) is exclusive
data
# output
0 1 2 3 4 5 6
0 4 1 7 8 7 3 6
1 1 5 4 5 6 6 9
2 4 3 7 5 3 8 3
3 9 7 5 5 6 1 6
4 7 7 1 8 1 7 8
data.apply(lambda x: x.max()-x.min(), axis=0)
# output
0 8
1 6
2 6
3 3
4 6
5 7
6 6
dtype: int32
修改列名
data.columns = list("abcdefg")
data
# output
a b c d e f g
0 4 1 7 8 7 3 6
1 1 5 4 5 6 6 9
2 4 3 7 5 3 8 3
3 9 7 5 5 6 1 6
4 7 7 1 8 1 7 8
data.a.apply(lambda x: x+10)
# output
0 14
1 11
2 14
3 19
4 17
Name: a, dtype: int64
3.2 元素级应用函数
applymap:对每个元素应用函数
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favor of DataFrame.map.
data.applymap(lambda x: x**2 + x + 3) # applied element-wise; similar in effect to broadcasting
# output
a b c d e f g
0 23 5 59 75 59 15 45
1 5 33 23 33 45 45 93
2 23 15 59 33 15 75 15
3 93 59 33 33 45 5 45
4 59 59 5 75 5 59 75
赋值
data['g'] = ['ss','fge','sgs','gega','gas']
data
# output
a b c d e f g
0 4 1 7 8 7 3 ss
1 1 5 4 5 6 6 fge
2 4 3 7 5 3 8 sgs
3 9 7 5 5 6 1 gega
4 7 7 1 8 1 7 gas
对列进行apply
data['g'] = data.g.apply(lambda x: x.title()) # g列字符串首字母大写
data
# output
a b c d e f g
0 4 1 7 8 7 3 Ss
1 1 5 4 5 6 6 Fge
2 4 3 7 5 3 8 Sgs
3 9 7 5 5 6 1 Gega
4 7 7 1 8 1 7 Gas