【Pandas】1.3.汇总统计方法与应用函数

LouHerGetUp

已于 2023-11-29 08:45:19 修改

阅读量418

点赞数 14

分类专栏：机器学习 文章标签：pandas

于 2023-11-26 22:50:37 首次发布

本文链接：https://blog.csdn.net/CSDNLHCC/article/details/134634459

版权

机器学习专栏收录该内容

48 篇文章 0 订阅

订阅专栏

包含全部示例的代码仓库见GitHub

1 导入库

import pandas as pd
import numpy as np

2 统计方法

2.1 示例1

新建DataFrame

data = pd.DataFrame(np.random.randn(9,6), columns=list('abcdef'))
data
# output
       a	b	c	d	e	f
0	-1.275374	-1.716434	0.378963	-0.570430	-0.838515	-0.580085
1	-0.379262	-0.194995	1.712992	0.647108	0.716119	1.556584
2	-0.560927	-1.532025	-0.477870	0.543167	1.043725	-0.748711
3	1.628004	-0.179523	1.089616	1.102608	0.333088	-0.942939
4	-1.625205	-0.379636	1.757878	0.963458	0.576844	1.185927
5	-1.506126	-0.228909	0.136160	-0.638295	-0.210192	0.556669
6	-0.612531	-1.607560	0.165254	-1.736242	-1.267426	0.397543
7	-0.051339	-0.155817	1.433560	-0.135979	-0.486893	0.890941
8	0.964277	1.297130	-0.521509	-0.057722	-0.240441	-0.036027

显示信息

data.info()
# output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       9 non-null      float64
 1   b       9 non-null      float64
 2   c       9 non-null      float64
 3   d       9 non-null      float64
 4   e       9 non-null      float64
 5   f       9 non-null      float64
dtypes: float64(6)
memory usage: 560.0 bytes

将行索引为2、c列的元素（data.iloc[2,2]）修改为np.nan，制造一个缺失值

data.iloc[2,2] = np.nan
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       9 non-null      float64
 1   b       9 non-null      float64
 2   c       8 non-null      float64
 3   d       9 non-null      float64
 4   e       9 non-null      float64
 5   f       9 non-null      float64
dtypes: float64(6)
memory usage: 560.0 bytes

输出一些统计指标

data.describe()
# output
           a	b	c	d	e	f
count	9.000000	9.000000	8.000000	9.000000	9.000000	9.000000
mean	-0.379832	-0.521974	0.769114	0.013075	-0.041521	0.253322
std	1.098543	0.961703	0.844814	0.910042	0.764763	0.887484
min	-1.625205	-1.716434	-0.521509	-1.736242	-1.267426	-0.942939
25%	-1.275374	-1.532025	0.157980	-0.570430	-0.486893	-0.580085
50%	-0.560927	-0.228909	0.734290	-0.057722	-0.210192	0.397543
75%	-0.051339	-0.179523	1.503418	0.647108	0.576844	0.890941
max	1.628004	1.297130	1.757878	1.102608	1.043725	1.556584

按列求和（axis=0，默认跳过NaN）

data.sum(0)  
# output
a   -3.418484
b   -4.697769
c    6.152914
d    0.117672
e   -0.373692
f    2.279902
dtype: float64

返回每列最大值所在的行索引

data.idxmax(0)  # 返回最大值索引
# output
a    3
b    8
c    4
d    3
e    2
f    1
dtype: int64

data.a.idxmax()
# output
3

返回a列的最大值

data.a[data.a.idxmax()]  # 返回最大值
# output
1.6280042571252895

2.2 示例2

新建DataFrame

data = pd.DataFrame(np.random.randint(1,10,size=(5,7)))  # 取值范围1到9（randint不包含上界10）
data
# output
    0	1	2	3	4	5	6
0	4	1	7	8	7	3	6
1	1	5	4	5	6	6	9
2	4	3	7	5	3	8	3
3	9	7	5	5	6	1	6
4	7	7	1	8	1	7	8

返回包含的唯一值（去重后的取值）

np.unique(data)
# output
array([1, 3, 4, 5, 6, 7, 8, 9])

data.iloc[:,-1]
# output
0    6
1    9
2    3
3    6
4    8
Name: 6, dtype: int32

data.iloc[:,-1].unique()
# output
array([6, 9, 3, 8])

data.iloc[2].unique()
# output
array([4, 3, 7, 5, 8])

统计每个值出现的次数

data.iloc[:,-2].value_counts()   # 统计每个值出现的次数
# output
3    1
6    1
8    1
1    1
7    1
Name: 5, dtype: int64

s = pd.Series(['a', 'b', 'b', 'b', 'b', 'b', 'a', 'c'])
s.value_counts()
# output
b    5
a    2
c    1
dtype: int64

3 应用函数 apply

3.1 成员关系判断

s = pd.Series(['a', 'b', 'b', 'b', 'b', 'b', 'a', 'c'])
s.isin(['a','c'])
# output
0     True
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

apply函数，求每列最大值减最小值

data = pd.DataFrame(np.random.randint(1,10,size=(5,7)))  # 取值范围1到9（randint不包含上界10）
data
# output
    0	1	2	3	4	5	6
0	4	1	7	8	7	3	6
1	1	5	4	5	6	6	9
2	4	3	7	5	3	8	3
3	9	7	5	5	6	1	6
4	7	7	1	8	1	7	8

data.apply(lambda x: x.max()-x.min(), axis=0)
# output
0    8
1    6
2    6
3    3
4    6
5    7
6    6
dtype: int32

修改列名

data.columns = list("abcdefg")
data
# output
	a	b	c	d	e	f	g
0	4	1	7	8	7	3	6
1	1	5	4	5	6	6	9
2	4	3	7	5	3	8	3
3	9	7	5	5	6	1	6
4	7	7	1	8	1	7	8

data.a.apply(lambda x: x+10)
# output
0    14
1    11
2    14
3    19
4    17
Name: a, dtype: int64

3.2 元素级应用函数

applymap：对DataFrame的每个元素应用函数（注意：pandas 2.1起applymap已弃用，推荐改用DataFrame.map）

data.applymap(lambda x: x**2 + x + 3)  #和广播效果类似
# output
    a	b	c	d	e	f	g
0	23	5	59	75	59	15	45
1	5	33	23	33	45	45	93
2	23	15	59	33	15	75	15
3	93	59	33	33	45	5	45
4	59	59	5	75	5	59	75

赋值

data['g'] = ['ss','fge','sgs','gega','gas']
data
# output
    a	b	c	d	e	f	g
0	4	1	7	8	7	3	ss
1	1	5	4	5	6	6	fge
2	4	3	7	5	3	8	sgs
3	9	7	5	5	6	1	gega
4	7	7	1	8	1	7	gas

对列进行apply

data['g'] = data.g.apply(lambda x: x.title())  # g列字符串首字母大写
data
# output
    a	b	c	d	e	f	g
0	4	1	7	8	7	3	Ss
1	1	5	4	5	6	6	Fge
2	4	3	7	5	3	8	Sgs
3	9	7	5	5	6	1	Gega
4	7	7	1	8	1	7	Gas