Pandas个人最强笔记

导入库

import numpy as np
import pandas as pd  # pandas基于Numpy,升级版
# pandas牛逼,pandas就是神!!
import time
# 如果有警告,想要忽略可以操作
import warnings
warnings.filterwarnings('ignore')

pandas基础

pandas数据结构

Series 一维数据

# Series 是一维的数组,和numpy的数组还不一样,在于series多了索引,默认从0开始
l = np.array([1,2,3,6,9])
s1 = pd.Series(data=l)
display(l,s1)
array([1, 2, 3, 6, 9])



0    1
1    2
2    3
3    6
4    9
dtype: int32
s2 = pd.Series(data = l,index = list('ABCDE'))
s2  # 可以通过index添加行名
A    1
B    2
C    3
D    6
E    9
dtype: int32
s3 = pd.Series(data = {'A':149,'B':130,'C':118,'D':99,'E':66})
s3  # 可以通过字典创建添加列名
A    149
B    130
C    118
D     99
E     66
dtype: int64

DataFrame 多维数据

# Series是一维的,功能比较少
# DataFrame是二维的,多个Series公用索引,组成了DataFrame
df1 = pd.DataFrame(data = np.random.randint(0,151,size=(10,3)),
                  index = list('ABCDEFHIJK'),  # 行索引
                  columns=['python','math','english'], # 列索引
                  dtype = np.float16) 
df1
pythonmathenglish
A120.076.039.0
B10.0139.085.0
C69.014.033.0
D85.097.0144.0
E132.011.035.0
F129.025.0148.0
H45.066.095.0
I148.054.061.0
J56.014.081.0
K55.0137.0104.0
df2 = pd.DataFrame(data={'python':[66,99,128],'math':[88,65,137],'english':[100,121,45]})
df2  # 字典,key作为列索引,不指定index默认从0开始索引,自动索引一样
pythonmathenglish
06688100
19965121
212813745

数据查看

df = pd.DataFrame(data = np.random.randint(0,151,size=(100,3)),
                 columns=['python','math','english'])
df
pythonmathenglish
015011183
13113741
2126129130
39123118
4966822
............
95212978
9634760
9713421111
98108130137
9912710559

100 rows × 3 columns

df.shape  # 查看形状
(100, 3)
df.head(n=3)  # 显示前n个记录
pythonmathenglish
015011183
13113741
2126129130
df.tail(n=5)  # 显示后5个记录
pythonmathenglish
95212978
9634760
9713421111
98108130137
9912710559
df.dtypes  # 查看数据类型
python     int32
math       int32
english    int32
dtype: object
df.info()  # 查看信息:内存、类型、行列数据的详细信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   python   100 non-null    int32
 1   math     100 non-null    int32
 2   english  100 non-null    int32
dtypes: int32(3)
memory usage: 1.3 KB
df.describe()  # 查看描述数据的属性和信息,个数、平均值,标准差,最小值
# 四分位差、中位数、四分之三位差、最大值
pythonmathenglish
count100.00000100.000000100.000000
mean75.4800072.33000078.180000
std41.9464540.50640442.564836
min1.000001.0000003.000000
25%47.5000040.50000042.500000
50%75.0000070.00000081.500000
75%109.00000101.250000113.500000
max150.00000146.000000150.000000
df.values  # 返回的是NumPy数据
array([[150, 111,  83],
       [ 31, 137,  41],
       [126, 129, 130],
       [ 91,  23, 118],
       [ 96,  68,  22],
       [ 46,  12, 108],
       [141, 100,  16],
       [ 87,   6,  20],
       [104, 133, 101],
       [ 73, 108, 134],
       [ 57, 146,  18],
       [ 57, 133,  49],
       [ 33,  70,  28],
       [ 69,  37,  80],
       [118,  30,  44],
       [  7,  43,  37],
       [104,  42,  39],
       [145,  58, 107],
       [121,  17,  92],
       [120,  80,  89],
       [ 66,  69,  18],
       [ 21, 107, 140],
       [ 17,   1, 129],
       [ 83,  65,  95],
       [ 95,  61,  89],
       [ 79, 129,  90],
       [ 97,  54,  91],
       [123,  14,  50],
       [128,  76,  33],
       [  7,  41, 138],
       [ 61, 141, 134],
       [138, 145,  20],
       [ 60,  27, 115],
       [ 77, 139,  66],
       [112,  83, 119],
       [138, 142,  48],
       [ 16,  99,  85],
       [ 55,  21,  90],
       [ 51,  75,  63],
       [123,  34,  39],
       [ 18,  39, 103],
       [129,  73,  57],
       [139,  10, 130],
       [ 22,  79,   6],
       [ 61, 127,  85],
       [ 99,  78, 138],
       [ 59,  60,  12],
       [ 63,  41,  64],
       [ 17,  69, 112],
       [ 86,  42,  80],
       [ 94,  18,  41],
       [ 15,  42,   3],
       [ 90,  57, 139],
       [ 48,  91, 107],
       [ 66,  92,  54],
       [ 70,  81,  31],
       [ 11,   8,  66],
       [ 10,  96,  94],
       [144,  92, 120],
       [ 84,  69,  16],
       [  5,  52,  76],
       [ 41,  74, 140],
       [ 26,  75,  26],
       [ 49, 118,  67],
       [ 59,  72,  61],
       [ 77,  26,  72],
       [122,  52,   7],
       [ 70,  34,  20],
       [117,  16, 131],
       [  1,  89, 144],
       [ 89,  24,  93],
       [  6,  83, 125],
       [119, 117,   9],
       [138,  89, 104],
       [ 65, 110,  29],
       [ 87, 126,  70],
       [ 51, 118,  75],
       [ 29,  82,  43],
       [104, 136, 129],
       [ 22,  65, 150],
       [107, 139, 120],
       [ 68,  95, 122],
       [ 58,  53, 133],
       [117,  70,  87],
       [ 28, 131,   6],
       [104,  68,  95],
       [128,  32, 148],
       [ 96, 138,   7],
       [ 60,  65, 103],
       [ 53,  11, 150],
       [  5,  86,  83],
       [145,  53,  60],
       [ 88,  70, 113],
       [ 88,  32,  59],
       [ 54,  70,  50],
       [ 21,  29,  78],
       [ 34,   7,  60],
       [134,  21, 111],
       [108, 130, 137],
       [127, 105,  59]])
df.columns  # 查看列索引
Index(['python', 'math', 'english'], dtype='object')
df.index  # 查看行索引
RangeIndex(start=0, stop=100, step=1)

数据输入与输出

Csv

df = pd.DataFrame(data = np.random.randint(0,151,size = (100,3)),
                 columns=['Python','Math','English'])
df  # 行索引、列索引
PythonMathEnglish
06240150
1956127
21811056
389159
445402
............
954010055
9614133135
97979591
98457072
991315449

100 rows × 3 columns

df.to_csv('./data.csv',
          sep=',',  # 指定分隔符
         index = True,  # 保存行索引
         header = True)  # 保存列头(列索引)
pd.read_csv('./data.csv', # 加载文件
           index_col = 0   # 指定第一列为行索引
           # ,header = None  # 如果想要指定列头的时候用
           )
            
PythonMathEnglish
06240150
1956127
21811056
389159
445402
............
954010055
9614133135
97979591
98457072
991315449

100 rows × 3 columns

Excel

df
# pip install xlrd -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip install xlwt -i https://pypi.tuna.tsinghua.edu.cn/simple
PythonMathEnglish
06240150
1956127
21811056
389159
445402
............
954010055
9614133135
97979591
98457072
991315449

100 rows × 3 columns

df.to_excel('./data.xlsx')  # 存入excel
pd.read_excel('./data.xlsx',
             index_col = 0)  # 设定第一列作为行索引
PythonMathEnglish
06240150
1956127
21811056
389159
445402
............
954010055
9614133135
97979591
98457072
991315449

100 rows × 3 columns

HDF5

df  # HDF5可以管理非常大而复杂的数据集,而且读取速度非常的快
# pip install tables -i https://pypi.tuna.tsinghua.edu.cn/simple
PythonMathEnglish
06240150
1956127
21811056
389159
445402
............
954010055
9614133135
97979591
98457072
991315449

100 rows × 3 columns

df.to_hdf('./data.h5',
         key = 'score')
df2 = pd.DataFrame(data = np.random.randint(6,100,size=(1000,5)),
                  columns = ['计算机','化工','生物','工程','教师'])
df2
计算机化工生物工程教师
05860643415
12536942555
23315685885
33081985566
48315353733
..................
99596877348
9964042231376
9973533412371
9988072211829
9991977443018

1000 rows × 5 columns

df2.to_hdf('./data.h5',
          key = 'salary')
pd.read_hdf('./data.h5',
           key = 'salary')
计算机化工生物工程教师
05860643415
12536942555
23315685885
33081985566
48315353733
..................
99596877348
9964042231376
9973533412371
9988072211829
9991977443018

1000 rows × 5 columns

SQL

# pip install sqlalchemy -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip install pymysql -i https://pypi.tuna.tsinghua.edu.cn/simple
# https://docs.sqlalchemy.org/en/13/core/engines.html 数据库引擎配置
from sqlalchemy import create_engine  # 数据库引擎,构建和数据库的连接
# PyMySQL
# 类似网页地址
engine = create_engine(
    'mysql+pymysql://root:123456@localhost/cda?charset=utf8'
)
# 'mysql+pymysql://用户名:密码@端口/数据库名称?charset=编码'
df2.to_sql('salary',  # 表名
          con = engine,  # con数据库引擎
          index = False  
          )  # 存入数据
1000
pd.read_sql('select * from salary limit 10',
           con=engine)
计算机化工生物工程教师
05860643415
12536942555
23315685885
33081985566
48315353733
55426439315
65413426238
76810266525
86951487
92998213685

数据获取

获取数据

df = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
                 index=list('ABCDEFGHIJ'),
                 columns=['python','math','en'])
df
pythonmathen
A174109
B785191
C5229121
D11812565
E72104144
F1136918
G8611497
H6421142
I10272118
J1143493
df['python']  # 获取数据series
A     27
B    126
C     91
D     66
E    103
F     53
G     45
H     34
I      2
J     26
Name: python, dtype: int32
df.python  # 在DataFrame中列表示属性,行表示具体的索引
A     27
B    126
C     91
D     66
E    103
F     53
G     45
H     34
I      2
J     26
Name: python, dtype: int32
df[['python','en']]  # 获取多列的数据
pythonen
A2738
B12648
C9137
D667
E10372
F5347
G454
H34116
I21
J2618

标签选择

# 标签,就是行索引 location = loc位置
df.loc['A']  # 通过行索引获取信息
python     27
math      148
en         38
Name: A, dtype: int32
df.loc[['A','B','D']]  # 获取多个行的信息
pythonmathen
A2714838
B12614148
D66167
df.loc['A','python']  # 行列索引,A行python列
27
df.loc[['A','C','F'],'python']  # 获取多个行的python的数据
A    27
C    91
F    53
Name: python, dtype: int32
df.loc['A'::2,['math','en']]
mathen
A14838
C037
E14172
G284
I231
df.loc['A':'D',:]
pythonmathen
A2714838
B12614148
C91037
D66167

位置选择

df.iloc[0]  # 是用数字表示的,其实和loc使用方法完全一样
python     27
math      148
en         38
Name: A, dtype: int32
df.iloc[:4,:2]
pythonmath
A27148
B126141
C910
D6616

布尔索引

cond = df.python > 80 # 将python中大于80的显示出来
df[cond]
pythonmathen
B12614148
C91037
E10314172
df.mean()  # 每个科目的平均值
python    57.3
math      71.7
en        38.8
dtype: float64
df.mean(axis=1) # 每个人的平均值
A     71.000000
B    105.000000
C     42.666667
D     29.666667
E    105.333333
F     55.666667
G     25.666667
H     73.666667
I      8.666667
J     42.000000
dtype: float64
df[df.mean(axis=1) > 75]  # 获取大于75分的人的数据,筛选出来
pythonmathen
B12614148
E10314172
# 筛选python和math都大于70的人
df[(df.python > 70) & (df.math > 70 )]
pythonmathen
B12614148
E10314172
df.index  # 获取所有的学生
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
cond = df.index.isin(['C','E','H','K'])  # 判断数据是否在数组中,返回01
df[cond]
pythonmathen
C91037
E10314172
H3471116

赋值操作

df['python']['A'] = 150  
df  # 修改某个位置的值
pythonmathen
A15014838
B12614148
C91037
D66167
E10314172
F536747
G45284
H3471116
I2231
J268218
df['java'] = np.random.randint(1,151,size=10)
df  # 新增加一列,赋值
pythonmathenjava
A17410988
B78519199
C522912154
D218125165135
E7210414473
F113691853
G18611419722
H642114237
I1027211839
J21434193125
df.loc[['C','D','E'],'math'] = 147
df  # 修改多个人的成绩
pythonmathenjava
A15014838107
B126141486
C9114737133
D661477145
E1031477246
F53674789
G4528495
H347111627
I2231104
J268218117
df[df < 60] = 60
df  # 把小于60的全部更改为60,相当于where条件操作,符合这个条件的值将会修改
pythonmathenjava
A15014860107
B1261416060
C9114760133
D6614760145
E1031477260
F60676089
G60606095
H607111660
I606060104
J608260117
df.iloc[3::3,[0,2]] += 100
df
pythonmathenjava
A17410988
B78519199
C522912154
D318125265135
E7210414473
F113691853
G28611429722
H642114237
I1027211839
J31434293125

数据集成

concat行列合并

df1 = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
                 index=list('ABCDEFGHIJ'),
                 columns=['python','math','en'])
df2 = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
                 index=list('HILKLMNOPQ'),
                 columns=['python','math','en'])
df3 = pd.DataFrame(np.random.randint(0,151,size=(10,2)),
                 index=list('ABCDEFGHIJ'),
                 columns=['java','chinese'])
pd.concat([df1,df2],
         axis=0)  # axis = 0表示行合并,行增加
pythonmathen
A10752125
B464364
C141147118
D5111021
E1356246
F75325
G4329120
H223439
I82920
J321777
H58141110
I1213762
L29140131
K745933
L30297
M3677
N5013512
O139134134
P140115119
Q14114121
pd.concat([df1,df3],
         axis = 1)  # axis = 1表示列合并
pythonmathenjavachinese
A107521251226
B4643647013
C141147118143130
D511102173115
E13562461452
F753255449
G432912061119
H223439105137
I82920114114
J3217778549

数据插入

df1.insert(loc = 1,  # 插入的位置
          column='c++',  # 插入的列名
          value = np.random.randint(0,151,size=10))  # 插入的数据
---------------------------------------------------------

ValueError              Traceback (most recent call last)

Input In [106], in <cell line: 1>()
----> 1 df1.insert(loc = 1,
      2           column='c++',
      3           value = np.random.randint(0,151,size=10))  # 插入一列
      4 df1


File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4428     raise ValueError(
   4429         "Cannot specify 'allow_duplicates=True' when "
   4430         "'self.flags.allows_duplicate_labels' is False."
   4431     )
   4432 if not allow_duplicates and column in self.columns:
   4433     # Should this be a different kind of error??
-> 4434     raise ValueError(f"cannot insert {column}, already exists")
   4435 if not isinstance(loc, int):
   4436     raise TypeError("loc must be int")


ValueError: cannot insert c++, already exists
df1
pythonc++mathen
A1079452125
B46924364
C14172147118
D51711021
E135736246
F7515325
G436029120
H221423439
I8243920
J32941777
pd.concat([df1,df3],
         axis=0)  # 数据合并
pythonc++mathenjavachinese
A107.094.052.0125.0NaNNaN
B46.092.043.064.0NaNNaN
C141.072.0147.0118.0NaNNaN
D51.07.0110.021.0NaNNaN
E135.073.062.046.0NaNNaN
F75.015.03.025.0NaNNaN
G43.060.029.0120.0NaNNaN
H22.0142.034.039.0NaNNaN
I82.043.092.00.0NaNNaN
J32.094.017.077.0NaNNaN
ANaNNaNNaNNaN12.026.0
BNaNNaNNaNNaN70.013.0
CNaNNaNNaNNaN143.0130.0
DNaNNaNNaNNaN73.0115.0
ENaNNaNNaNNaN145.02.0
FNaNNaNNaNNaN54.049.0
GNaNNaNNaNNaN61.0119.0
HNaNNaNNaNNaN105.0137.0
INaNNaNNaNNaN114.0114.0
JNaNNaNNaNNaN85.049.0

Join SQL风格合并

df1 = pd.DataFrame(data={'name':['softpo','brandon','ella','daniel','张三'],
                        'height':[175,180,169,177,168]})  # 身高
df2 = pd.DataFrame(data={'name':['softpo','brandon','ella','daniel','张三'],
                        'weigth':[70,65,704,653,88]})  # 体重
df3 = pd.DataFrame(data={'名字':['softpo','brandon','ella','daniel','张三'],
                        'salary':np.random.randint(20,100,size=5)})  # 薪水
display(df1,df2,df3)
nameheight
0softpo175
1brandon180
2ella169
3daniel177
4张三168
nameweigth
0softpo70
1brandon65
2ella704
3daniel653
4张三88
名字salary
0softpo74
1brandon79
2ella39
3daniel97
4张三62
pd.concat([df1,df2],axis = 1)
nameheightnameweigth
0softpo175softpo70
1brandon180brandon65
2ella169ella704
3daniel177daniel653
4张三168张三88
# 根据共同的属性,合并数据
# df1 和df2 共同属性:name
# 数据库,合并join共同可以相似
pd.merge(df1,df2,how='inner')  # 融合,根据共同的name进行合并,两表合并,外键,默认不填为inner,内合并
nameheightweigth
0softpo17570
1brandon18065
2ella169704
3daniel177653
4张三16888
pd.merge(df1,df2,how='outer')  # 外合并,所有数据都保留,不对应的位置填充空数据
nameheightweigth
0softpo17570
1brandon18065
2ella169704
3daniel177653
4张三16888
pd.merge(df1,df2,how='left')  # 左合并
nameheightweigth
0softpo17570
1brandon18065
2ella169704
3daniel177653
4张三16888
pd.merge(df1,df3,left_on='name',right_on='名字')  # 如果两列头不一样,可以这样
nameheight名字salary
0softpo175softpo74
1brandon180brandon79
2ella169ella39
3daniel177daniel97
4张三168张三62
df4 = pd.DataFrame(data=np.random.randint(0,151,size=(10,3)),
                  columns=['python','math','en'],
                  index=list('ABCDEFGHIJ'))
df4
pythonmathen
A837108
B3448128
C9299132
D11212698
E107583
F7733136
G583912
H6710969
I1152948
J195592
score_mean = df4.mean(axis=1).round(1)  # 平均分保留一位小数
score_mean
A     66.0
B     70.0
C    107.7
D    112.0
E     65.0
F     82.0
G     36.3
H     81.7
I     64.0
J     55.3
dtype: float64
df4.insert(loc=3,column='平均分',value=score_mean)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

Input In [32], in <cell line: 1>()
----> 1 df4.insert(loc=3,column='平均分',value=score_mean)
      2 df4


File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4428     raise ValueError(
   4429         "Cannot specify 'allow_duplicates=True' when "
   4430         "'self.flags.allows_duplicate_labels' is False."
   4431     )
   4432 if not allow_duplicates and column in self.columns:
   4433     # Should this be a different kind of error??
-> 4434     raise ValueError(f"cannot insert {column}, already exists")
   4435 if not isinstance(loc, int):
   4436     raise TypeError("loc must be int")


ValueError: cannot insert 平均分, already exists
df4
pythonmath平均分en
A83766.0108
B344870.0128
C9299107.7132
D112126112.098
E107565.083
F773382.0136
G583936.312
H6710981.769
I1152964.048
J195555.392
df5 = df4.iloc[:,[0,1,3]]
df5
pythonmathen
A837108
B3448128
C9299132
D11212698
E107583
F7733136
G583912
H6710969
I1152948
J195592
score_mean.name='平均分'
score_mean
A     66.0
B     70.0
C    107.7
D    112.0
E     65.0
F     82.0
G     36.3
H     81.7
I     64.0
J     55.3
Name: 平均分, dtype: float64
df5
pythonmathen
A837108
B3448128
C9299132
D11212698
E107583
F7733136
G583912
H6710969
I1152948
J195592
pd.merge(df5,
        score_mean,
        left_index = True,  # 数据合并根据行索引,对应
        right_index = True)  # 右边数据根据行索引,对应
pythonmathen平均分
A83710866.0
B344812870.0
C9299132107.7
D11212698112.0
E10758365.0
F773313682.0
G58391236.3
H671096981.7
I115294864.0
J19559255.3

数据清洗

df = pd.DataFrame(data = {'color':['red','blue','red','green','green','blue',None,np.NaN,'green'],
                         'price':[20,15,20,18,18,22,30,30,22]})
df
colorprice
0red20
1blue15
2red20
3green18
4green18
5blue22
6None30
7NaN30
8green22
# 重复数据删除
df.drop_duplicates()  # 非重复数据,索引7和索引6是重复数据,None == NaN
colorprice
0red20
1blue15
3green18
5blue22
6None30
8green22
df.dropna()  # 空数据过滤
colorprice
0red20
1blue15
2red20
3green18
4green18
5blue22
8green22
df.drop(labels=[2,4,6,8])  # 删除指定的行,默认axis = 0
colorprice
0red20
1blue15
3green18
5blue22
7NaN30
df.drop(labels=['color'],axis = 1)  # 删除指定的列,axis=1表示列
price
020
115
220
318
418
522
630
730
822
df.filter(items=['price'])  # 保留数据 price
price
020
115
220
318
418
522
630
730
822
df
colorprice
0red20
1blue15
2red20
3green18
4green18
5blue22
6None30
7NaN30
8green22
df['size'] = 1024  # 广播
df
colorpricesize
0red201024
1blue151024
2red201024
3green181024
4green181024
5blue221024
6None301024
7NaN301024
8green221024
df.filter(like = 'i')  # 模糊匹配,保留带有i字母的索引
pricesize
0201024
1151024
2201024
3181024
4181024
5221024
6301024
7301024
8221024
df.filter(regex = 'e$')  # 保留,用正则表达式
pricesize
0201024
1151024
2201024
3181024
4181024
5221024
6301024
7301024
8221024
# 异常值过滤
a = np.random.randint(1,200,size = 1000)
a
array([ 95,  17, 106,  14,  77,  43,  35,  82, 169,  84, 135,  61, 107,
        10,  57,  88, 181, 147, 160, 141,  99, 190, 148, 101,  33, 150,
        37,  16,  90, 104,  56, 179,   7, 120, 172, 180, 161,  76, 175,
       137,  33,  52, 157,  62,  49,  46,  49,  78, 110,  97, 188,  87,
       185,  45, 106,  29, 173, 188,  71, 195,  84,  47,  13,  68,  35,
        39,  79,  86,  67, 132,  58,  27,  48, 118, 190,  60,  66, 118,
        36,  76, 109, 130,  75,  38, 190, 158,  45, 153, 136,  20,  30,
       106, 136,  83, 182, 170,  51, 118,  95,  50,  77, 135,  65, 196,
       129, 183, 115,  77,  27, 106,  24,  67,  56, 150, 139, 100,   6,
       160,  36, 184, 191,  19, 189,  15, 163, 140, 123, 151,  86,  39,
        39, 197, 193, 122,  62,  21,  25, 182,  29, 101, 154,  23, 195,
        68, 174, 148, 154, 199,  33, 168,  45, 162,  90, 152, 182,  65,
       199,  51,  83,  75, 197,  72, 159, 123, 152,   4, 175, 187,  77,
        53, 136,  35, 106,  98, 135,  57, 189,  84,   4, 108,  20,  41,
         7,  16,  45, 168, 156, 114,  32, 147,  24, 105, 198,  57,  16,
       192, 128,  37, 156,  53,  44, 142, 160, 198, 154,  75, 109,  22,
        16,  12,  80, 146, 153, 144, 192,  99,  97, 199, 111,  27,  59,
       127, 112, 160, 166, 198,  41, 196, 170,  38, 147,  64, 199,  93,
         1, 152, 150, 132, 181,  57,  56,  58, 196, 171, 163, 150,   8,
       197,  38, 135, 176, 196,  19, 185,  29,   9,  15, 101, 177, 126,
         7, 197,  58,  56, 149, 182, 172,  60, 106, 130, 174, 198, 185,
         6, 124,  53,  95,  92,  57,  78,  18, 192,  93, 190,  38, 111,
        14,  94,  47, 180,  79,  86,  37,  44,  15, 166,  42, 124, 137,
       152, 173, 180, 142,  19, 110,  21,   7,  83,   3, 158,  21, 112,
       183,  65, 193,  84, 171, 106, 100, 118,  43, 183, 142, 143, 188,
       109, 136,  59,  84,  28,  72,   1,  73, 166, 104,  73,  16,  28,
        55,  29,  76,  13, 110,  83, 134,  23, 160, 195, 175,  47,  27,
       148, 100,  96, 169,  64,  45, 162, 113,  52,  45, 170, 127,  91,
       141,  13, 128, 121, 167, 103,  66, 134,  80,  46, 171,  91,  53,
        75,   7, 190,  35,  48, 164, 130, 104,  16,  31, 181,  68, 162,
         1,  95, 174, 163, 183,  78,   1, 103,  12,  41,  50, 125,  49,
         4,  61,  81, 142, 157, 119,  11,  70, 121, 191,  28, 140,  63,
       144, 190,  51, 184, 111,  16,   3, 150,   1,  69,  79, 143,  16,
       136,  90,  27, 107,  78, 123,   4,  18,  36,  80,  32, 103,  19,
        79, 181,  83, 167,  20,  29, 150,  92,  26, 123,  57,  39, 115,
        69,  75,  44,  60,  88,  99, 191, 191,  82, 122, 156, 164,  41,
       150,   3, 184,  15, 104, 144,  77,  49, 157, 129,  29,  56, 188,
       113, 171, 118,  63, 132, 177,  27,   8, 164,  43,  22,  78,  89,
       188, 168, 148, 140, 125, 140,  97,  21, 185,   4,  91, 176,  93,
       165,  81, 124, 144, 137,  86, 133,  45, 193,  86, 158,  91, 137,
       100,  55,  95,  33,  30, 179,   3,  77,  26, 113, 197,  49, 105,
         3, 160, 139, 106,  94,   5,  16, 104, 145,  77,  16,  14,  98,
       137, 196,  53, 105,  54, 157,  66,  23,  11, 197, 140, 138,  61,
       155, 138, 187, 138, 175, 143,  66,  79,  25,  26, 165,  49,  53,
       128,   2,   4, 194,  49,  17,   4,  89, 107,  75,  99, 162, 166,
       173, 122,  87, 159,  75, 107, 143,  85,  41, 108, 180, 120,   3,
        29,  16,   3, 186,  86, 135,  89,  61, 172,  74, 178,  38,  49,
        61,  93,  96, 114, 170, 110, 136,  40,  72, 102, 186, 120, 147,
       158, 128,  71, 110,  59,  87,  99, 171,   3,  55, 163, 181, 188,
        80, 194,  75,  38, 173, 137,  97, 171, 113, 132, 165,  20,  37,
       141, 124,  40,  98,  38,  61, 136,   3,  47,   1, 117, 143, 190,
       162, 181, 115, 181, 100,  92, 180, 165,  26, 183,  65,  43,  64,
       102, 120, 102, 100,  38,   9,  49,  95, 126, 185,  18, 146,  64,
        47, 151, 191, 113,  64, 119,  49,  51,  25,   6,  49, 171,  98,
       112, 134,  15, 135, 144,   7,  69, 113,  89, 190, 179, 129, 182,
         9, 157,  55,  95, 181,  44, 144, 193, 109,   3, 183,  48, 194,
        80,  39, 173, 194, 195,  55, 156, 158,  29,  21, 119,  77, 142,
        97,  41,  50,  14,  70, 197,  43,  93,  51, 190, 105, 130, 162,
        33, 153, 190,  90, 123, 144,  71,  80, 138, 167, 128, 176,  95,
        37, 148, 193, 169, 175, 131, 198, 188,  39,  26, 178,  79,  50,
       190, 148, 156,  56,   3,  10, 112, 136, 135,  16,  54,  16, 105,
       102, 197,  96,  30,  80,  47, 144, 150,   3,  96,  39,  72, 128,
       146,  96, 156,  37,  33, 123, 180, 103,  82,  26, 144,   9,  81,
       144, 163,  53, 128, 133, 160, 104, 135,   6,  92, 188, 112,  57,
       120, 182, 151,  95,  73,  19, 137,  25, 179, 150,  44, 162,  30,
       165, 108,  43,  16,  84, 195,   3,  61,  59,  15,   5, 185, 143,
       131,  28,  61, 133,  51,  93,  83,  51,  39, 132, 195,  93, 152,
        90,  68, 166,  44,  38,  50, 129,  74, 111, 139,  79,  97, 135,
        57, 139, 103,  75,  33, 176,  27, 147, 114,  76,   2,  21,   1,
       187,  58, 191, 139, 171, 146, 189, 184,  28,  74, 173,  57,  78,
       166,  82, 115,  78, 140, 171,  45,  86,  20, 101,  19, 169,   7,
       130, 182, 164,  54, 175, 188,  44,   2,  31, 171, 135, 110, 113,
       152,  45,  98,  82,  48, 158,   4,  31, 193, 123,  69,  48,  49,
       169,   3, 105, 178,  45,  59,  34, 120, 179, 163, 154, 163,  62,
         2,   4,  34,  71, 144,  55,  93, 100, 168, 144,  19, 161, 156,
       115,  76, 126,  48,  92, 137, 129, 142,  83,  90, 192,  76, 162,
        54,  92,  31,  62,  32, 126,  72,  41,  38, 172,  93,  99])
# 异常值,大于800,小于100算作异常
cond = (a <= 800) & (a >= 100)
a[cond]
array([106, 169, 135, 107, 181, 147, 160, 141, 190, 148, 101, 150, 104,
       179, 120, 172, 180, 161, 175, 137, 157, 110, 188, 185, 106, 173,
       188, 195, 132, 118, 190, 118, 109, 130, 190, 158, 153, 136, 106,
       136, 182, 170, 118, 135, 196, 129, 183, 115, 106, 150, 139, 100,
       160, 184, 191, 189, 163, 140, 123, 151, 197, 193, 122, 182, 101,
       154, 195, 174, 148, 154, 199, 168, 162, 152, 182, 199, 197, 159,
       123, 152, 175, 187, 136, 106, 135, 189, 108, 168, 156, 114, 147,
       105, 198, 192, 128, 156, 142, 160, 198, 154, 109, 146, 153, 144,
       192, 199, 111, 127, 112, 160, 166, 198, 196, 170, 147, 199, 152,
       150, 132, 181, 196, 171, 163, 150, 197, 135, 176, 196, 185, 101,
       177, 126, 197, 149, 182, 172, 106, 130, 174, 198, 185, 124, 192,
       190, 111, 180, 166, 124, 137, 152, 173, 180, 142, 110, 158, 112,
       183, 193, 171, 106, 100, 118, 183, 142, 143, 188, 109, 136, 166,
       104, 110, 134, 160, 195, 175, 148, 100, 169, 162, 113, 170, 127,
       141, 128, 121, 167, 103, 134, 171, 190, 164, 130, 104, 181, 162,
       174, 163, 183, 103, 125, 142, 157, 119, 121, 191, 140, 144, 190,
       184, 111, 150, 143, 136, 107, 123, 103, 181, 167, 150, 123, 115,
       191, 191, 122, 156, 164, 150, 184, 104, 144, 157, 129, 188, 113,
       171, 118, 132, 177, 164, 188, 168, 148, 140, 125, 140, 185, 176,
       165, 124, 144, 137, 133, 193, 158, 137, 100, 179, 113, 197, 105,
       160, 139, 106, 104, 145, 137, 196, 105, 157, 197, 140, 138, 155,
       138, 187, 138, 175, 143, 165, 128, 194, 107, 162, 166, 173, 122,
       159, 107, 143, 108, 180, 120, 186, 135, 172, 178, 114, 170, 110,
       136, 102, 186, 120, 147, 158, 128, 110, 171, 163, 181, 188, 194,
       173, 137, 171, 113, 132, 165, 141, 124, 136, 117, 143, 190, 162,
       181, 115, 181, 100, 180, 165, 183, 102, 120, 102, 100, 126, 185,
       146, 151, 191, 113, 119, 171, 112, 134, 135, 144, 113, 190, 179,
       129, 182, 157, 181, 144, 193, 109, 183, 194, 173, 194, 195, 156,
       158, 119, 142, 197, 190, 105, 130, 162, 153, 190, 123, 144, 138,
       167, 128, 176, 148, 193, 169, 175, 131, 198, 188, 178, 190, 148,
       156, 112, 136, 135, 105, 102, 197, 144, 150, 128, 146, 156, 123,
       180, 103, 144, 144, 163, 128, 133, 160, 104, 135, 188, 112, 120,
       182, 151, 137, 179, 150, 162, 165, 108, 195, 185, 143, 131, 133,
       132, 195, 152, 166, 129, 111, 139, 135, 139, 103, 176, 147, 114,
       187, 191, 139, 171, 146, 189, 184, 173, 166, 115, 140, 171, 101,
       169, 130, 182, 164, 175, 188, 171, 135, 110, 113, 152, 158, 193,
       123, 169, 105, 178, 120, 179, 163, 154, 163, 144, 100, 168, 144,
       161, 156, 115, 126, 137, 129, 142, 192, 162, 126, 172])
# 正态分布,平均值为0,标准差为1
b = np.random.randn(100000)
b
array([ 9.30519601e-01, -3.44360662e-01,  6.65514977e-05, ...,
       -2.30123357e+00,  2.17474374e-01,  2.86950376e-01])
cond = np.abs(b) > 3*1  # 这些异常值,找到了
b[cond]
array([-3.113988  , -3.43316674, -3.28413448, -4.18671551,  3.01815086,
       -3.28350433,  3.39229337, -3.8349722 ,  3.16620524, -3.01368522,
       -3.15619888, -3.02706183, -3.20469248, -3.05930093, -3.25541995,
       -3.16289873,  3.16185476, -3.04808409,  3.54344272, -3.2585841 ,
        3.08115957, -3.17921412,  3.30126439, -3.18372694, -3.06651084,
        3.58377405, -3.07022682,  3.16360384, -3.34376458, -3.81579603,
       -3.99645404,  3.58753363,  3.35206183, -3.74457891, -3.33198327,
       -3.33993979, -3.17078523,  3.02754635, -3.17691131, -3.12715306,
       -3.38487724,  3.3550324 ,  3.5759783 ,  3.48937298,  3.36398366,
       -3.53495624, -3.5156431 ,  3.32001882, -3.12741237,  3.02816144,
       -3.13263251,  3.76773434, -3.3346886 ,  3.40044899,  3.29585109,
        3.01097665, -3.06049384,  3.14896826,  3.11488174,  3.15971499,
       -3.63196786,  3.21199513,  3.06839176,  3.06794524,  3.06776471,
        3.24618032,  3.67058793,  3.00866597,  3.66160146, -3.19700386,
       -3.09563525, -3.2564589 ,  3.34902208,  3.33267903,  3.46037542,
        3.3830624 , -3.10813257, -3.27485335, -3.09740389,  3.0953684 ,
        3.53897655, -3.21084279,  3.54120948,  3.15640238, -3.14993374,
        3.03442456, -3.10751423,  3.28260234,  3.19101812,  3.15943105,
        3.25920238, -3.03706885, -3.30509468, -3.6464602 ,  3.26733257,
        3.01876074,  3.01606638,  3.21088046,  3.65200452, -3.08139884,
       -3.07168219, -3.18855195, -3.06266893,  3.01535252, -3.03137453,
        3.12828081,  3.03530021,  3.60066559, -3.10464509,  3.30509352,
        3.37102098,  3.75526796,  3.09714856,  3.08665134, -3.4107222 ,
       -3.26908219, -3.18781645,  3.63833634, -3.008857  ,  3.72641975,
       -3.01805091, -3.11050402, -3.31183682, -3.29071569, -3.19408484,
        3.1183616 ,  3.02554176,  3.21680547, -3.12981011, -3.73713851,
        3.24781689, -3.25066575,  3.13496805,  3.17233319, -3.2934357 ,
        3.10123582, -3.15880969, -3.74877105,  3.09063324,  3.63149354,
       -3.19359058, -3.00209698,  3.15207181, -3.03289971,  3.10833683,
        3.72043919,  3.19074043,  3.25338012, -3.02472805,  3.06074732,
        3.07540501, -3.73763831,  3.45588068, -3.14578358, -3.24339571,
       -3.59522797, -3.53748643, -3.06930718, -3.83786393,  3.23984478,
       -3.17985666, -3.16852988, -3.31971537, -3.2654107 , -3.12220388,
        3.88867571,  3.08592098, -3.53951333, -3.27504561,  3.30145401,
        3.00455928, -3.00767388,  3.00140698,  3.1611202 , -3.28482905,
       -4.00614279, -3.32878319, -3.01924921, -4.09578766,  3.09062415,
        3.02481019, -3.47085658,  3.13053125, -3.12608597, -3.29770516,
       -3.1034331 ,  3.03759591,  3.10319671, -3.23967661, -3.08915652,
       -3.10352916,  3.06012689,  3.40541064, -3.20405873,  3.46436661,
        3.09252034,  3.76867128,  3.08367434, -3.05931862,  3.46965336,
        3.22184177,  3.28171723, -3.21577232,  3.20512585, -3.2546693 ,
        3.18373904, -3.22062146, -3.01107575, -3.33173298, -3.74574667,
       -3.76695662,  3.21289624,  3.46063306, -3.09351324,  3.69547389,
        3.07973201,  3.00144524, -3.97588352,  3.15801451, -3.91660008,
       -3.00728368, -3.31921622, -3.07473275, -3.18439241,  3.04325623,
       -3.11538206,  3.22550755,  3.08726614,  3.1049817 ,  3.04386415,
        3.17139888, -3.33516451,  3.26625014,  3.25838664, -3.13951313,
        3.04181862, -3.01509484, -3.10914859,  3.12164542, -3.44755551,
        3.18598457,  3.96503764, -3.09481686,  3.11598421,  3.05800474,
        3.04178915, -3.0661321 , -3.13813002, -3.23868465, -3.27279316,
       -3.19480029, -3.17663568,  3.06456787, -3.60160737,  3.4677848 ,
        3.21025512, -3.11477108,  3.43266798,  3.21094261, -3.0864423 ,
        3.09832142, -3.00932592,  3.02609391,  3.28907182,  3.93589522,
       -3.25151322,  3.19567292,  3.36226978,  3.57918895,  3.01461819,
        3.3557285 ,  3.33894176, -3.11794324, -3.66518483,  3.06824796,
        3.29518237, -3.28611762, -3.28105128,  3.1483141 ,  3.1262014 ,
        3.27454636, -3.13272194,  3.12399007, -3.52308132,  3.63540754,
        3.29869533,  3.00378091,  3.08858567,  3.6191856 ])

数据转换

轴和元素转换

df = pd.DataFrame(np.random.randint(0,10,size=(10,3)),
                 columns = ['python','tensorflow','keras'],
                 index = list('ABCDEFGHIJ'))
df.T  # 转置
ABCDEFGHIJ
python4474805842
tensorflow9587615170
keras3151777699
# 更改行名或者列名
df.rename(index={'A':'X','K':'Y'},  # 修改行索引
         columns = {'python':'人工智能'},  # 修改列索引
         inplace = False)  # 如果是True则表示替换原数据
人工智能tensorflowkeras
X493
B451
C785
D471
E867
F017
G557
H816
I479
J209
df.replace(5,
           50,  # 把所有的5都替换为50
          inplace = False)  # 如果是True则表示替换元数据
pythontensorflowkeras
A493
B4501
C7850
D471
E867
F017
G50507
H816
I479
J209
df.replace([2,7],
           1024,
          inplace=False)
pythontensorflowkeras
A493
B451
C102485
D410241
E861024
F011024
G551024
H816
I410249
J102409
df.iloc[4,2] = np.NaN  # 这种替换会直接改变原来的数据
df
pythontensorflowkeras
A493.0
B451.0
C785.0
D471.0
E86NaN
F017.0
G557.0
H816.0
I479.0
J209.0
df.replace({0:2048,np.NaN:-100},  # 也可以这么个替换
          inplace = True)
df
pythontensorflowkeras
A493.0
B451.0
C785.0
D471.0
E86-100.0
F204817.0
G557.0
H816.0
I479.0
J220489.0
df.replace({'tensorflow':1024},  # 指定某一列进行数据的替换
           2048)
pythontensorflowkeras
A493.0
B451.0
C785.0
D471.0
E86-100.0
F204817.0
G557.0
H816.0
I479.0
J220489.0

map映射元素转变

# 选择的概率随机
df['人工智能'] = np.random.choice([1024,2048,6],
                              10,
                              p=[1/3,1/3,1/3])  # 概率,和必须为1
df
pythontensorflowkeras人工智能
A493.06
B451.02048
C785.02048
D471.01024
E86-100.02048
F204817.01024
G557.02048
H816.01024
I479.06
J220489.02048
# map只能针对一列,就是Series
# 有一些没有对应,那么返回就是空数据
df['人工智能'].map({1024:3.14,2048:2.718,6:1108})  # 根据字典对数据进行改变
A    1108.000
B       2.718
C       2.718
D       3.140
E       2.718
F       3.140
G       2.718
H       3.140
I    1108.000
J       2.718
Name: 人工智能, dtype: float64
df['keras'].map(lambda x:True if x > 0 else False)  # 匿名函数,如果大于0,返回true,否则返回false
A     True
B     True
C     True
D     True
E    False
F     True
G     True
H     True
I     True
J     True
Name: keras, dtype: bool
def convert(x):
    if x >= 5:
        return True
    else:
        return False
df['level'] = df['tensorflow'].map(convert)  # map映射,映射tensorflow中这一列中每一个数据,传递到方法中
df
pythontensorflowkeras人工智能level
A493.06True
B451.02048True
C785.02048True
D471.01024True
E86-100.02048True
F204817.01024False
G557.02048True
H816.01024False
I479.06True
J220489.02048True

apply映射元素转变

# 即可以操作Series,又可以操作DataFrame
df['人工智能'].apply(lambda x:x+100)  # 针对一列每个数值增加100
A     106
B    2148
C    2148
D    1124
E    2148
F    1124
G    2148
H    1124
I     106
J    2148
Name: 人工智能, dtype: int64
df['level'].apply(lambda x:1 if x else 0)
A    1
B    1
C    1
D    1
E    1
F    0
G    1
H    0
I    1
J    1
Name: level, dtype: int64
df.apply(lambda x:x+1000) # 如果没有索引,那么所有数据都会映射,map就不可以这么映射
pythontensorflowkeras人工智能level
A100410091003.010061001
B100410051001.030481001
C100710081005.030481001
D100410071001.020241001
E10081006900.030481001
F304810011007.020241000
G100510051007.030481001
H100810011006.020241000
I100410071009.010061001
J100230481009.030481001
def convert(x):
    return (x.median(),  # 返回中位数
           x.count(), # 返回计数
           x.min(),
           x.max(),
           x.std()  # 标准差
           ) 
            
df.apply(convert).round(1)  # 默认操作列数据
pythontensorflowkeras人工智能level
04.56.55.51536.01.0
110.010.010.010.010
22.01.0-100.06.0False
32048.02048.09.02048.0True
4646.0645.933.4840.90.421637
df.apply(convert,axis = 1)  # 操作行数据
A        (4.0, 5, True, 9, 3.0495901363953815)
B       (4.0, 5, 1.0, 2048, 914.6653486385062)
C      (7.0, 5, True, 2048, 913.5495060477018)
D       (4.0, 5, 1.0, 1024, 456.5000547645093)
E    (6.0, 5, -100.0, 2048, 926.5164866315116)
F      (7.0, 5, False, 2048, 914.555903157374)
G      (5.0, 5, True, 2048, 913.8835812071469)
H     (6.0, 5, False, 1024, 456.2819303895345)
I      (6.0, 5, True, 9.0, 3.0495901363953815)
J     (9.0, 5, True, 2048, 1119.5491503279345)
dtype: object

transform元素转变

# transform 有变形金刚之意
df = pd.DataFrame(np.random.randint(0,10,size=(10,3)),
                 columns=['python','tensorflow','keras'],
                 index=list('ABCDEFGHIJ'))
df
pythontensorflowkeras
A494
B949
C446
D319
E135
F567
G423
H112
I763
J429
# 此功能和map、apply类似的
df['python'].transform(lambda x:1024 if x > 5 else -1024)
A   -1024
B    1024
C   -1024
D   -1024
E   -1024
F   -1024
G   -1024
H   -1024
I    1024
J   -1024
Name: python, dtype: int64
df['tensorflow'].transform([np.sqrt,  # 平方
                          np.square,  # 开平方
                          np.cumsum]  # 累和
                          )  # 针对一列进行了不同的参数,其实apply也可以实现相同的功能
sqrtsquarecumsum
A3.000000819
B2.0000001613
C2.0000001617
D1.000000118
E1.732051921
F2.4494903627
G1.414214429
H1.000000130
I2.4494903636
J1.414214438
def convert(x):
    if x>5 :
        return True
    else:
        return False
    # 可以针对DataFrame进行计算
df.transform({'python':np.cumsum,
             'tensorflow':np.square,
             'keras':convert})  # 针对多列列进行了不同的操作,其实apply也可以实现相同的功能
pythontensorflowkeras
A481False
B1316True
C1716True
D201True
E219False
F2636True
G304False
H311False
I3836False
J424True

重排随机抽样哑变量

df
pythontensorflowkeras
A494
B949
C446
D319
E135
F567
G423
H112
I763
J429
index = np.random.permutation(10)  # 从0-9打乱顺序,就是打乱顺序的索引
index
array([0, 4, 6, 2, 7, 8, 5, 3, 1, 9])
# 重排,索引打乱
df.take(index)
pythontensorflowkeras
A494
E135
G423
C446
H112
I763
F567
D319
B949
J429
# 从大量的数据中随机抽取数据
df.take(np.random.randint(0,10,size=20))  # 随机抽样20个数据
pythontensorflowkeras
B949
I763
F567
H112
F567
E135
J429
B949
E135
A494
C446
H112
B949
J429
F567
B949
C446
G423
F567
H112
# 哑变量
df2 = pd.DataFrame(data={'key':['a','a','b','b','c','b','c']})
df2
key
0a
1a
2b
3b
4c
5b
6c
# 哑变量,又叫one-hot
# str类型数据,经过哑变量变换就可以使用数字来表示
pd.get_dummies(df2,
              prefix='',
              prefix_sep='')
# 1表示有,0表示没有
abc
0100
1100
2010
3010
4001
5010
6001

数据重塑

df.T  # 行列转置
ABCDEFGHIJ
python4943154174
tensorflow9441362162
keras4969573239
# 多层索引
df2 = pd.DataFrame(np.random.randint(0,10,size=(20,3)),
                  columns=['python','math','en'],
                  index= pd.MultiIndex.from_product([list('ABCDEFGHIJ'),
                                                    ['期中','期末']]))  # 多层索引
df2
pythonmathen
A期中978
期末922
B期中115
期末564
C期中585
期末569
D期中412
期末017
E期中432
期末466
F期中010
期末973
G期中995
期末695
H期中452
期末811
I期中354
期末548
J期中843
期末978
df2.unstack(level = 1)  # 将行索引变成列索引,-1表示最后一层,最后一层就是期中期末
pythonmathen
期中期末期中期末期中期末
A997282
B151654
C558659
D401127
E443626
F091703
G969955
H485121
I355448
J894738
df2.unstack(level = 0)
python...en
ABCDEFGHIJ...ABCDEFGHIJ
期中9154409438...8552205243
期末9550496859...2497635188

2 rows × 30 columns

df2.unstack(level = -1)
pythonmathen
期中期末期中期末期中期末
A997282
B151654
C558659
D401127
E443626
F091703
G969955
H485121
I355448
J894738
df2.stack()  # 列变成了行
A  期中  python    9
       math      7
       en        8
   期末  python    9
       math      2
       en        2
B  期中  python    1
       math      1
       en        5
   期末  python    5
       math      6
       en        4
C  期中  python    5
       math      8
       en        5
   期末  python    5
       math      6
       en        9
D  期中  python    4
       math      1
       en        2
   期末  python    0
       math      1
       en        7
E  期中  python    4
       math      3
       en        2
   期末  python    4
       math      6
       en        6
F  期中  python    0
       math      1
       en        0
   期末  python    9
       math      7
       en        3
G  期中  python    9
       math      9
       en        5
   期末  python    6
       math      9
       en        5
H  期中  python    4
       math      5
       en        2
   期末  python    8
       math      1
       en        1
I  期中  python    3
       math      5
       en        4
   期末  python    5
       math      4
       en        8
J  期中  python    8
       math      4
       en        3
   期末  python    9
       math      7
       en        8
dtype: int32
df2.unstack().stack(level = 0)
期中期末
Aen82
math72
python99
Ben54
math16
python15
Cen59
math86
python55
Den27
math11
python40
Een26
math36
python44
Fen03
math17
python09
Gen55
math99
python96
Hen21
math51
python48
Ien48
math54
python35
Jen38
math47
python89
df2.mean(axis = 0)  # 默认计算的是 列
python    5.35
math      4.65
en        4.45
dtype: float64
df2.mean(axis = 1)  # 计算的是行
A  期中    8.000000
   期末    4.333333
B  期中    2.333333
   期末    5.000000
C  期中    6.000000
   期末    6.666667
D  期中    2.333333
   期末    2.666667
E  期中    3.000000
   期末    5.333333
F  期中    0.333333
   期末    6.333333
G  期中    7.666667
   期末    6.666667
H  期中    3.666667
   期末    3.333333
I  期中    4.000000
   期末    5.666667
J  期中    5.000000
   期末    8.000000
dtype: float64
df2.groupby(level=1).mean()  # 计算每个其中期末的平均分
pythonmathen
期中4.74.43.6
期末6.04.95.3
df2.groupby(level=0).mean()  # 每个学生的其中期末的平均分
pythonmathen
A9.04.55.0
B3.03.54.5
C5.07.07.0
D2.01.04.5
E4.04.54.0
F4.54.01.5
G7.59.05.0
H6.03.01.5
I4.04.56.0
J8.55.55.5

pandas高级应用

数学和统计方法

简单统计指标

df =pd.DataFrame(np.random.randint(0,10,size=(20,3)),
               columns=['python','math','en'],
               index=list('QWERTYUIOPLKJHGFDSAZ'))
df
pythonmathen
Q615
W311
E273
R020
T626
Y505
U975
I778
O533
P082
L703
K485
J207
H463
G598
F761
D916
S823
A874
Z371
df.iloc[6,2] = np.NaN
df
pythonmathen
Q615.0
W311.0
E273.0
R020.0
T626.0
Y505.0
U97NaN
I778.0
O533.0
P082.0
L703.0
K485.0
J207.0
H463.0
G598.0
F761.0
D916.0
S823.0
A874.0
Z371.0
df.count()  # 统计非空数据的数量,刚刚给en设置了一个空数据,所以en是19
python    20
math      20
en        19
dtype: int64
df.mean()  # 平均值
python    5.000000
math      4.200000
en        3.894737
dtype: float64
df.median()  # 中位数
python    5.0
math      4.5
en        3.0
dtype: float64
df.max()
python    9.0
math      9.0
en        8.0
dtype: float64
df.min()
python    0.0
math      0.0
en        0.0
dtype: float64
df['python'].unique()  # 去除重复数据
array([6, 3, 2, 0, 5, 9, 7, 4, 8])
df['math'].value_counts()  # 统计出现的频次
7    5
1    3
2    3
0    3
8    2
6    2
3    1
9    1
Name: math, dtype: int64
df.quantile(q=[0,0.25,0.5,0.75,1])  # 获取百分位数
pythonmathen
0.000.00.00.0
0.253.01.02.5
0.505.04.53.0
0.757.07.05.5
1.009.09.08.0
df.describe().round()  # 获取描述
pythonmathen
count20.020.019.0
mean5.04.04.0
std3.03.02.0
min0.00.00.0
25%3.01.02.0
50%5.04.03.0
75%7.07.06.0
max9.09.08.0

索引标签、位置获取

df['python'].argmax()   # 获取python中最大值的索引位置,最小值是一样的方式,max改成min即可
6
df.idxmax()  # 返回最大值的标签,最小值是一样的方式,max改成min即可
python    U
math      G
en        I
dtype: object

更多统计指标

df.cumsum()  # 累加和
pythonmathen
Q615.0
W926.0
E1199.0
R11119.0
T171315.0
Y221320.0
U3120NaN
I382728.0
O433031.0
P433833.0
L503836.0
K544641.0
J564648.0
H605251.0
G656159.0
F726760.0
D816866.0
S897069.0
A977773.0
Z1008474.0
df.cumprod() # 累乘和
pythonmathen
Q615.0
W1815.0
E36715.0
R0140.0
T0280.0
Y000.0
U00NaN
I000.0
O000.0
P000.0
L000.0
K000.0
J000.0
H000.0
G000.0
F000.0
D000.0
S000.0
A000.0
Z000.0
df.cummin()  # 累计最小值
pythonmathen
Q615.0
W311.0
E211.0
R010.0
T010.0
Y000.0
U00NaN
I000.0
O000.0
P000.0
L000.0
K000.0
J000.0
H000.0
G000.0
F000.0
D000.0
S000.0
A000.0
Z000.0
df.std()  # 标准差
python    2.733804
math      3.221637
en        2.401267
dtype: float64
df.var()  # 方差
python     7.473684
math      10.378947
en         5.766082
dtype: float64
df.diff()  # 差分,当前数据减去上一个的差值
pythonmathen
QNaNNaNNaN
W-3.00.0-4.0
E-1.06.02.0
R-2.0-5.0-3.0
T6.00.06.0
Y-1.0-2.0-1.0
U4.07.0NaN
I-2.00.0NaN
O-2.0-4.0-5.0
P-5.05.0-1.0
L7.0-8.01.0
K-3.08.02.0
J-2.0-8.02.0
H2.06.0-4.0
G1.03.05.0
F2.0-3.0-7.0
D2.0-5.05.0
S-1.01.0-3.0
A0.05.01.0
Z-5.00.0-3.0
df.pct_change().round(2)  # 计算百分比的变化,相当于环比增长率
pythonmathen
QNaNNaNNaN
W-0.500.00-0.80
E-0.336.002.00
R-1.00-0.71-1.00
Tinf0.00inf
Y-0.17-1.00-0.17
U0.80inf0.00
I-0.220.000.60
O-0.29-0.57-0.62
P-1.001.67-0.33
Linf-1.000.50
K-0.43inf0.67
J-0.50-1.000.40
H1.00inf-0.57
G0.250.501.67
F0.40-0.33-0.88
D0.29-0.835.00
S-0.111.00-0.50
A0.002.500.33
Z-0.620.00-0.75

高级统计指标

df.cov()  # 协方差,自己和别人计算
pythonmathen
python7.473684-0.6842112.421053
math-0.68421110.3789470.172515
en2.4210530.1725155.766082
df.var()  # 方差,自己和自己计算
python     7.473684
math      10.378947
en         5.766082
dtype: float64
df['python'].cov(df['math'])  # python和math的协方差
-0.6842105263157894
df.corr()  # 相关性系数,pandas牛逼!!!
pythonmathen
python1.000000-0.0776870.382359
math-0.0776871.0000000.022174
en0.3823590.0221741.000000
df.corrwith(df['en'])  # 计算某一列的相关系数
python    0.382359
math      0.022174
en        1.000000
dtype: float64

排序

df = pd.DataFrame(np.random.randint(0,20,size=(20,3)),
                 columns=['python','math','en'],
                 index=list('QWERTYUIOPLKJHGDSAZM'))
df
pythonmathen
Q91617
W535
E104
R01214
T683
Y16140
U17178
I548
O13512
P4510
L1537
K17410
J81412
H10313
G71616
D038
S151614
A31111
Z18154
M101414
# 根据行索引进行排序
df.sort_index(axis = 0,  # 0行;1列
             ascending = False)  # 这个参数如果是False,则表示降序
pythonmathen
Z18154
Y16140
W535
U17178
T683
S151614
R01214
Q91617
P4510
O13512
M101414
L1537
K17410
J81412
I548
H10313
G71616
E104
D038
A31111
# 根据目标列的值进行排序
df.sort_values(by='python',
              ascending=True)
pythonmathen
R01214
D038
E104
A31111
P4510
W535
I548
T683
G71616
J81412
Q91617
H10313
M101414
O13512
S151614
L1537
Y16140
K17410
U17178
Z18154
df.sort_values(by=['python','math'],  # 可以进行多个的排序,分先后排
              ascending=True)
pythonmathen
D038
R01214
E104
A31111
P4510
W535
I548
T683
G71616
J81412
Q91617
H10313
M101414
O13512
L1537
S151614
Y16140
K17410
U17178
Z18154
# 根据python排序,获取最大的n个数值
df.nlargest(n=5,
           columns='python')
pythonmathen
Z18154
U17178
K17410
Y16140
L1537
# 根据python排序,获取最小的n个数值
df.nsmallest(n=5,
           columns='python')
pythonmathen
R01214
D038
E104
A31111
P4510

分箱操作

# 将连续数据转换为分类对应的数据
df = pd.DataFrame(np.random.randint(0,151,size=(100,3)),
                 columns=['python','math','en'])
df
pythonmathen
012712450
115091130
22411369
3902123
413311360
............
954513458
96742175
971003693
981302110
99206687

100 rows × 3 columns

# 等宽分箱
pd.cut(df.python,  # 确定列
      bins = 4,  # 划分为四等份
      labels=['不及格','及格','中等','优秀'],  # 每等份的命名
      right=True  # 区间,闭区间
      )
0      优秀
1      优秀
2     不及格
3      中等
4      优秀
     ... 
95     及格
96     及格
97     中等
98     优秀
99    不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
# 自定义宽度
df['等级'] = pd.cut(df.python,
                      bins = [0,30,60,90,120,150],
                      labels=['极差','不及格','及格','中等','优秀'])
df
pythonmathen等级
012712450优秀
115091130优秀
22411369极差
3902123及格
413311360优秀
...............
954513458不及格
96742175及格
971003693中等
981302110优秀
99206687极差

100 rows × 4 columns

# 等频
pd.qcut(df.python,
       q = 4,
       labels=['不及格','及格','中等','优秀'],
       )
0      优秀
1      优秀
2     不及格
3      中等
4      优秀
     ... 
95     及格
96     中等
97     优秀
98     优秀
99    不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
# 等频
pd.qcut(df.python,
       q = 4,
       labels=['不及格','及格','中等','优秀'],
       ).value_counts()
不及格    25
及格     25
中等     25
优秀     25
Name: python, dtype: int64

分组聚合

分组

df = pd.DataFrame(data={'sex':np.random.randint(0,2,size = 300),
                       'class':np.random.randint(1,9,size = 300),
                       'python':np.random.randint(0,151,size = 300),
                       'math':np.random.randint(0,151,size = 300),
                       'en':np.random.randint(0,151,size = 300)})
df['sex'] = df['sex'].map({0:'男',1:'女'})  # 把0替换为男,1替换为女
df
sexclasspythonmathen
08167085
11378106
242510663
336055101
4212111414
..................
2954338466
296814213489
297215088130
2988621271
29919810180

300 rows × 5 columns

for name,group in df.groupby(by = 'sex'):  # 单分组
    print(name)
    print(group)
女
    sex  class  python  math   en
6     女      8     102    26   26
11    女      2      94   140   71
12    女      5     109    27  127
18    女      6      65    86   52
21    女      7     147    58   55
..   ..    ...     ...   ...  ...
289   女      3     139    62   26
290   女      2      11    52    1
291   女      4      37    14   42
293   女      4      30    21   47
297   女      2     150    88  130

[136 rows x 5 columns]
男
    sex  class  python  math   en
0     男      8      16    70   85
1     男      1      37     8  106
2     男      4      25   106   63
3     男      3      60    55  101
4     男      2     121   114   14
..   ..    ...     ...   ...  ...
294   男      6     124    90  107
295   男      4      33    84   66
296   男      8     142   134   89
298   男      8      62   127    1
299   男      1      98   101   80

[164 rows x 5 columns]
for name,group in df.groupby(by = ['sex','class']):  # 多分组
    print(name)
    print(group)
('女', 1)
    sex  class  python  math   en
27    女      1     128    67   30
34    女      1      10   116   91
35    女      1       3    22  144
73    女      1     102   143  120
84    女      1      29    23  120
99    女      1      53    77  144
143   女      1     104    84  103
157   女      1     145    77   92
173   女      1      61   118  143
174   女      1     114    26  130
178   女      1      85    21    0
186   女      1      26    46    7
192   女      1      76   115   79
198   女      1      61   132  128
217   女      1      58    53   89
230   女      1      61   104  144
234   女      1      50   150  128
238   女      1     115   103   13
277   女      1      83    17   82
('女', 2)
    sex  class  python  math   en
11    女      2      94   140   71
37    女      2       9    71  129
47    女      2      31     5   14
79    女      2      49   142  113
82    女      2     105    67   17
122   女      2      23   143   96
142   女      2     142   144   29
147   女      2      53   107    0
163   女      2      66    87    1
171   女      2     104    80  118
206   女      2      16   100  140
222   女      2      38    12  112
226   女      2     118    85   20
242   女      2       1     3  136
260   女      2      50   115  146
268   女      2      70    93   22
272   女      2      71    74   47
290   女      2      11    52    1
297   女      2     150    88  130
('女', 3)
    sex  class  python  math   en
30    女      3      24    90   78
40    女      3      89   134   99
65    女      3      48    81   73
69    女      3       0     0   66
72    女      3     147    65   95
104   女      3     143   122   54
140   女      3     140   119   97
144   女      3     125    96   14
145   女      3      84   112   42
249   女      3      85    59  122
283   女      3      69    34   27
286   女      3     129    10   73
289   女      3     139    62   26
('女', 4)
    sex  class  python  math   en
32    女      4     116     5  119
43    女      4      60    96  103
48    女      4      13   147  133
64    女      4     126   140   10
92    女      4     122    49   71
118   女      4     125    34   23
139   女      4      84    58   66
164   女      4      59    64   80
167   女      4      47   109   94
183   女      4      96    14   97
187   女      4     123    26   21
203   女      4       6    74  105
274   女      4     148    10  109
291   女      4      37    14   42
293   女      4      30    21   47
('女', 5)
    sex  class  python  math   en
12    女      5     109    27  127
25    女      5       6   124   98
55    女      5      78    63   10
70    女      5      30    71   29
90    女      5      20    52   78
152   女      5      60   116   35
240   女      5      49    27   33
264   女      5     137    95  142
275   女      5     101    69   80
276   女      5      49    88    5
('女', 6)
    sex  class  python  math   en
18    女      6      65    86   52
26    女      6     112    25   89
31    女      6      86    88   92
42    女      6     134    37  111
66    女      6      25   119  113
85    女      6      29   123   36
91    女      6     109    46   87
98    女      6     100   132   10
102   女      6      27   113  107
141   女      6      35    22  144
159   女      6     108    13  136
185   女      6      77   116  103
218   女      6     120   137   45
219   女      6     132    95   42
232   女      6     110    51   47
241   女      6      77    57   40
259   女      6      61   104   58
('女', 7)
    sex  class  python  math   en
21    女      7     147    58   55
28    女      7     141    69   22
33    女      7      53    70  146
51    女      7     120    64   42
59    女      7      89    30  107
74    女      7      64   145   40
89    女      7      36   100   61
96    女      7      66    82   49
106   女      7      77    65   28
127   女      7     103     9  115
133   女      7      31   138  106
138   女      7     130    33  120
151   女      7      83    82   43
161   女      7      22   127   52
165   女      7      47    51    3
170   女      7     100    23   56
194   女      7     118   146  142
199   女      7     125   132   87
213   女      7      64    46   41
216   女      7      43    99  114
228   女      7     140    91   70
243   女      7      76   103   44
245   女      7       9    89   12
250   女      7     139    81   40
288   女      7     115   148   79
('女', 8)
    sex  class  python  math   en
6     女      8     102    26   26
88    女      8      19    44  134
114   女      8     133    19   74
124   女      8      91    77  120
130   女      8     141   135   21
166   女      8       3   114   91
168   女      8     136    69  119
172   女      8      69   147   32
177   女      8      37    81   49
195   女      8      66    69    4
224   女      8      18    29  141
233   女      8     117    63   23
239   女      8      68    71   71
248   女      8      31    71    9
252   女      8     141   113  147
267   女      8     125   136   17
271   女      8     102   137   97
279   女      8     137   110   51
('男', 1)
    sex  class  python  math   en
1     男      1      37     8  106
8     男      1       8    48  113
23    男      1     116    43   13
24    男      1      79   115  123
29    男      1      24   112  126
39    男      1     112   115  133
68    男      1     131   138   80
95    男      1      86   125    5
101   男      1      49     0   67
137   男      1      59     5   62
176   男      1      91    75   68
191   男      1      90    91   15
196   男      1     129    59    9
211   男      1      78    97   13
215   男      1      31    27   66
229   男      1      62    21   28
237   男      1     140    66  126
282   男      1       5    67   12
299   男      1      98   101   80
('男', 2)
    sex  class  python  math   en
4     男      2     121   114   14
7     男      2      55   125   62
14    男      2      36   142   77
16    男      2      80   143   49
36    男      2     105   101   88
38    男      2      51    55   18
45    男      2      80    17   13
62    男      2      23    35   20
105   男      2     115   106   59
128   男      2      86    18   78
149   男      2      81    34   12
158   男      2      80    24  133
169   男      2      55   100   47
175   男      2     138    87   43
182   男      2     111    62   54
210   男      2      69   128   67
227   男      2      16    34   79
246   男      2      29    29   86
253   男      2     130   117  119
265   男      2       8    21   18
284   男      2      22    66   16
('男', 3)
    sex  class  python  math   en
3     男      3      60    55  101
5     男      3      85    69   89
9     男      3      69    42   34
13    男      3      91   142   86
17    男      3      36    50   15
50    男      3     116    60   68
54    男      3      34   144   96
58    男      3     102   126   95
61    男      3     127    97   78
77    男      3      16    74    7
81    男      3     130    54  112
86    男      3      92     3   92
93    男      3      51    83   25
103   男      3     118   122   64
108   男      3      41   109  124
109   男      3      84   123   71
111   男      3      15    32   94
117   男      3      18    62   86
126   男      3       8    76   64
129   男      3     145     9    5
204   男      3     134     5   54
207   男      3      61    22  143
212   男      3      66   146   49
251   男      3     132   109  147
254   男      3      10   146  117
280   男      3      89   109  101
287   男      3      24    24  128
('男', 4)
    sex  class  python  math   en
2     男      4      25   106   63
46    男      4      24   149  108
49    男      4     133   134   64
52    男      4      46    37  134
60    男      4     137    40   45
87    男      4      64     6   19
112   男      4     136    73  114
120   男      4     133    18   63
190   男      4      45    25  111
200   男      4      38    41   92
202   男      4      19   121   72
221   男      4      29   132   83
236   男      4      54   100   53
263   男      4      40    14   48
270   男      4     125   103  132
295   男      4      33    84   66
('男', 5)
    sex  class  python  math   en
53    男      5     114    92  128
63    男      5      91   105  125
76    男      5      14    18   56
97    男      5      42    43   95
110   男      5      83    20   49
113   男      5     139    46  146
115   男      5      93    49  142
125   男      5      31    85   95
131   男      5      51    18   12
150   男      5      46    90   33
162   男      5      78   123  127
208   男      5       3   130   94
209   男      5     106    92   92
214   男      5      39    67  119
223   男      5       7   102  137
257   男      5      79    44   88
262   男      5     109    34   96
266   男      5     107    72  149
273   男      5     145     2   58
('男', 6)
    sex  class  python  math   en
15    男      6     147    53   58
20    男      6      18    24   53
116   男      6     124    92   52
119   男      6     117    60   33
134   男      6      36    17  136
135   男      6      66    19   48
148   男      6     102    26   42
154   男      6      12    18   31
155   男      6      36    19  136
181   男      6      48    51  142
184   男      6      72   114   74
188   男      6     136    82   26
189   男      6     120   116   36
193   男      6      42    21   69
201   男      6     144   124   24
225   男      6     107   140   18
235   男      6      83   117   99
294   男      6     124    90  107
('男', 7)
    sex  class  python  math   en
10    男      7      35   134   96
19    男      7     112   127   64
22    男      7     106    60    8
44    男      7     132    15  124
67    男      7     103    40   93
71    男      7      57    64  150
78    男      7     131   122  136
83    男      7      12   149   83
94    男      7      76    54  118
123   男      7      53   109   50
136   男      7      15    64   51
146   男      7      12   101   65
153   男      7      58   110  107
156   男      7     109    49  138
180   男      7      50    73   86
197   男      7      25    71   74
205   男      7     136    80   50
220   男      7     103    48   80
244   男      7      58    55   99
261   男      7      75   136   83
269   男      7      69    86  149
278   男      7      66    70   81
292   男      7     144   149   11
('男', 8)
    sex  class  python  math   en
0     男      8      16    70   85
41    男      8      91   118   70
56    男      8      32    30  146
57    男      8      85   129   98
75    男      8     145    99  135
80    男      8      56    94   57
100   男      8      85   100   53
107   男      8     146   133   64
121   男      8      69    21    9
132   男      8      66    41   43
160   男      8      24   125  136
179   男      8      16    44   92
231   男      8      96     6    1
247   男      8      81   139  133
255   男      8      12    13  113
256   男      8      64   101   20
258   男      8     138    27  103
281   男      8      66    23   17
285   男      8      20   118   54
296   男      8     142   134   89
298   男      8      62   127    1
for name,group in df[['python','math']].groupby(df['sex']):
    print(name)
    print(group)
女
     python  math
6       102    26
11       94   140
12      109    27
18       65    86
21      147    58
..      ...   ...
289     139    62
290      11    52
291      37    14
293      30    21
297     150    88

[136 rows x 2 columns]
男
     python  math
0        16    70
1        37     8
2        25   106
3        60    55
4       121   114
..      ...   ...
294     124    90
295      33    84
296     142   134
298      62   127
299      98   101

[164 rows x 2 columns]
for name,group in df[['python','math']].groupby([df['sex'],df['class']]):
    print(name)
    print(group)
('女', 1)
     python  math
27      128    67
34       10   116
35        3    22
73      102   143
84       29    23
99       53    77
143     104    84
157     145    77
173      61   118
174     114    26
178      85    21
186      26    46
192      76   115
198      61   132
217      58    53
230      61   104
234      50   150
238     115   103
277      83    17
('女', 2)
     python  math
11       94   140
37        9    71
47       31     5
79       49   142
82      105    67
122      23   143
142     142   144
147      53   107
163      66    87
171     104    80
206      16   100
222      38    12
226     118    85
242       1     3
260      50   115
268      70    93
272      71    74
290      11    52
297     150    88
('女', 3)
     python  math
30       24    90
40       89   134
65       48    81
69        0     0
72      147    65
104     143   122
140     140   119
144     125    96
145      84   112
249      85    59
283      69    34
286     129    10
289     139    62
('女', 4)
     python  math
32      116     5
43       60    96
48       13   147
64      126   140
92      122    49
118     125    34
139      84    58
164      59    64
167      47   109
183      96    14
187     123    26
203       6    74
274     148    10
291      37    14
293      30    21
('女', 5)
     python  math
12      109    27
25        6   124
55       78    63
70       30    71
90       20    52
152      60   116
240      49    27
264     137    95
275     101    69
276      49    88
('女', 6)
     python  math
18       65    86
26      112    25
31       86    88
42      134    37
66       25   119
85       29   123
91      109    46
98      100   132
102      27   113
141      35    22
159     108    13
185      77   116
218     120   137
219     132    95
232     110    51
241      77    57
259      61   104
('女', 7)
     python  math
21      147    58
28      141    69
33       53    70
51      120    64
59       89    30
74       64   145
89       36   100
96       66    82
106      77    65
127     103     9
133      31   138
138     130    33
151      83    82
161      22   127
165      47    51
170     100    23
194     118   146
199     125   132
213      64    46
216      43    99
228     140    91
243      76   103
245       9    89
250     139    81
288     115   148
('女', 8)
     python  math
6       102    26
88       19    44
114     133    19
124      91    77
130     141   135
166       3   114
168     136    69
172      69   147
177      37    81
195      66    69
224      18    29
233     117    63
239      68    71
248      31    71
252     141   113
267     125   136
271     102   137
279     137   110
('男', 1)
     python  math
1        37     8
8         8    48
23      116    43
24       79   115
29       24   112
39      112   115
68      131   138
95       86   125
101      49     0
137      59     5
176      91    75
191      90    91
196     129    59
211      78    97
215      31    27
229      62    21
237     140    66
282       5    67
299      98   101
('男', 2)
     python  math
4       121   114
7        55   125
14       36   142
16       80   143
36      105   101
38       51    55
45       80    17
62       23    35
105     115   106
128      86    18
149      81    34
158      80    24
169      55   100
175     138    87
182     111    62
210      69   128
227      16    34
246      29    29
253     130   117
265       8    21
284      22    66
('男', 3)
     python  math
3        60    55
5        85    69
9        69    42
13       91   142
17       36    50
50      116    60
54       34   144
58      102   126
61      127    97
77       16    74
81      130    54
86       92     3
93       51    83
103     118   122
108      41   109
109      84   123
111      15    32
117      18    62
126       8    76
129     145     9
204     134     5
207      61    22
212      66   146
251     132   109
254      10   146
280      89   109
287      24    24
('男', 4)
     python  math
2        25   106
46       24   149
49      133   134
52       46    37
60      137    40
87       64     6
112     136    73
120     133    18
190      45    25
200      38    41
202      19   121
221      29   132
236      54   100
263      40    14
270     125   103
295      33    84
('男', 5)
     python  math
53      114    92
63       91   105
76       14    18
97       42    43
110      83    20
113     139    46
115      93    49
125      31    85
131      51    18
150      46    90
162      78   123
208       3   130
209     106    92
214      39    67
223       7   102
257      79    44
262     109    34
266     107    72
273     145     2
('男', 6)
     python  math
15      147    53
20       18    24
116     124    92
119     117    60
134      36    17
135      66    19
148     102    26
154      12    18
155      36    19
181      48    51
184      72   114
188     136    82
189     120   116
193      42    21
201     144   124
225     107   140
235      83   117
294     124    90
('男', 7)
     python  math
10       35   134
19      112   127
22      106    60
44      132    15
67      103    40
71       57    64
78      131   122
83       12   149
94       76    54
123      53   109
136      15    64
146      12   101
153      58   110
156     109    49
180      50    73
197      25    71
205     136    80
220     103    48
244      58    55
261      75   136
269      69    86
278      66    70
292     144   149
('男', 8)
     python  math
0        16    70
41       91   118
56       32    30
57       85   129
75      145    99
80       56    94
100      85   100
107     146   133
121      69    21
132      66    41
160      24   125
179      16    44
231      96     6
247      81   139
255      12    13
256      64   101
258     138    27
281      66    23
285      20   118
296     142   134
298      62   127
df.dtypes # 获取字段类型
sex       object
class      int32
python     int32
math       int32
en         int32
dtype: object
for name,group in df.groupby(df.dtypes,axis = 1):
    print(name,group)
int32      class  python  math   en
0        8      16    70   85
1        1      37     8  106
2        4      25   106   63
3        3      60    55  101
4        2     121   114   14
..     ...     ...   ...  ...
295      4      33    84   66
296      8     142   134   89
297      2     150    88  130
298      8      62   127    1
299      1      98   101   80

[300 rows x 4 columns]
object     sex
0     男
1     男
2     男
3     男
4     男
..   ..
295   男
296   男
297   女
298   男
299   男

[300 rows x 1 columns]
m = {'sex':'category','class':'category','python':'科目','math':'科目','en':'科目'}
for name,group in df.groupby(by = m,axis = 1):
    print(name,group)
category     sex  class
0     男      8
1     男      1
2     男      4
3     男      3
4     男      2
..   ..    ...
295   男      4
296   男      8
297   女      2
298   男      8
299   男      1

[300 rows x 2 columns]
科目      python  math   en
0        16    70   85
1        37     8  106
2        25   106   63
3        60    55  101
4       121   114   14
..      ...   ...  ...
295      33    84   66
296     142   134   89
297     150    88  130
298      62   127    1
299      98   101   80

[300 rows x 3 columns]

分组聚合

df.groupby(by='sex')[['python','math']].max()  3 数学和python根据性别分组,quit解的最大值
pythonmath
sex
150150
147149
df.groupby(by=['sex','class']).mean().round(1)  # 多组划分求数学统计,mean平均值
pythonmathen
sexclass
171.878.694.1
263.284.670.6
394.075.766.6
479.557.474.7
563.973.263.7
682.880.277.2
785.583.267.0
885.383.968.1
175.069.165.5
271.074.254.9
372.477.579.4
467.673.979.2
572.564.896.9
685.265.765.8
775.585.586.8
872.080.672.3
df.groupby(by=['sex','class']).size()  # 根据性别和班级统计人数
sex  class
女    1        19
     2        19
     3        13
     4        15
     5        10
     6        17
     7        25
     8        18
男    1        19
     2        21
     3        27
     4        16
     5        19
     6        18
     7        23
     8        21
dtype: int64
df.groupby(by=['sex','class']).describe().round(1)  # 分组描述
pythonmathen
countmeanstdmin25%50%75%maxcountmean...75%maxcountmeanstdmin25%50%75%max
sexclass
119.071.839.63.051.561.0103.0145.019.078.6...115.5150.019.094.148.50.080.5103.0129.0144.0
219.063.245.21.027.053.099.0150.019.084.6...111.0144.019.070.655.80.018.571.0123.5146.0
313.094.048.50.069.089.0139.0147.013.075.7...112.0134.013.066.632.714.042.073.095.0122.0
415.079.546.46.042.084.0122.5148.015.057.4...85.0147.015.074.738.610.044.580.0104.0133.0
510.063.942.06.034.854.595.2137.010.073.2...93.2124.010.063.748.45.030.056.593.5142.0
617.082.837.225.061.086.0110.0134.017.080.2...116.0137.017.077.238.810.045.087.0107.0144.0
725.085.541.39.053.083.0120.0147.025.083.2...103.0148.025.067.039.93.041.055.0106.0146.0
818.085.347.83.044.296.5131.0141.018.083.9...113.8147.018.068.149.04.023.861.0113.5147.0
119.075.041.55.043.079.0105.0140.019.069.1...106.5138.019.065.546.45.014.067.0109.5133.0
221.071.039.68.036.080.0105.0138.021.074.2...114.0143.021.054.935.412.018.054.078.0133.0
327.072.443.68.035.069.0109.0145.027.077.5...115.5146.027.079.439.05.059.086.0101.0147.0
416.067.646.819.032.045.5127.0137.016.073.9...109.8149.016.079.233.119.060.569.0108.8134.0
519.072.543.13.040.579.0106.5145.019.064.8...92.0130.019.096.940.012.073.095.0127.5149.0
618.085.245.012.043.592.5123.0147.018.065.7...108.5140.018.065.841.018.033.852.592.8142.0
723.075.541.412.051.569.0107.5144.023.085.5...116.0149.023.086.838.88.064.583.0112.5150.0
821.072.043.812.032.066.091.0146.021.080.6...125.0139.021.072.346.41.043.070.0103.0146.0

16 rows × 24 columns

分组聚合apply、transform

df
sexclasspythonmathen
08167085
11378106
242510663
336055101
4212111414
..................
2954338466
296814213489
297215088130
2988621271
29919810180

300 rows × 5 columns

# apply是聚合的结果
df.groupby(by=['sex','class'])[['python','en']].apply(np.mean).round(1)
C:\Users\Panda\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:3472: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
pythonen
sexclass
171.894.1
263.270.6
394.066.6
479.574.7
563.963.7
682.877.2
785.567.0
885.368.1
175.065.5
271.054.9
372.479.4
467.679.2
572.596.9
685.265.8
775.586.8
872.072.3
# transform 计算,返回的结果还是DataFrame长度
df.groupby(by=['sex','class'])[['python','en']].transform(np.mean).round(1)
pythonen
072.072.3
175.065.5
267.679.2
372.479.4
471.054.9
.........
29567.679.2
29672.072.3
29763.270.6
29872.072.3
29975.065.5

300 rows × 2 columns

def _mean(x):  # 自己定义求平均值方法,方法的重写
    return np.round(x.mean(),1)
df.groupby(by=['sex','class'])[['python','en']].transform(_mean)
pythonen
072.072.3
175.065.5
267.679.2
372.479.4
471.054.9
.........
29567.679.2
29672.072.3
29763.270.6
29872.072.3
29975.065.5

300 rows × 2 columns

分组聚合agg

# 可以同时统计汇总多种计算方法
df.groupby(by = ['sex','class']).agg([np.mean,np.max,np.min])
pythonmathen
meanamaxaminmeanamaxaminmeanamaxamin
sexclass
171.789474145378.6315791501794.0526321440
263.210526150184.631579144370.6315791460
394.000000147075.692308134066.61538512214
479.466667148657.400000147574.66666713310
563.900000137673.2000001242763.7000001425
682.7647061342580.2352941371377.17647114410
785.520000147983.240000148966.9600001463
885.333333141383.9444441471968.1111111474
175.000000140569.105263138065.5263161335
271.000000138874.1904761431754.85714313312
372.370370145877.518519146379.4444441475
467.5625001371973.937500149679.18750013419
572.473684145364.842105130296.89473714912
685.2222221471265.7222221401765.77777814218
775.5217391441285.4782611491586.7826091508
872.0000001461280.571429139672.3333331461
# 三列执行相同模式的运算
df.groupby(by = ['sex','class']).agg([('平均值',np.mean),
                                      ('最大值',np.max),
                                     ('最小值',np.min)]).round(1)
pythonmathen
平均值最大值最小值平均值最大值最小值平均值最大值最小值
sexclass
171.8145378.61501794.11440
263.2150184.6144370.61460
394.0147075.7134066.612214
479.5148657.4147574.713310
563.9137673.21242763.71425
682.81342580.21371377.214410
785.5147983.2148967.01463
885.3141383.91471968.11474
175.0140569.1138065.51335
271.0138874.21431754.913312
372.4145877.5146379.41475
467.61371973.9149679.213419
572.5145364.8130296.914912
685.21471265.71401765.814218
775.51441285.51491586.81508
872.01461280.6139672.31461
# 三类执行不同的计算方式
df.groupby(by=['sex','class']).agg({'python':np.max,
                                   'math':np.min,
                                   'en':np.mean}).round(1)
pythonmathen
sexclass
11451794.1
2150370.6
3147066.6
4148574.7
51372763.7
61341377.2
7147967.0
81411968.1
1140065.5
21381754.9
3145379.4
4137679.2
5145296.9
61471765.8
71441586.8
8146672.3

透视表pivot_table

# 透视表也是一种分组聚合运算
df
sexclasspythonmathen
08167085
11378106
242510663
336055101
4212111414
..................
2954338466
296814213489
297215088130
2988621271
29919810180

300 rows × 5 columns

df.pivot_table(values = ['python','en'],
              index=['sex','class'],
              aggfunc='mean').round(1)  # 和分组聚合apply是一样的,但是透视表的功能更加丰富
enpython
sexclass
194.171.8
270.663.2
366.694.0
474.779.5
563.763.9
677.282.8
767.085.5
868.185.3
165.575.0
254.971.0
379.472.4
479.267.6
596.972.5
665.885.2
786.875.5
872.372.0
# 牛逼牛逼牛逼牛逼
df1 = df.pivot_table(values = ['python','math','en'],  # 要透视的值
              index=['sex','class'],  # 索引,根据什么进行透视(分组)
              aggfunc={'python':[('平均值',np.mean)],
                       'math':[('最大值',np.max),
                              ('最小值',np.min)],
                       'en':[('标准差',np.std),
                            ('方差',np.var),
                            ('计数',np.size)]}).round(1)  # 透视的方式是什么()方法
df1
enmathpython
方差标准差计数最大值最小值平均值
sexclass
12356.248.5191501771.8
23115.655.819144363.2
31067.432.713134094.0
41487.438.615147579.5
52342.748.4101242763.9
61508.838.8171371382.8
71593.539.925148985.5
82398.149.0181471985.3
12150.346.419138075.0
21252.535.4211431771.0
31524.539.027146372.4
41097.133.116149667.6
51600.540.019130272.5
61684.141.0181401785.2
71507.138.8231491575.5
82151.746.421139672.0

时间序列

时间戳操作

pd.Timestamp('2020.09.15 20')  # 时刻数据,具体的时间点
Timestamp('2020-09-15 20:00:00')
pd.Period('2020.09.15',freq='M')  # 时期数据,表示一段时间,频率
Period('2020-09', 'M')
index = pd.date_range('2020.09.15',
             freq='D',
             period
  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值