导入库
import numpy as np
import pandas as pd
import time
import warnings
warnings. filterwarnings( 'ignore' )
pandas基础
pandas数据结构
Series 一维数据
l = np. array( [ 1 , 2 , 3 , 6 , 9 ] )
s1 = pd. Series( data= l)
display( l, s1)
array([1, 2, 3, 6, 9])
0 1
1 2
2 3
3 6
4 9
dtype: int32
s2 = pd. Series( data = l, index = list ( 'ABCDE' ) )
s2
A 1
B 2
C 3
D 6
E 9
dtype: int32
s3 = pd. Series( data = { 'A' : 149 , 'B' : 130 , 'C' : 118 , 'D' : 99 , 'E' : 66 } )
s3
A 149
B 130
C 118
D 99
E 66
dtype: int64
DataFrame 多维数据
df1 = pd. DataFrame( data = np. random. randint( 0 , 151 , size= ( 10 , 3 ) ) ,
index = list ( 'ABCDEFHIJK' ) ,
columns= [ 'python' , 'math' , 'english' ] ,
dtype = np. float16)
df1
python math english A 120.0 76.0 39.0 B 10.0 139.0 85.0 C 69.0 14.0 33.0 D 85.0 97.0 144.0 E 132.0 11.0 35.0 F 129.0 25.0 148.0 H 45.0 66.0 95.0 I 148.0 54.0 61.0 J 56.0 14.0 81.0 K 55.0 137.0 104.0
df2 = pd. DataFrame( data= { 'python' : [ 66 , 99 , 128 ] , 'math' : [ 88 , 65 , 137 ] , 'english' : [ 100 , 121 , 45 ] } )
df2
python math english 0 66 88 100 1 99 65 121 2 128 137 45
数据查看
df = pd. DataFrame( data = np. random. randint( 0 , 151 , size= ( 100 , 3 ) ) ,
columns= [ 'python' , 'math' , 'english' ] )
df
python math english 0 150 111 83 1 31 137 41 2 126 129 130 3 91 23 118 4 96 68 22 ... ... ... ... 95 21 29 78 96 34 7 60 97 134 21 111 98 108 130 137 99 127 105 59
100 rows × 3 columns
df. shape
(100, 3)
df. head( n= 3 )
python math english 0 150 111 83 1 31 137 41 2 126 129 130
df. tail( n= 5 )
python math english 95 21 29 78 96 34 7 60 97 134 21 111 98 108 130 137 99 127 105 59
df. dtypes
python int32
math int32
english int32
dtype: object
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 python 100 non-null int32
1 math 100 non-null int32
2 english 100 non-null int32
dtypes: int32(3)
memory usage: 1.3 KB
df. describe( )
python math english count 100.00000 100.000000 100.000000 mean 75.48000 72.330000 78.180000 std 41.94645 40.506404 42.564836 min 1.00000 1.000000 3.000000 25% 47.50000 40.500000 42.500000 50% 75.00000 70.000000 81.500000 75% 109.00000 101.250000 113.500000 max 150.00000 146.000000 150.000000
df. values
array([[150, 111, 83],
[ 31, 137, 41],
[126, 129, 130],
[ 91, 23, 118],
[ 96, 68, 22],
[ 46, 12, 108],
[141, 100, 16],
[ 87, 6, 20],
[104, 133, 101],
[ 73, 108, 134],
[ 57, 146, 18],
[ 57, 133, 49],
[ 33, 70, 28],
[ 69, 37, 80],
[118, 30, 44],
[ 7, 43, 37],
[104, 42, 39],
[145, 58, 107],
[121, 17, 92],
[120, 80, 89],
[ 66, 69, 18],
[ 21, 107, 140],
[ 17, 1, 129],
[ 83, 65, 95],
[ 95, 61, 89],
[ 79, 129, 90],
[ 97, 54, 91],
[123, 14, 50],
[128, 76, 33],
[ 7, 41, 138],
[ 61, 141, 134],
[138, 145, 20],
[ 60, 27, 115],
[ 77, 139, 66],
[112, 83, 119],
[138, 142, 48],
[ 16, 99, 85],
[ 55, 21, 90],
[ 51, 75, 63],
[123, 34, 39],
[ 18, 39, 103],
[129, 73, 57],
[139, 10, 130],
[ 22, 79, 6],
[ 61, 127, 85],
[ 99, 78, 138],
[ 59, 60, 12],
[ 63, 41, 64],
[ 17, 69, 112],
[ 86, 42, 80],
[ 94, 18, 41],
[ 15, 42, 3],
[ 90, 57, 139],
[ 48, 91, 107],
[ 66, 92, 54],
[ 70, 81, 31],
[ 11, 8, 66],
[ 10, 96, 94],
[144, 92, 120],
[ 84, 69, 16],
[ 5, 52, 76],
[ 41, 74, 140],
[ 26, 75, 26],
[ 49, 118, 67],
[ 59, 72, 61],
[ 77, 26, 72],
[122, 52, 7],
[ 70, 34, 20],
[117, 16, 131],
[ 1, 89, 144],
[ 89, 24, 93],
[ 6, 83, 125],
[119, 117, 9],
[138, 89, 104],
[ 65, 110, 29],
[ 87, 126, 70],
[ 51, 118, 75],
[ 29, 82, 43],
[104, 136, 129],
[ 22, 65, 150],
[107, 139, 120],
[ 68, 95, 122],
[ 58, 53, 133],
[117, 70, 87],
[ 28, 131, 6],
[104, 68, 95],
[128, 32, 148],
[ 96, 138, 7],
[ 60, 65, 103],
[ 53, 11, 150],
[ 5, 86, 83],
[145, 53, 60],
[ 88, 70, 113],
[ 88, 32, 59],
[ 54, 70, 50],
[ 21, 29, 78],
[ 34, 7, 60],
[134, 21, 111],
[108, 130, 137],
[127, 105, 59]])
df. columns
Index(['python', 'math', 'english'], dtype='object')
df. index
RangeIndex(start=0, stop=100, step=1)
数据输入与输出
Csv
df = pd. DataFrame( data = np. random. randint( 0 , 151 , size = ( 100 , 3 ) ) ,
columns= [ 'Python' , 'Math' , 'English' ] )
df
Python Math English 0 62 40 150 1 95 61 27 2 18 110 56 3 8 91 59 4 45 40 2 ... ... ... ... 95 40 100 55 96 14 133 135 97 97 95 91 98 45 70 72 99 131 54 49
100 rows × 3 columns
df. to_csv( './data.csv' ,
sep= ',' ,
index = True ,
header = True )
pd. read_csv( './data.csv' ,
index_col = 0
)
Python Math English 0 62 40 150 1 95 61 27 2 18 110 56 3 8 91 59 4 45 40 2 ... ... ... ... 95 40 100 55 96 14 133 135 97 97 95 91 98 45 70 72 99 131 54 49
100 rows × 3 columns
Excel
df
Python Math English 0 62 40 150 1 95 61 27 2 18 110 56 3 8 91 59 4 45 40 2 ... ... ... ... 95 40 100 55 96 14 133 135 97 97 95 91 98 45 70 72 99 131 54 49
100 rows × 3 columns
df. to_excel( './data.xlsx' )
pd. read_excel( './data.xlsx' ,
index_col = 0 )
Python Math English 0 62 40 150 1 95 61 27 2 18 110 56 3 8 91 59 4 45 40 2 ... ... ... ... 95 40 100 55 96 14 133 135 97 97 95 91 98 45 70 72 99 131 54 49
100 rows × 3 columns
HDF5
df
Python Math English 0 62 40 150 1 95 61 27 2 18 110 56 3 8 91 59 4 45 40 2 ... ... ... ... 95 40 100 55 96 14 133 135 97 97 95 91 98 45 70 72 99 131 54 49
100 rows × 3 columns
df. to_hdf( './data.h5' ,
key = 'score' )
df2 = pd. DataFrame( data = np. random. randint( 6 , 100 , size= ( 1000 , 5 ) ) ,
columns = [ '计算机' , '化工' , '生物' , '工程' , '教师' ] )
df2
计算机 化工 生物 工程 教师 0 58 60 64 34 15 1 25 36 94 25 55 2 33 15 68 58 85 3 30 81 98 55 66 4 83 15 35 37 33 ... ... ... ... ... ... 995 96 8 77 34 8 996 40 42 23 13 76 997 35 33 41 23 71 998 80 72 21 18 29 999 19 77 44 30 18
1000 rows × 5 columns
df2. to_hdf( './data.h5' ,
key = 'salary' )
pd. read_hdf( './data.h5' ,
key = 'salary' )
计算机 化工 生物 工程 教师 0 58 60 64 34 15 1 25 36 94 25 55 2 33 15 68 58 85 3 30 81 98 55 66 4 83 15 35 37 33 ... ... ... ... ... ... 995 96 8 77 34 8 996 40 42 23 13 76 997 35 33 41 23 71 998 80 72 21 18 29 999 19 77 44 30 18
1000 rows × 5 columns
SQL
from sqlalchemy import create_engine
engine = create_engine(
'mysql+pymysql://root:123456@localhost/cda?charset=utf8'
)
df2. to_sql( 'salary' ,
con = engine,
index = False
)
1000
pd. read_sql( 'select * from salary limit 10' ,
con= engine)
计算机 化工 生物 工程 教师 0 58 60 64 34 15 1 25 36 94 25 55 2 33 15 68 58 85 3 30 81 98 55 66 4 83 15 35 37 33 5 54 26 43 93 15 6 54 13 42 62 38 7 68 10 26 65 25 8 6 9 51 48 7 9 29 98 21 36 85
数据获取
获取数据
df = pd. DataFrame( np. random. randint( 0 , 151 , size= ( 10 , 3 ) ) ,
index= list ( 'ABCDEFGHIJ' ) ,
columns= [ 'python' , 'math' , 'en' ] )
df
python math en A 17 4 109 B 78 51 91 C 52 29 121 D 118 125 65 E 72 104 144 F 113 69 18 G 86 114 97 H 64 21 142 I 102 72 118 J 114 34 93
df[ 'python' ]
A 27
B 126
C 91
D 66
E 103
F 53
G 45
H 34
I 2
J 26
Name: python, dtype: int32
df. python
A 27
B 126
C 91
D 66
E 103
F 53
G 45
H 34
I 2
J 26
Name: python, dtype: int32
df[ [ 'python' , 'en' ] ]
python en A 27 38 B 126 48 C 91 37 D 66 7 E 103 72 F 53 47 G 45 4 H 34 116 I 2 1 J 26 18
标签选择
df. loc[ 'A' ]
python 27
math 148
en 38
Name: A, dtype: int32
df. loc[ [ 'A' , 'B' , 'D' ] ]
python math en A 27 148 38 B 126 141 48 D 66 16 7
df. loc[ 'A' , 'python' ]
27
df. loc[ [ 'A' , 'C' , 'F' ] , 'python' ]
A 27
C 91
F 53
Name: python, dtype: int32
df. loc[ 'A' : : 2 , [ 'math' , 'en' ] ]
math en A 148 38 C 0 37 E 141 72 G 28 4 I 23 1
df. loc[ 'A' : 'D' , : ]
python math en A 27 148 38 B 126 141 48 C 91 0 37 D 66 16 7
位置选择
df. iloc[ 0 ]
python 27
math 148
en 38
Name: A, dtype: int32
df. iloc[ : 4 , : 2 ]
python math A 27 148 B 126 141 C 91 0 D 66 16
布尔索引
cond = df. python > 80
df[ cond]
python math en B 126 141 48 C 91 0 37 E 103 141 72
df. mean( )
python 57.3
math 71.7
en 38.8
dtype: float64
df. mean( axis= 1 )
A 71.000000
B 105.000000
C 42.666667
D 29.666667
E 105.333333
F 55.666667
G 25.666667
H 73.666667
I 8.666667
J 42.000000
dtype: float64
df[ df. mean( axis= 1 ) > 75 ]
python math en B 126 141 48 E 103 141 72
df[ ( df. python > 70 ) & ( df. math > 70 ) ]
python math en B 126 141 48 E 103 141 72
df. index
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
cond = df. index. isin( [ 'C' , 'E' , 'H' , 'K' ] )
df[ cond]
python math en C 91 0 37 E 103 141 72 H 34 71 116
赋值操作
df[ 'python' ] [ 'A' ] = 150
df
python math en A 150 148 38 B 126 141 48 C 91 0 37 D 66 16 7 E 103 141 72 F 53 67 47 G 45 28 4 H 34 71 116 I 2 23 1 J 26 82 18
df[ 'java' ] = np. random. randint( 1 , 151 , size= 10 )
df
python math en java A 17 4 109 88 B 78 51 91 99 C 52 29 121 54 D 218 125 165 135 E 72 104 144 73 F 113 69 18 53 G 186 114 197 22 H 64 21 142 37 I 102 72 118 39 J 214 34 193 125
df. loc[ [ 'C' , 'D' , 'E' ] , 'math' ] = 147
df
python math en java A 150 148 38 107 B 126 141 48 6 C 91 147 37 133 D 66 147 7 145 E 103 147 72 46 F 53 67 47 89 G 45 28 4 95 H 34 71 116 27 I 2 23 1 104 J 26 82 18 117
df[ df < 60 ] = 60
df
python math en java A 150 148 60 107 B 126 141 60 60 C 91 147 60 133 D 66 147 60 145 E 103 147 72 60 F 60 67 60 89 G 60 60 60 95 H 60 71 116 60 I 60 60 60 104 J 60 82 60 117
df. iloc[ 3 : : 3 , [ 0 , 2 ] ] += 100
df
python math en java A 17 4 109 88 B 78 51 91 99 C 52 29 121 54 D 318 125 265 135 E 72 104 144 73 F 113 69 18 53 G 286 114 297 22 H 64 21 142 37 I 102 72 118 39 J 314 34 293 125
数据集成
concat行列合并
df1 = pd. DataFrame( np. random. randint( 0 , 151 , size= ( 10 , 3 ) ) ,
index= list ( 'ABCDEFGHIJ' ) ,
columns= [ 'python' , 'math' , 'en' ] )
df2 = pd. DataFrame( np. random. randint( 0 , 151 , size= ( 10 , 3 ) ) ,
index= list ( 'HILKLMNOPQ' ) ,
columns= [ 'python' , 'math' , 'en' ] )
df3 = pd. DataFrame( np. random. randint( 0 , 151 , size= ( 10 , 2 ) ) ,
index= list ( 'ABCDEFGHIJ' ) ,
columns= [ 'java' , 'chinese' ] )
pd. concat( [ df1, df2] ,
axis= 0 )
python math en A 107 52 125 B 46 43 64 C 141 147 118 D 51 110 21 E 135 62 46 F 75 3 25 G 43 29 120 H 22 34 39 I 82 92 0 J 32 17 77 H 58 141 110 I 12 137 62 L 29 140 131 K 74 59 33 L 30 29 7 M 3 6 77 N 50 135 12 O 139 134 134 P 140 115 119 Q 141 141 21
pd. concat( [ df1, df3] ,
axis = 1 )
python math en java chinese A 107 52 125 12 26 B 46 43 64 70 13 C 141 147 118 143 130 D 51 110 21 73 115 E 135 62 46 145 2 F 75 3 25 54 49 G 43 29 120 61 119 H 22 34 39 105 137 I 82 92 0 114 114 J 32 17 77 85 49
数据插入
df1. insert( loc = 1 ,
column= 'c++' ,
value = np. random. randint( 0 , 151 , size= 10 ) )
---------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [106], in <cell line: 1>()
----> 1 df1.insert(loc = 1,
2 column='c++',
3 value = np.random.randint(0,151,size=10)) # 插入一列
4 df1
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4428 raise ValueError(
4429 "Cannot specify 'allow_duplicates=True' when "
4430 "'self.flags.allows_duplicate_labels' is False."
4431 )
4432 if not allow_duplicates and column in self.columns:
4433 # Should this be a different kind of error??
-> 4434 raise ValueError(f"cannot insert {column}, already exists")
4435 if not isinstance(loc, int):
4436 raise TypeError("loc must be int")
ValueError: cannot insert c++, already exists
df1
python c++ math en A 107 94 52 125 B 46 92 43 64 C 141 72 147 118 D 51 7 110 21 E 135 73 62 46 F 75 15 3 25 G 43 60 29 120 H 22 142 34 39 I 82 43 92 0 J 32 94 17 77
pd. concat( [ df1, df3] ,
axis= 0 )
python c++ math en java chinese A 107.0 94.0 52.0 125.0 NaN NaN B 46.0 92.0 43.0 64.0 NaN NaN C 141.0 72.0 147.0 118.0 NaN NaN D 51.0 7.0 110.0 21.0 NaN NaN E 135.0 73.0 62.0 46.0 NaN NaN F 75.0 15.0 3.0 25.0 NaN NaN G 43.0 60.0 29.0 120.0 NaN NaN H 22.0 142.0 34.0 39.0 NaN NaN I 82.0 43.0 92.0 0.0 NaN NaN J 32.0 94.0 17.0 77.0 NaN NaN A NaN NaN NaN NaN 12.0 26.0 B NaN NaN NaN NaN 70.0 13.0 C NaN NaN NaN NaN 143.0 130.0 D NaN NaN NaN NaN 73.0 115.0 E NaN NaN NaN NaN 145.0 2.0 F NaN NaN NaN NaN 54.0 49.0 G NaN NaN NaN NaN 61.0 119.0 H NaN NaN NaN NaN 105.0 137.0 I NaN NaN NaN NaN 114.0 114.0 J NaN NaN NaN NaN 85.0 49.0
Join SQL风格合并
df1 = pd. DataFrame( data= { 'name' : [ 'softpo' , 'brandon' , 'ella' , 'daniel' , '张三' ] ,
'height' : [ 175 , 180 , 169 , 177 , 168 ] } )
df2 = pd. DataFrame( data= { 'name' : [ 'softpo' , 'brandon' , 'ella' , 'daniel' , '张三' ] ,
'weigth' : [ 70 , 65 , 704 , 653 , 88 ] } )
df3 = pd. DataFrame( data= { '名字' : [ 'softpo' , 'brandon' , 'ella' , 'daniel' , '张三' ] ,
'salary' : np. random. randint( 20 , 100 , size= 5 ) } )
display( df1, df2, df3)
name height 0 softpo 175 1 brandon 180 2 ella 169 3 daniel 177 4 张三 168
name weigth 0 softpo 70 1 brandon 65 2 ella 704 3 daniel 653 4 张三 88
名字 salary 0 softpo 74 1 brandon 79 2 ella 39 3 daniel 97 4 张三 62
pd. concat( [ df1, df2] , axis = 1 )
name height name weigth 0 softpo 175 softpo 70 1 brandon 180 brandon 65 2 ella 169 ella 704 3 daniel 177 daniel 653 4 张三 168 张三 88
pd. merge( df1, df2, how= 'inner' )
name height weigth 0 softpo 175 70 1 brandon 180 65 2 ella 169 704 3 daniel 177 653 4 张三 168 88
pd. merge( df1, df2, how= 'outer' )
name height weigth 0 softpo 175 70 1 brandon 180 65 2 ella 169 704 3 daniel 177 653 4 张三 168 88
pd. merge( df1, df2, how= 'left' )
name height weigth 0 softpo 175 70 1 brandon 180 65 2 ella 169 704 3 daniel 177 653 4 张三 168 88
pd. merge( df1, df3, left_on= 'name' , right_on= '名字' )
name height 名字 salary 0 softpo 175 softpo 74 1 brandon 180 brandon 79 2 ella 169 ella 39 3 daniel 177 daniel 97 4 张三 168 张三 62
df4 = pd. DataFrame( data= np. random. randint( 0 , 151 , size= ( 10 , 3 ) ) ,
columns= [ 'python' , 'math' , 'en' ] ,
index= list ( 'ABCDEFGHIJ' ) )
df4
python math en A 83 7 108 B 34 48 128 C 92 99 132 D 112 126 98 E 107 5 83 F 77 33 136 G 58 39 12 H 67 109 69 I 115 29 48 J 19 55 92
score_mean = df4. mean( axis= 1 ) . round ( 1 )
score_mean
A 66.0
B 70.0
C 107.7
D 112.0
E 65.0
F 82.0
G 36.3
H 81.7
I 64.0
J 55.3
dtype: float64
df4. insert( loc= 3 , column= '平均分' , value= score_mean)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [32], in <cell line: 1>()
----> 1 df4.insert(loc=3,column='平均分',value=score_mean)
2 df4
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4428 raise ValueError(
4429 "Cannot specify 'allow_duplicates=True' when "
4430 "'self.flags.allows_duplicate_labels' is False."
4431 )
4432 if not allow_duplicates and column in self.columns:
4433 # Should this be a different kind of error??
-> 4434 raise ValueError(f"cannot insert {column}, already exists")
4435 if not isinstance(loc, int):
4436 raise TypeError("loc must be int")
ValueError: cannot insert 平均分, already exists
df4
python math 平均分 en A 83 7 66.0 108 B 34 48 70.0 128 C 92 99 107.7 132 D 112 126 112.0 98 E 107 5 65.0 83 F 77 33 82.0 136 G 58 39 36.3 12 H 67 109 81.7 69 I 115 29 64.0 48 J 19 55 55.3 92
df5 = df4. iloc[ : , [ 0 , 1 , 3 ] ]
df5
python math en A 83 7 108 B 34 48 128 C 92 99 132 D 112 126 98 E 107 5 83 F 77 33 136 G 58 39 12 H 67 109 69 I 115 29 48 J 19 55 92
score_mean. name= '平均分'
score_mean
A 66.0
B 70.0
C 107.7
D 112.0
E 65.0
F 82.0
G 36.3
H 81.7
I 64.0
J 55.3
Name: 平均分, dtype: float64
df5
python math en A 83 7 108 B 34 48 128 C 92 99 132 D 112 126 98 E 107 5 83 F 77 33 136 G 58 39 12 H 67 109 69 I 115 29 48 J 19 55 92
pd. merge( df5,
score_mean,
left_index = True ,
right_index = True )
python math en 平均分 A 83 7 108 66.0 B 34 48 128 70.0 C 92 99 132 107.7 D 112 126 98 112.0 E 107 5 83 65.0 F 77 33 136 82.0 G 58 39 12 36.3 H 67 109 69 81.7 I 115 29 48 64.0 J 19 55 92 55.3
数据清洗
df = pd. DataFrame( data = { 'color' : [ 'red' , 'blue' , 'red' , 'green' , 'green' , 'blue' , None , np. NaN, 'green' ] ,
'price' : [ 20 , 15 , 20 , 18 , 18 , 22 , 30 , 30 , 22 ] } )
df
color price 0 red 20 1 blue 15 2 red 20 3 green 18 4 green 18 5 blue 22 6 None 30 7 NaN 30 8 green 22
df. drop_duplicates( )
color price 0 red 20 1 blue 15 3 green 18 5 blue 22 6 None 30 8 green 22
df. dropna( )
color price 0 red 20 1 blue 15 2 red 20 3 green 18 4 green 18 5 blue 22 8 green 22
df. drop( labels= [ 2 , 4 , 6 , 8 ] )
color price 0 red 20 1 blue 15 3 green 18 5 blue 22 7 NaN 30
df. drop( labels= [ 'color' ] , axis = 1 )
price 0 20 1 15 2 20 3 18 4 18 5 22 6 30 7 30 8 22
df. filter ( items= [ 'price' ] )
price 0 20 1 15 2 20 3 18 4 18 5 22 6 30 7 30 8 22
df
color price 0 red 20 1 blue 15 2 red 20 3 green 18 4 green 18 5 blue 22 6 None 30 7 NaN 30 8 green 22
df[ 'size' ] = 1024
df
color price size 0 red 20 1024 1 blue 15 1024 2 red 20 1024 3 green 18 1024 4 green 18 1024 5 blue 22 1024 6 None 30 1024 7 NaN 30 1024 8 green 22 1024
df. filter ( like = 'i' )
price size 0 20 1024 1 15 1024 2 20 1024 3 18 1024 4 18 1024 5 22 1024 6 30 1024 7 30 1024 8 22 1024
df. filter ( regex = 'e$' )
price size 0 20 1024 1 15 1024 2 20 1024 3 18 1024 4 18 1024 5 22 1024 6 30 1024 7 30 1024 8 22 1024
a = np. random. randint( 1 , 200 , size = 1000 )
a
array([ 95, 17, 106, 14, 77, 43, 35, 82, 169, 84, 135, 61, 107,
10, 57, 88, 181, 147, 160, 141, 99, 190, 148, 101, 33, 150,
37, 16, 90, 104, 56, 179, 7, 120, 172, 180, 161, 76, 175,
137, 33, 52, 157, 62, 49, 46, 49, 78, 110, 97, 188, 87,
185, 45, 106, 29, 173, 188, 71, 195, 84, 47, 13, 68, 35,
39, 79, 86, 67, 132, 58, 27, 48, 118, 190, 60, 66, 118,
36, 76, 109, 130, 75, 38, 190, 158, 45, 153, 136, 20, 30,
106, 136, 83, 182, 170, 51, 118, 95, 50, 77, 135, 65, 196,
129, 183, 115, 77, 27, 106, 24, 67, 56, 150, 139, 100, 6,
160, 36, 184, 191, 19, 189, 15, 163, 140, 123, 151, 86, 39,
39, 197, 193, 122, 62, 21, 25, 182, 29, 101, 154, 23, 195,
68, 174, 148, 154, 199, 33, 168, 45, 162, 90, 152, 182, 65,
199, 51, 83, 75, 197, 72, 159, 123, 152, 4, 175, 187, 77,
53, 136, 35, 106, 98, 135, 57, 189, 84, 4, 108, 20, 41,
7, 16, 45, 168, 156, 114, 32, 147, 24, 105, 198, 57, 16,
192, 128, 37, 156, 53, 44, 142, 160, 198, 154, 75, 109, 22,
16, 12, 80, 146, 153, 144, 192, 99, 97, 199, 111, 27, 59,
127, 112, 160, 166, 198, 41, 196, 170, 38, 147, 64, 199, 93,
1, 152, 150, 132, 181, 57, 56, 58, 196, 171, 163, 150, 8,
197, 38, 135, 176, 196, 19, 185, 29, 9, 15, 101, 177, 126,
7, 197, 58, 56, 149, 182, 172, 60, 106, 130, 174, 198, 185,
6, 124, 53, 95, 92, 57, 78, 18, 192, 93, 190, 38, 111,
14, 94, 47, 180, 79, 86, 37, 44, 15, 166, 42, 124, 137,
152, 173, 180, 142, 19, 110, 21, 7, 83, 3, 158, 21, 112,
183, 65, 193, 84, 171, 106, 100, 118, 43, 183, 142, 143, 188,
109, 136, 59, 84, 28, 72, 1, 73, 166, 104, 73, 16, 28,
55, 29, 76, 13, 110, 83, 134, 23, 160, 195, 175, 47, 27,
148, 100, 96, 169, 64, 45, 162, 113, 52, 45, 170, 127, 91,
141, 13, 128, 121, 167, 103, 66, 134, 80, 46, 171, 91, 53,
75, 7, 190, 35, 48, 164, 130, 104, 16, 31, 181, 68, 162,
1, 95, 174, 163, 183, 78, 1, 103, 12, 41, 50, 125, 49,
4, 61, 81, 142, 157, 119, 11, 70, 121, 191, 28, 140, 63,
144, 190, 51, 184, 111, 16, 3, 150, 1, 69, 79, 143, 16,
136, 90, 27, 107, 78, 123, 4, 18, 36, 80, 32, 103, 19,
79, 181, 83, 167, 20, 29, 150, 92, 26, 123, 57, 39, 115,
69, 75, 44, 60, 88, 99, 191, 191, 82, 122, 156, 164, 41,
150, 3, 184, 15, 104, 144, 77, 49, 157, 129, 29, 56, 188,
113, 171, 118, 63, 132, 177, 27, 8, 164, 43, 22, 78, 89,
188, 168, 148, 140, 125, 140, 97, 21, 185, 4, 91, 176, 93,
165, 81, 124, 144, 137, 86, 133, 45, 193, 86, 158, 91, 137,
100, 55, 95, 33, 30, 179, 3, 77, 26, 113, 197, 49, 105,
3, 160, 139, 106, 94, 5, 16, 104, 145, 77, 16, 14, 98,
137, 196, 53, 105, 54, 157, 66, 23, 11, 197, 140, 138, 61,
155, 138, 187, 138, 175, 143, 66, 79, 25, 26, 165, 49, 53,
128, 2, 4, 194, 49, 17, 4, 89, 107, 75, 99, 162, 166,
173, 122, 87, 159, 75, 107, 143, 85, 41, 108, 180, 120, 3,
29, 16, 3, 186, 86, 135, 89, 61, 172, 74, 178, 38, 49,
61, 93, 96, 114, 170, 110, 136, 40, 72, 102, 186, 120, 147,
158, 128, 71, 110, 59, 87, 99, 171, 3, 55, 163, 181, 188,
80, 194, 75, 38, 173, 137, 97, 171, 113, 132, 165, 20, 37,
141, 124, 40, 98, 38, 61, 136, 3, 47, 1, 117, 143, 190,
162, 181, 115, 181, 100, 92, 180, 165, 26, 183, 65, 43, 64,
102, 120, 102, 100, 38, 9, 49, 95, 126, 185, 18, 146, 64,
47, 151, 191, 113, 64, 119, 49, 51, 25, 6, 49, 171, 98,
112, 134, 15, 135, 144, 7, 69, 113, 89, 190, 179, 129, 182,
9, 157, 55, 95, 181, 44, 144, 193, 109, 3, 183, 48, 194,
80, 39, 173, 194, 195, 55, 156, 158, 29, 21, 119, 77, 142,
97, 41, 50, 14, 70, 197, 43, 93, 51, 190, 105, 130, 162,
33, 153, 190, 90, 123, 144, 71, 80, 138, 167, 128, 176, 95,
37, 148, 193, 169, 175, 131, 198, 188, 39, 26, 178, 79, 50,
190, 148, 156, 56, 3, 10, 112, 136, 135, 16, 54, 16, 105,
102, 197, 96, 30, 80, 47, 144, 150, 3, 96, 39, 72, 128,
146, 96, 156, 37, 33, 123, 180, 103, 82, 26, 144, 9, 81,
144, 163, 53, 128, 133, 160, 104, 135, 6, 92, 188, 112, 57,
120, 182, 151, 95, 73, 19, 137, 25, 179, 150, 44, 162, 30,
165, 108, 43, 16, 84, 195, 3, 61, 59, 15, 5, 185, 143,
131, 28, 61, 133, 51, 93, 83, 51, 39, 132, 195, 93, 152,
90, 68, 166, 44, 38, 50, 129, 74, 111, 139, 79, 97, 135,
57, 139, 103, 75, 33, 176, 27, 147, 114, 76, 2, 21, 1,
187, 58, 191, 139, 171, 146, 189, 184, 28, 74, 173, 57, 78,
166, 82, 115, 78, 140, 171, 45, 86, 20, 101, 19, 169, 7,
130, 182, 164, 54, 175, 188, 44, 2, 31, 171, 135, 110, 113,
152, 45, 98, 82, 48, 158, 4, 31, 193, 123, 69, 48, 49,
169, 3, 105, 178, 45, 59, 34, 120, 179, 163, 154, 163, 62,
2, 4, 34, 71, 144, 55, 93, 100, 168, 144, 19, 161, 156,
115, 76, 126, 48, 92, 137, 129, 142, 83, 90, 192, 76, 162,
54, 92, 31, 62, 32, 126, 72, 41, 38, 172, 93, 99])
cond = ( a <= 800 ) & ( a >= 100 )
a[ cond]
array([106, 169, 135, 107, 181, 147, 160, 141, 190, 148, 101, 150, 104,
179, 120, 172, 180, 161, 175, 137, 157, 110, 188, 185, 106, 173,
188, 195, 132, 118, 190, 118, 109, 130, 190, 158, 153, 136, 106,
136, 182, 170, 118, 135, 196, 129, 183, 115, 106, 150, 139, 100,
160, 184, 191, 189, 163, 140, 123, 151, 197, 193, 122, 182, 101,
154, 195, 174, 148, 154, 199, 168, 162, 152, 182, 199, 197, 159,
123, 152, 175, 187, 136, 106, 135, 189, 108, 168, 156, 114, 147,
105, 198, 192, 128, 156, 142, 160, 198, 154, 109, 146, 153, 144,
192, 199, 111, 127, 112, 160, 166, 198, 196, 170, 147, 199, 152,
150, 132, 181, 196, 171, 163, 150, 197, 135, 176, 196, 185, 101,
177, 126, 197, 149, 182, 172, 106, 130, 174, 198, 185, 124, 192,
190, 111, 180, 166, 124, 137, 152, 173, 180, 142, 110, 158, 112,
183, 193, 171, 106, 100, 118, 183, 142, 143, 188, 109, 136, 166,
104, 110, 134, 160, 195, 175, 148, 100, 169, 162, 113, 170, 127,
141, 128, 121, 167, 103, 134, 171, 190, 164, 130, 104, 181, 162,
174, 163, 183, 103, 125, 142, 157, 119, 121, 191, 140, 144, 190,
184, 111, 150, 143, 136, 107, 123, 103, 181, 167, 150, 123, 115,
191, 191, 122, 156, 164, 150, 184, 104, 144, 157, 129, 188, 113,
171, 118, 132, 177, 164, 188, 168, 148, 140, 125, 140, 185, 176,
165, 124, 144, 137, 133, 193, 158, 137, 100, 179, 113, 197, 105,
160, 139, 106, 104, 145, 137, 196, 105, 157, 197, 140, 138, 155,
138, 187, 138, 175, 143, 165, 128, 194, 107, 162, 166, 173, 122,
159, 107, 143, 108, 180, 120, 186, 135, 172, 178, 114, 170, 110,
136, 102, 186, 120, 147, 158, 128, 110, 171, 163, 181, 188, 194,
173, 137, 171, 113, 132, 165, 141, 124, 136, 117, 143, 190, 162,
181, 115, 181, 100, 180, 165, 183, 102, 120, 102, 100, 126, 185,
146, 151, 191, 113, 119, 171, 112, 134, 135, 144, 113, 190, 179,
129, 182, 157, 181, 144, 193, 109, 183, 194, 173, 194, 195, 156,
158, 119, 142, 197, 190, 105, 130, 162, 153, 190, 123, 144, 138,
167, 128, 176, 148, 193, 169, 175, 131, 198, 188, 178, 190, 148,
156, 112, 136, 135, 105, 102, 197, 144, 150, 128, 146, 156, 123,
180, 103, 144, 144, 163, 128, 133, 160, 104, 135, 188, 112, 120,
182, 151, 137, 179, 150, 162, 165, 108, 195, 185, 143, 131, 133,
132, 195, 152, 166, 129, 111, 139, 135, 139, 103, 176, 147, 114,
187, 191, 139, 171, 146, 189, 184, 173, 166, 115, 140, 171, 101,
169, 130, 182, 164, 175, 188, 171, 135, 110, 113, 152, 158, 193,
123, 169, 105, 178, 120, 179, 163, 154, 163, 144, 100, 168, 144,
161, 156, 115, 126, 137, 129, 142, 192, 162, 126, 172])
b = np. random. randn( 100000 )
b
array([ 9.30519601e-01, -3.44360662e-01, 6.65514977e-05, ...,
-2.30123357e+00, 2.17474374e-01, 2.86950376e-01])
cond = np. abs ( b) > 3 * 1
b[ cond]
array([-3.113988 , -3.43316674, -3.28413448, -4.18671551, 3.01815086,
-3.28350433, 3.39229337, -3.8349722 , 3.16620524, -3.01368522,
-3.15619888, -3.02706183, -3.20469248, -3.05930093, -3.25541995,
-3.16289873, 3.16185476, -3.04808409, 3.54344272, -3.2585841 ,
3.08115957, -3.17921412, 3.30126439, -3.18372694, -3.06651084,
3.58377405, -3.07022682, 3.16360384, -3.34376458, -3.81579603,
-3.99645404, 3.58753363, 3.35206183, -3.74457891, -3.33198327,
-3.33993979, -3.17078523, 3.02754635, -3.17691131, -3.12715306,
-3.38487724, 3.3550324 , 3.5759783 , 3.48937298, 3.36398366,
-3.53495624, -3.5156431 , 3.32001882, -3.12741237, 3.02816144,
-3.13263251, 3.76773434, -3.3346886 , 3.40044899, 3.29585109,
3.01097665, -3.06049384, 3.14896826, 3.11488174, 3.15971499,
-3.63196786, 3.21199513, 3.06839176, 3.06794524, 3.06776471,
3.24618032, 3.67058793, 3.00866597, 3.66160146, -3.19700386,
-3.09563525, -3.2564589 , 3.34902208, 3.33267903, 3.46037542,
3.3830624 , -3.10813257, -3.27485335, -3.09740389, 3.0953684 ,
3.53897655, -3.21084279, 3.54120948, 3.15640238, -3.14993374,
3.03442456, -3.10751423, 3.28260234, 3.19101812, 3.15943105,
3.25920238, -3.03706885, -3.30509468, -3.6464602 , 3.26733257,
3.01876074, 3.01606638, 3.21088046, 3.65200452, -3.08139884,
-3.07168219, -3.18855195, -3.06266893, 3.01535252, -3.03137453,
3.12828081, 3.03530021, 3.60066559, -3.10464509, 3.30509352,
3.37102098, 3.75526796, 3.09714856, 3.08665134, -3.4107222 ,
-3.26908219, -3.18781645, 3.63833634, -3.008857 , 3.72641975,
-3.01805091, -3.11050402, -3.31183682, -3.29071569, -3.19408484,
3.1183616 , 3.02554176, 3.21680547, -3.12981011, -3.73713851,
3.24781689, -3.25066575, 3.13496805, 3.17233319, -3.2934357 ,
3.10123582, -3.15880969, -3.74877105, 3.09063324, 3.63149354,
-3.19359058, -3.00209698, 3.15207181, -3.03289971, 3.10833683,
3.72043919, 3.19074043, 3.25338012, -3.02472805, 3.06074732,
3.07540501, -3.73763831, 3.45588068, -3.14578358, -3.24339571,
-3.59522797, -3.53748643, -3.06930718, -3.83786393, 3.23984478,
-3.17985666, -3.16852988, -3.31971537, -3.2654107 , -3.12220388,
3.88867571, 3.08592098, -3.53951333, -3.27504561, 3.30145401,
3.00455928, -3.00767388, 3.00140698, 3.1611202 , -3.28482905,
-4.00614279, -3.32878319, -3.01924921, -4.09578766, 3.09062415,
3.02481019, -3.47085658, 3.13053125, -3.12608597, -3.29770516,
-3.1034331 , 3.03759591, 3.10319671, -3.23967661, -3.08915652,
-3.10352916, 3.06012689, 3.40541064, -3.20405873, 3.46436661,
3.09252034, 3.76867128, 3.08367434, -3.05931862, 3.46965336,
3.22184177, 3.28171723, -3.21577232, 3.20512585, -3.2546693 ,
3.18373904, -3.22062146, -3.01107575, -3.33173298, -3.74574667,
-3.76695662, 3.21289624, 3.46063306, -3.09351324, 3.69547389,
3.07973201, 3.00144524, -3.97588352, 3.15801451, -3.91660008,
-3.00728368, -3.31921622, -3.07473275, -3.18439241, 3.04325623,
-3.11538206, 3.22550755, 3.08726614, 3.1049817 , 3.04386415,
3.17139888, -3.33516451, 3.26625014, 3.25838664, -3.13951313,
3.04181862, -3.01509484, -3.10914859, 3.12164542, -3.44755551,
3.18598457, 3.96503764, -3.09481686, 3.11598421, 3.05800474,
3.04178915, -3.0661321 , -3.13813002, -3.23868465, -3.27279316,
-3.19480029, -3.17663568, 3.06456787, -3.60160737, 3.4677848 ,
3.21025512, -3.11477108, 3.43266798, 3.21094261, -3.0864423 ,
3.09832142, -3.00932592, 3.02609391, 3.28907182, 3.93589522,
-3.25151322, 3.19567292, 3.36226978, 3.57918895, 3.01461819,
3.3557285 , 3.33894176, -3.11794324, -3.66518483, 3.06824796,
3.29518237, -3.28611762, -3.28105128, 3.1483141 , 3.1262014 ,
3.27454636, -3.13272194, 3.12399007, -3.52308132, 3.63540754,
3.29869533, 3.00378091, 3.08858567, 3.6191856 ])
数据转换
轴和元素转换
df = pd. DataFrame( np. random. randint( 0 , 10 , size= ( 10 , 3 ) ) ,
columns = [ 'python' , 'tensorflow' , 'keras' ] ,
index = list ( 'ABCDEFGHIJ' ) )
df. T
A B C D E F G H I J python 4 4 7 4 8 0 5 8 4 2 tensorflow 9 5 8 7 6 1 5 1 7 0 keras 3 1 5 1 7 7 7 6 9 9
df. rename( index= { 'A' : 'X' , 'K' : 'Y' } ,
columns = { 'python' : '人工智能' } ,
inplace = False )
人工智能 tensorflow keras X 4 9 3 B 4 5 1 C 7 8 5 D 4 7 1 E 8 6 7 F 0 1 7 G 5 5 7 H 8 1 6 I 4 7 9 J 2 0 9
df. replace( 5 ,
50 ,
inplace = False )
python tensorflow keras A 4 9 3 B 4 50 1 C 7 8 50 D 4 7 1 E 8 6 7 F 0 1 7 G 50 50 7 H 8 1 6 I 4 7 9 J 2 0 9
df. replace( [ 2 , 7 ] ,
1024 ,
inplace= False )
python tensorflow keras A 4 9 3 B 4 5 1 C 1024 8 5 D 4 1024 1 E 8 6 1024 F 0 1 1024 G 5 5 1024 H 8 1 6 I 4 1024 9 J 1024 0 9
df. iloc[ 4 , 2 ] = np. NaN
df
python tensorflow keras A 4 9 3.0 B 4 5 1.0 C 7 8 5.0 D 4 7 1.0 E 8 6 NaN F 0 1 7.0 G 5 5 7.0 H 8 1 6.0 I 4 7 9.0 J 2 0 9.0
df. replace( { 0 : 2048 , np. NaN: - 100 } ,
inplace = True )
df
python tensorflow keras A 4 9 3.0 B 4 5 1.0 C 7 8 5.0 D 4 7 1.0 E 8 6 -100.0 F 2048 1 7.0 G 5 5 7.0 H 8 1 6.0 I 4 7 9.0 J 2 2048 9.0
df. replace( { 'tensorflow' : 1024 } ,
2048 )
python tensorflow keras A 4 9 3.0 B 4 5 1.0 C 7 8 5.0 D 4 7 1.0 E 8 6 -100.0 F 2048 1 7.0 G 5 5 7.0 H 8 1 6.0 I 4 7 9.0 J 2 2048 9.0
map映射元素转变
df[ '人工智能' ] = np. random. choice( [ 1024 , 2048 , 6 ] ,
10 ,
p= [ 1 / 3 , 1 / 3 , 1 / 3 ] )
df
python tensorflow keras 人工智能 A 4 9 3.0 6 B 4 5 1.0 2048 C 7 8 5.0 2048 D 4 7 1.0 1024 E 8 6 -100.0 2048 F 2048 1 7.0 1024 G 5 5 7.0 2048 H 8 1 6.0 1024 I 4 7 9.0 6 J 2 2048 9.0 2048
df[ '人工智能' ] . map ( { 1024 : 3.14 , 2048 : 2.718 , 6 : 1108 } )
A 1108.000
B 2.718
C 2.718
D 3.140
E 2.718
F 3.140
G 2.718
H 3.140
I 1108.000
J 2.718
Name: 人工智能, dtype: float64
df[ 'keras' ] . map ( lambda x: True if x > 0 else False )
A True
B True
C True
D True
E False
F True
G True
H True
I True
J True
Name: keras, dtype: bool
def convert ( x) :
if x >= 5 :
return True
else :
return False
df[ 'level' ] = df[ 'tensorflow' ] . map ( convert)
df
python tensorflow keras 人工智能 level A 4 9 3.0 6 True B 4 5 1.0 2048 True C 7 8 5.0 2048 True D 4 7 1.0 1024 True E 8 6 -100.0 2048 True F 2048 1 7.0 1024 False G 5 5 7.0 2048 True H 8 1 6.0 1024 False I 4 7 9.0 6 True J 2 2048 9.0 2048 True
apply映射元素转变
df[ '人工智能' ] . apply ( lambda x: x+ 100 )
A 106
B 2148
C 2148
D 1124
E 2148
F 1124
G 2148
H 1124
I 106
J 2148
Name: 人工智能, dtype: int64
df[ 'level' ] . apply ( lambda x: 1 if x else 0 )
A 1
B 1
C 1
D 1
E 1
F 0
G 1
H 0
I 1
J 1
Name: level, dtype: int64
df. apply ( lambda x: x+ 1000 )
python tensorflow keras 人工智能 level A 1004 1009 1003.0 1006 1001 B 1004 1005 1001.0 3048 1001 C 1007 1008 1005.0 3048 1001 D 1004 1007 1001.0 2024 1001 E 1008 1006 900.0 3048 1001 F 3048 1001 1007.0 2024 1000 G 1005 1005 1007.0 3048 1001 H 1008 1001 1006.0 2024 1000 I 1004 1007 1009.0 1006 1001 J 1002 3048 1009.0 3048 1001
def convert ( x) :
return ( x. median( ) ,
x. count( ) ,
x. min ( ) ,
x. max ( ) ,
x. std( )
)
df. apply ( convert) . round ( 1 )
python tensorflow keras 人工智能 level 0 4.5 6.5 5.5 1536.0 1.0 1 10.0 10.0 10.0 10.0 10 2 2.0 1.0 -100.0 6.0 False 3 2048.0 2048.0 9.0 2048.0 True 4 646.0 645.9 33.4 840.9 0.421637
df. apply ( convert, axis = 1 )
A (4.0, 5, True, 9, 3.0495901363953815)
B (4.0, 5, 1.0, 2048, 914.6653486385062)
C (7.0, 5, True, 2048, 913.5495060477018)
D (4.0, 5, 1.0, 1024, 456.5000547645093)
E (6.0, 5, -100.0, 2048, 926.5164866315116)
F (7.0, 5, False, 2048, 914.555903157374)
G (5.0, 5, True, 2048, 913.8835812071469)
H (6.0, 5, False, 1024, 456.2819303895345)
I (6.0, 5, True, 9.0, 3.0495901363953815)
J (9.0, 5, True, 2048, 1119.5491503279345)
dtype: object
transform元素转变
df = pd. DataFrame( np. random. randint( 0 , 10 , size= ( 10 , 3 ) ) ,
columns= [ 'python' , 'tensorflow' , 'keras' ] ,
index= list ( 'ABCDEFGHIJ' ) )
df
python tensorflow keras A 4 9 4 B 9 4 9 C 4 4 6 D 3 1 9 E 1 3 5 F 5 6 7 G 4 2 3 H 1 1 2 I 7 6 3 J 4 2 9
df[ 'python' ] . transform( lambda x: 1024 if x > 5 else - 1024 )
A -1024
B 1024
C -1024
D -1024
E -1024
F -1024
G -1024
H -1024
I 1024
J -1024
Name: python, dtype: int64
df[ 'tensorflow' ] . transform( [ np. sqrt,
np. square,
np. cumsum]
)
sqrt square cumsum A 3.000000 81 9 B 2.000000 16 13 C 2.000000 16 17 D 1.000000 1 18 E 1.732051 9 21 F 2.449490 36 27 G 1.414214 4 29 H 1.000000 1 30 I 2.449490 36 36 J 1.414214 4 38
def convert ( x) :
if x> 5 :
return True
else :
return False
df. transform( { 'python' : np. cumsum,
'tensorflow' : np. square,
'keras' : convert} )
python tensorflow keras A 4 81 False B 13 16 True C 17 16 True D 20 1 True E 21 9 False F 26 36 True G 30 4 False H 31 1 False I 38 36 False J 42 4 True
重排随机抽样哑变量
df
python tensorflow keras A 4 9 4 B 9 4 9 C 4 4 6 D 3 1 9 E 1 3 5 F 5 6 7 G 4 2 3 H 1 1 2 I 7 6 3 J 4 2 9
index = np. random. permutation( 10 )
index
array([0, 4, 6, 2, 7, 8, 5, 3, 1, 9])
df. take( index)
python tensorflow keras A 4 9 4 E 1 3 5 G 4 2 3 C 4 4 6 H 1 1 2 I 7 6 3 F 5 6 7 D 3 1 9 B 9 4 9 J 4 2 9
df. take( np. random. randint( 0 , 10 , size= 20 ) )
python tensorflow keras B 9 4 9 I 7 6 3 F 5 6 7 H 1 1 2 F 5 6 7 E 1 3 5 J 4 2 9 B 9 4 9 E 1 3 5 A 4 9 4 C 4 4 6 H 1 1 2 B 9 4 9 J 4 2 9 F 5 6 7 B 9 4 9 C 4 4 6 G 4 2 3 F 5 6 7 H 1 1 2
df2 = pd. DataFrame( data= { 'key' : [ 'a' , 'a' , 'b' , 'b' , 'c' , 'b' , 'c' ] } )
df2
pd. get_dummies( df2,
prefix= '' ,
prefix_sep= '' )
a b c 0 1 0 0 1 1 0 0 2 0 1 0 3 0 1 0 4 0 0 1 5 0 1 0 6 0 0 1
数据重塑
df. T
A B C D E F G H I J python 4 9 4 3 1 5 4 1 7 4 tensorflow 9 4 4 1 3 6 2 1 6 2 keras 4 9 6 9 5 7 3 2 3 9
df2 = pd. DataFrame( np. random. randint( 0 , 10 , size= ( 20 , 3 ) ) ,
columns= [ 'python' , 'math' , 'en' ] ,
index= pd. MultiIndex. from_product( [ list ( 'ABCDEFGHIJ' ) ,
[ '期中' , '期末' ] ] ) )
df2
python math en A 期中 9 7 8 期末 9 2 2 B 期中 1 1 5 期末 5 6 4 C 期中 5 8 5 期末 5 6 9 D 期中 4 1 2 期末 0 1 7 E 期中 4 3 2 期末 4 6 6 F 期中 0 1 0 期末 9 7 3 G 期中 9 9 5 期末 6 9 5 H 期中 4 5 2 期末 8 1 1 I 期中 3 5 4 期末 5 4 8 J 期中 8 4 3 期末 9 7 8
df2. unstack( level = 1 )
python math en 期中 期末 期中 期末 期中 期末 A 9 9 7 2 8 2 B 1 5 1 6 5 4 C 5 5 8 6 5 9 D 4 0 1 1 2 7 E 4 4 3 6 2 6 F 0 9 1 7 0 3 G 9 6 9 9 5 5 H 4 8 5 1 2 1 I 3 5 5 4 4 8 J 8 9 4 7 3 8
df2. unstack( level = 0 )
python ... en A B C D E F G H I J ... A B C D E F G H I J 期中 9 1 5 4 4 0 9 4 3 8 ... 8 5 5 2 2 0 5 2 4 3 期末 9 5 5 0 4 9 6 8 5 9 ... 2 4 9 7 6 3 5 1 8 8
2 rows × 30 columns
df2. unstack( level = - 1 )
python math en 期中 期末 期中 期末 期中 期末 A 9 9 7 2 8 2 B 1 5 1 6 5 4 C 5 5 8 6 5 9 D 4 0 1 1 2 7 E 4 4 3 6 2 6 F 0 9 1 7 0 3 G 9 6 9 9 5 5 H 4 8 5 1 2 1 I 3 5 5 4 4 8 J 8 9 4 7 3 8
df2. stack( )
A 期中 python 9
math 7
en 8
期末 python 9
math 2
en 2
B 期中 python 1
math 1
en 5
期末 python 5
math 6
en 4
C 期中 python 5
math 8
en 5
期末 python 5
math 6
en 9
D 期中 python 4
math 1
en 2
期末 python 0
math 1
en 7
E 期中 python 4
math 3
en 2
期末 python 4
math 6
en 6
F 期中 python 0
math 1
en 0
期末 python 9
math 7
en 3
G 期中 python 9
math 9
en 5
期末 python 6
math 9
en 5
H 期中 python 4
math 5
en 2
期末 python 8
math 1
en 1
I 期中 python 3
math 5
en 4
期末 python 5
math 4
en 8
J 期中 python 8
math 4
en 3
期末 python 9
math 7
en 8
dtype: int32
df2. unstack( ) . stack( level = 0 )
期中 期末 A en 8 2 math 7 2 python 9 9 B en 5 4 math 1 6 python 1 5 C en 5 9 math 8 6 python 5 5 D en 2 7 math 1 1 python 4 0 E en 2 6 math 3 6 python 4 4 F en 0 3 math 1 7 python 0 9 G en 5 5 math 9 9 python 9 6 H en 2 1 math 5 1 python 4 8 I en 4 8 math 5 4 python 3 5 J en 3 8 math 4 7 python 8 9
df2. mean( axis = 0 )
python 5.35
math 4.65
en 4.45
dtype: float64
df2. mean( axis = 1 )
A 期中 8.000000
期末 4.333333
B 期中 2.333333
期末 5.000000
C 期中 6.000000
期末 6.666667
D 期中 2.333333
期末 2.666667
E 期中 3.000000
期末 5.333333
F 期中 0.333333
期末 6.333333
G 期中 7.666667
期末 6.666667
H 期中 3.666667
期末 3.333333
I 期中 4.000000
期末 5.666667
J 期中 5.000000
期末 8.000000
dtype: float64
df2. groupby( level= 1 ) . mean( )
python math en 期中 4.7 4.4 3.6 期末 6.0 4.9 5.3
df2. groupby( level= 0 ) . mean( )
python math en A 9.0 4.5 5.0 B 3.0 3.5 4.5 C 5.0 7.0 7.0 D 2.0 1.0 4.5 E 4.0 4.5 4.0 F 4.5 4.0 1.5 G 7.5 9.0 5.0 H 6.0 3.0 1.5 I 4.0 4.5 6.0 J 8.5 5.5 5.5
pandas高级应用
数学和统计方法
简单统计指标
df = pd. DataFrame( np. random. randint( 0 , 10 , size= ( 20 , 3 ) ) ,
columns= [ 'python' , 'math' , 'en' ] ,
index= list ( 'QWERTYUIOPLKJHGFDSAZ' ) )
df
python math en Q 6 1 5 W 3 1 1 E 2 7 3 R 0 2 0 T 6 2 6 Y 5 0 5 U 9 7 5 I 7 7 8 O 5 3 3 P 0 8 2 L 7 0 3 K 4 8 5 J 2 0 7 H 4 6 3 G 5 9 8 F 7 6 1 D 9 1 6 S 8 2 3 A 8 7 4 Z 3 7 1
df. iloc[ 6 , 2 ] = np. NaN
df
python math en Q 6 1 5.0 W 3 1 1.0 E 2 7 3.0 R 0 2 0.0 T 6 2 6.0 Y 5 0 5.0 U 9 7 NaN I 7 7 8.0 O 5 3 3.0 P 0 8 2.0 L 7 0 3.0 K 4 8 5.0 J 2 0 7.0 H 4 6 3.0 G 5 9 8.0 F 7 6 1.0 D 9 1 6.0 S 8 2 3.0 A 8 7 4.0 Z 3 7 1.0
df. count( )
python 20
math 20
en 19
dtype: int64
df. mean( )
python 5.000000
math 4.200000
en 3.894737
dtype: float64
df. median( )
python 5.0
math 4.5
en 3.0
dtype: float64
df. max ( )
python 9.0
math 9.0
en 8.0
dtype: float64
df. min ( )
python 0.0
math 0.0
en 0.0
dtype: float64
df[ 'python' ] . unique( )
array([6, 3, 2, 0, 5, 9, 7, 4, 8])
df[ 'math' ] . value_counts( )
7 5
1 3
2 3
0 3
8 2
6 2
3 1
9 1
Name: math, dtype: int64
df. quantile( q= [ 0 , 0.25 , 0.5 , 0.75 , 1 ] )
python math en 0.00 0.0 0.0 0.0 0.25 3.0 1.0 2.5 0.50 5.0 4.5 3.0 0.75 7.0 7.0 5.5 1.00 9.0 9.0 8.0
df. describe( ) . round ( )
python math en count 20.0 20.0 19.0 mean 5.0 4.0 4.0 std 3.0 3.0 2.0 min 0.0 0.0 0.0 25% 3.0 1.0 2.0 50% 5.0 4.0 3.0 75% 7.0 7.0 6.0 max 9.0 9.0 8.0
索引标签、位置获取
df[ 'python' ] . argmax( )
6
df. idxmax( )
python U
math G
en I
dtype: object
更多统计指标
df. cumsum( )
python math en Q 6 1 5.0 W 9 2 6.0 E 11 9 9.0 R 11 11 9.0 T 17 13 15.0 Y 22 13 20.0 U 31 20 NaN I 38 27 28.0 O 43 30 31.0 P 43 38 33.0 L 50 38 36.0 K 54 46 41.0 J 56 46 48.0 H 60 52 51.0 G 65 61 59.0 F 72 67 60.0 D 81 68 66.0 S 89 70 69.0 A 97 77 73.0 Z 100 84 74.0
df. cumprod( )
python math en Q 6 1 5.0 W 18 1 5.0 E 36 7 15.0 R 0 14 0.0 T 0 28 0.0 Y 0 0 0.0 U 0 0 NaN I 0 0 0.0 O 0 0 0.0 P 0 0 0.0 L 0 0 0.0 K 0 0 0.0 J 0 0 0.0 H 0 0 0.0 G 0 0 0.0 F 0 0 0.0 D 0 0 0.0 S 0 0 0.0 A 0 0 0.0 Z 0 0 0.0
df. cummin( )
python math en Q 6 1 5.0 W 3 1 1.0 E 2 1 1.0 R 0 1 0.0 T 0 1 0.0 Y 0 0 0.0 U 0 0 NaN I 0 0 0.0 O 0 0 0.0 P 0 0 0.0 L 0 0 0.0 K 0 0 0.0 J 0 0 0.0 H 0 0 0.0 G 0 0 0.0 F 0 0 0.0 D 0 0 0.0 S 0 0 0.0 A 0 0 0.0 Z 0 0 0.0
df. std( )
python 2.733804
math 3.221637
en 2.401267
dtype: float64
df. var( )
python 7.473684
math 10.378947
en 5.766082
dtype: float64
df. diff( )
python math en Q NaN NaN NaN W -3.0 0.0 -4.0 E -1.0 6.0 2.0 R -2.0 -5.0 -3.0 T 6.0 0.0 6.0 Y -1.0 -2.0 -1.0 U 4.0 7.0 NaN I -2.0 0.0 NaN O -2.0 -4.0 -5.0 P -5.0 5.0 -1.0 L 7.0 -8.0 1.0 K -3.0 8.0 2.0 J -2.0 -8.0 2.0 H 2.0 6.0 -4.0 G 1.0 3.0 5.0 F 2.0 -3.0 -7.0 D 2.0 -5.0 5.0 S -1.0 1.0 -3.0 A 0.0 5.0 1.0 Z -5.0 0.0 -3.0
df. pct_change( ) . round ( 2 )
python math en Q NaN NaN NaN W -0.50 0.00 -0.80 E -0.33 6.00 2.00 R -1.00 -0.71 -1.00 T inf 0.00 inf Y -0.17 -1.00 -0.17 U 0.80 inf 0.00 I -0.22 0.00 0.60 O -0.29 -0.57 -0.62 P -1.00 1.67 -0.33 L inf -1.00 0.50 K -0.43 inf 0.67 J -0.50 -1.00 0.40 H 1.00 inf -0.57 G 0.25 0.50 1.67 F 0.40 -0.33 -0.88 D 0.29 -0.83 5.00 S -0.11 1.00 -0.50 A 0.00 2.50 0.33 Z -0.62 0.00 -0.75
高级统计指标
df. cov( )
python math en python 7.473684 -0.684211 2.421053 math -0.684211 10.378947 0.172515 en 2.421053 0.172515 5.766082
df. var( )
python 7.473684
math 10.378947
en 5.766082
dtype: float64
df[ 'python' ] . cov( df[ 'math' ] )
-0.6842105263157894
df. corr( )
python math en python 1.000000 -0.077687 0.382359 math -0.077687 1.000000 0.022174 en 0.382359 0.022174 1.000000
df. corrwith( df[ 'en' ] )
python 0.382359
math 0.022174
en 1.000000
dtype: float64
排序
df = pd. DataFrame( np. random. randint( 0 , 20 , size= ( 20 , 3 ) ) ,
columns= [ 'python' , 'math' , 'en' ] ,
index= list ( 'QWERTYUIOPLKJHGDSAZM' ) )
df
python math en Q 9 16 17 W 5 3 5 E 1 0 4 R 0 12 14 T 6 8 3 Y 16 14 0 U 17 17 8 I 5 4 8 O 13 5 12 P 4 5 10 L 15 3 7 K 17 4 10 J 8 14 12 H 10 3 13 G 7 16 16 D 0 3 8 S 15 16 14 A 3 11 11 Z 18 15 4 M 10 14 14
df. sort_index( axis = 0 ,
ascending = False )
python math en Z 18 15 4 Y 16 14 0 W 5 3 5 U 17 17 8 T 6 8 3 S 15 16 14 R 0 12 14 Q 9 16 17 P 4 5 10 O 13 5 12 M 10 14 14 L 15 3 7 K 17 4 10 J 8 14 12 I 5 4 8 H 10 3 13 G 7 16 16 E 1 0 4 D 0 3 8 A 3 11 11
df. sort_values( by= 'python' ,
ascending= True )
python math en R 0 12 14 D 0 3 8 E 1 0 4 A 3 11 11 P 4 5 10 W 5 3 5 I 5 4 8 T 6 8 3 G 7 16 16 J 8 14 12 Q 9 16 17 H 10 3 13 M 10 14 14 O 13 5 12 S 15 16 14 L 15 3 7 Y 16 14 0 K 17 4 10 U 17 17 8 Z 18 15 4
df. sort_values( by= [ 'python' , 'math' ] ,
ascending= True )
python math en D 0 3 8 R 0 12 14 E 1 0 4 A 3 11 11 P 4 5 10 W 5 3 5 I 5 4 8 T 6 8 3 G 7 16 16 J 8 14 12 Q 9 16 17 H 10 3 13 M 10 14 14 O 13 5 12 L 15 3 7 S 15 16 14 Y 16 14 0 K 17 4 10 U 17 17 8 Z 18 15 4
df. nlargest( n= 5 ,
columns= 'python' )
python math en Z 18 15 4 U 17 17 8 K 17 4 10 Y 16 14 0 L 15 3 7
df. nsmallest( n= 5 ,
columns= 'python' )
python math en R 0 12 14 D 0 3 8 E 1 0 4 A 3 11 11 P 4 5 10
分箱操作
df = pd. DataFrame( np. random. randint( 0 , 151 , size= ( 100 , 3 ) ) ,
columns= [ 'python' , 'math' , 'en' ] )
df
python math en 0 127 124 50 1 150 91 130 2 24 113 69 3 90 2 123 4 133 113 60 ... ... ... ... 95 45 134 58 96 74 21 75 97 100 36 93 98 130 21 10 99 20 66 87
100 rows × 3 columns
pd. cut( df. python,
bins = 4 ,
labels= [ '不及格' , '及格' , '中等' , '优秀' ] ,
right= True
)
0 优秀
1 优秀
2 不及格
3 中等
4 优秀
...
95 及格
96 及格
97 中等
98 优秀
99 不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
df[ '等级' ] = pd. cut( df. python,
bins = [ 0 , 30 , 60 , 90 , 120 , 150 ] ,
labels= [ '极差' , '不及格' , '及格' , '中等' , '优秀' ] )
df
python math en 等级 0 127 124 50 优秀 1 150 91 130 优秀 2 24 113 69 极差 3 90 2 123 及格 4 133 113 60 优秀 ... ... ... ... ... 95 45 134 58 不及格 96 74 21 75 及格 97 100 36 93 中等 98 130 21 10 优秀 99 20 66 87 极差
100 rows × 4 columns
pd. qcut( df. python,
q = 4 ,
labels= [ '不及格' , '及格' , '中等' , '优秀' ] ,
)
0 优秀
1 优秀
2 不及格
3 中等
4 优秀
...
95 及格
96 中等
97 优秀
98 优秀
99 不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
pd. qcut( df. python,
q = 4 ,
labels= [ '不及格' , '及格' , '中等' , '优秀' ] ,
) . value_counts( )
不及格 25
及格 25
中等 25
优秀 25
Name: python, dtype: int64
分组聚合
分组
df = pd. DataFrame( data= { 'sex' : np. random. randint( 0 , 2 , size = 300 ) ,
'class' : np. random. randint( 1 , 9 , size = 300 ) ,
'python' : np. random. randint( 0 , 151 , size = 300 ) ,
'math' : np. random. randint( 0 , 151 , size = 300 ) ,
'en' : np. random. randint( 0 , 151 , size = 300 ) } )
df[ 'sex' ] = df[ 'sex' ] . map ( { 0 : '男' , 1 : '女' } )
df
sex class python math en 0 男 8 16 70 85 1 男 1 37 8 106 2 男 4 25 106 63 3 男 3 60 55 101 4 男 2 121 114 14 ... ... ... ... ... ... 295 男 4 33 84 66 296 男 8 142 134 89 297 女 2 150 88 130 298 男 8 62 127 1 299 男 1 98 101 80
300 rows × 5 columns
for name, group in df. groupby( by = 'sex' ) :
print ( name)
print ( group)
女
sex class python math en
6 女 8 102 26 26
11 女 2 94 140 71
12 女 5 109 27 127
18 女 6 65 86 52
21 女 7 147 58 55
.. .. ... ... ... ...
289 女 3 139 62 26
290 女 2 11 52 1
291 女 4 37 14 42
293 女 4 30 21 47
297 女 2 150 88 130
[136 rows x 5 columns]
男
sex class python math en
0 男 8 16 70 85
1 男 1 37 8 106
2 男 4 25 106 63
3 男 3 60 55 101
4 男 2 121 114 14
.. .. ... ... ... ...
294 男 6 124 90 107
295 男 4 33 84 66
296 男 8 142 134 89
298 男 8 62 127 1
299 男 1 98 101 80
[164 rows x 5 columns]
for name, group in df. groupby( by = [ 'sex' , 'class' ] ) :
print ( name)
print ( group)
('女', 1)
sex class python math en
27 女 1 128 67 30
34 女 1 10 116 91
35 女 1 3 22 144
73 女 1 102 143 120
84 女 1 29 23 120
99 女 1 53 77 144
143 女 1 104 84 103
157 女 1 145 77 92
173 女 1 61 118 143
174 女 1 114 26 130
178 女 1 85 21 0
186 女 1 26 46 7
192 女 1 76 115 79
198 女 1 61 132 128
217 女 1 58 53 89
230 女 1 61 104 144
234 女 1 50 150 128
238 女 1 115 103 13
277 女 1 83 17 82
('女', 2)
sex class python math en
11 女 2 94 140 71
37 女 2 9 71 129
47 女 2 31 5 14
79 女 2 49 142 113
82 女 2 105 67 17
122 女 2 23 143 96
142 女 2 142 144 29
147 女 2 53 107 0
163 女 2 66 87 1
171 女 2 104 80 118
206 女 2 16 100 140
222 女 2 38 12 112
226 女 2 118 85 20
242 女 2 1 3 136
260 女 2 50 115 146
268 女 2 70 93 22
272 女 2 71 74 47
290 女 2 11 52 1
297 女 2 150 88 130
('女', 3)
sex class python math en
30 女 3 24 90 78
40 女 3 89 134 99
65 女 3 48 81 73
69 女 3 0 0 66
72 女 3 147 65 95
104 女 3 143 122 54
140 女 3 140 119 97
144 女 3 125 96 14
145 女 3 84 112 42
249 女 3 85 59 122
283 女 3 69 34 27
286 女 3 129 10 73
289 女 3 139 62 26
('女', 4)
sex class python math en
32 女 4 116 5 119
43 女 4 60 96 103
48 女 4 13 147 133
64 女 4 126 140 10
92 女 4 122 49 71
118 女 4 125 34 23
139 女 4 84 58 66
164 女 4 59 64 80
167 女 4 47 109 94
183 女 4 96 14 97
187 女 4 123 26 21
203 女 4 6 74 105
274 女 4 148 10 109
291 女 4 37 14 42
293 女 4 30 21 47
('女', 5)
sex class python math en
12 女 5 109 27 127
25 女 5 6 124 98
55 女 5 78 63 10
70 女 5 30 71 29
90 女 5 20 52 78
152 女 5 60 116 35
240 女 5 49 27 33
264 女 5 137 95 142
275 女 5 101 69 80
276 女 5 49 88 5
('女', 6)
sex class python math en
18 女 6 65 86 52
26 女 6 112 25 89
31 女 6 86 88 92
42 女 6 134 37 111
66 女 6 25 119 113
85 女 6 29 123 36
91 女 6 109 46 87
98 女 6 100 132 10
102 女 6 27 113 107
141 女 6 35 22 144
159 女 6 108 13 136
185 女 6 77 116 103
218 女 6 120 137 45
219 女 6 132 95 42
232 女 6 110 51 47
241 女 6 77 57 40
259 女 6 61 104 58
('女', 7)
sex class python math en
21 女 7 147 58 55
28 女 7 141 69 22
33 女 7 53 70 146
51 女 7 120 64 42
59 女 7 89 30 107
74 女 7 64 145 40
89 女 7 36 100 61
96 女 7 66 82 49
106 女 7 77 65 28
127 女 7 103 9 115
133 女 7 31 138 106
138 女 7 130 33 120
151 女 7 83 82 43
161 女 7 22 127 52
165 女 7 47 51 3
170 女 7 100 23 56
194 女 7 118 146 142
199 女 7 125 132 87
213 女 7 64 46 41
216 女 7 43 99 114
228 女 7 140 91 70
243 女 7 76 103 44
245 女 7 9 89 12
250 女 7 139 81 40
288 女 7 115 148 79
('女', 8)
sex class python math en
6 女 8 102 26 26
88 女 8 19 44 134
114 女 8 133 19 74
124 女 8 91 77 120
130 女 8 141 135 21
166 女 8 3 114 91
168 女 8 136 69 119
172 女 8 69 147 32
177 女 8 37 81 49
195 女 8 66 69 4
224 女 8 18 29 141
233 女 8 117 63 23
239 女 8 68 71 71
248 女 8 31 71 9
252 女 8 141 113 147
267 女 8 125 136 17
271 女 8 102 137 97
279 女 8 137 110 51
('男', 1)
sex class python math en
1 男 1 37 8 106
8 男 1 8 48 113
23 男 1 116 43 13
24 男 1 79 115 123
29 男 1 24 112 126
39 男 1 112 115 133
68 男 1 131 138 80
95 男 1 86 125 5
101 男 1 49 0 67
137 男 1 59 5 62
176 男 1 91 75 68
191 男 1 90 91 15
196 男 1 129 59 9
211 男 1 78 97 13
215 男 1 31 27 66
229 男 1 62 21 28
237 男 1 140 66 126
282 男 1 5 67 12
299 男 1 98 101 80
('男', 2)
sex class python math en
4 男 2 121 114 14
7 男 2 55 125 62
14 男 2 36 142 77
16 男 2 80 143 49
36 男 2 105 101 88
38 男 2 51 55 18
45 男 2 80 17 13
62 男 2 23 35 20
105 男 2 115 106 59
128 男 2 86 18 78
149 男 2 81 34 12
158 男 2 80 24 133
169 男 2 55 100 47
175 男 2 138 87 43
182 男 2 111 62 54
210 男 2 69 128 67
227 男 2 16 34 79
246 男 2 29 29 86
253 男 2 130 117 119
265 男 2 8 21 18
284 男 2 22 66 16
('男', 3)
sex class python math en
3 男 3 60 55 101
5 男 3 85 69 89
9 男 3 69 42 34
13 男 3 91 142 86
17 男 3 36 50 15
50 男 3 116 60 68
54 男 3 34 144 96
58 男 3 102 126 95
61 男 3 127 97 78
77 男 3 16 74 7
81 男 3 130 54 112
86 男 3 92 3 92
93 男 3 51 83 25
103 男 3 118 122 64
108 男 3 41 109 124
109 男 3 84 123 71
111 男 3 15 32 94
117 男 3 18 62 86
126 男 3 8 76 64
129 男 3 145 9 5
204 男 3 134 5 54
207 男 3 61 22 143
212 男 3 66 146 49
251 男 3 132 109 147
254 男 3 10 146 117
280 男 3 89 109 101
287 男 3 24 24 128
('男', 4)
sex class python math en
2 男 4 25 106 63
46 男 4 24 149 108
49 男 4 133 134 64
52 男 4 46 37 134
60 男 4 137 40 45
87 男 4 64 6 19
112 男 4 136 73 114
120 男 4 133 18 63
190 男 4 45 25 111
200 男 4 38 41 92
202 男 4 19 121 72
221 男 4 29 132 83
236 男 4 54 100 53
263 男 4 40 14 48
270 男 4 125 103 132
295 男 4 33 84 66
('男', 5)
sex class python math en
53 男 5 114 92 128
63 男 5 91 105 125
76 男 5 14 18 56
97 男 5 42 43 95
110 男 5 83 20 49
113 男 5 139 46 146
115 男 5 93 49 142
125 男 5 31 85 95
131 男 5 51 18 12
150 男 5 46 90 33
162 男 5 78 123 127
208 男 5 3 130 94
209 男 5 106 92 92
214 男 5 39 67 119
223 男 5 7 102 137
257 男 5 79 44 88
262 男 5 109 34 96
266 男 5 107 72 149
273 男 5 145 2 58
('男', 6)
sex class python math en
15 男 6 147 53 58
20 男 6 18 24 53
116 男 6 124 92 52
119 男 6 117 60 33
134 男 6 36 17 136
135 男 6 66 19 48
148 男 6 102 26 42
154 男 6 12 18 31
155 男 6 36 19 136
181 男 6 48 51 142
184 男 6 72 114 74
188 男 6 136 82 26
189 男 6 120 116 36
193 男 6 42 21 69
201 男 6 144 124 24
225 男 6 107 140 18
235 男 6 83 117 99
294 男 6 124 90 107
('男', 7)
sex class python math en
10 男 7 35 134 96
19 男 7 112 127 64
22 男 7 106 60 8
44 男 7 132 15 124
67 男 7 103 40 93
71 男 7 57 64 150
78 男 7 131 122 136
83 男 7 12 149 83
94 男 7 76 54 118
123 男 7 53 109 50
136 男 7 15 64 51
146 男 7 12 101 65
153 男 7 58 110 107
156 男 7 109 49 138
180 男 7 50 73 86
197 男 7 25 71 74
205 男 7 136 80 50
220 男 7 103 48 80
244 男 7 58 55 99
261 男 7 75 136 83
269 男 7 69 86 149
278 男 7 66 70 81
292 男 7 144 149 11
('男', 8)
sex class python math en
0 男 8 16 70 85
41 男 8 91 118 70
56 男 8 32 30 146
57 男 8 85 129 98
75 男 8 145 99 135
80 男 8 56 94 57
100 男 8 85 100 53
107 男 8 146 133 64
121 男 8 69 21 9
132 男 8 66 41 43
160 男 8 24 125 136
179 男 8 16 44 92
231 男 8 96 6 1
247 男 8 81 139 133
255 男 8 12 13 113
256 男 8 64 101 20
258 男 8 138 27 103
281 男 8 66 23 17
285 男 8 20 118 54
296 男 8 142 134 89
298 男 8 62 127 1
for name, group in df[ [ 'python' , 'math' ] ] . groupby( df[ 'sex' ] ) :
print ( name)
print ( group)
女
python math
6 102 26
11 94 140
12 109 27
18 65 86
21 147 58
.. ... ...
289 139 62
290 11 52
291 37 14
293 30 21
297 150 88
[136 rows x 2 columns]
男
python math
0 16 70
1 37 8
2 25 106
3 60 55
4 121 114
.. ... ...
294 124 90
295 33 84
296 142 134
298 62 127
299 98 101
[164 rows x 2 columns]
for name, group in df[ [ 'python' , 'math' ] ] . groupby( [ df[ 'sex' ] , df[ 'class' ] ] ) :
print ( name)
print ( group)
('女', 1)
python math
27 128 67
34 10 116
35 3 22
73 102 143
84 29 23
99 53 77
143 104 84
157 145 77
173 61 118
174 114 26
178 85 21
186 26 46
192 76 115
198 61 132
217 58 53
230 61 104
234 50 150
238 115 103
277 83 17
('女', 2)
python math
11 94 140
37 9 71
47 31 5
79 49 142
82 105 67
122 23 143
142 142 144
147 53 107
163 66 87
171 104 80
206 16 100
222 38 12
226 118 85
242 1 3
260 50 115
268 70 93
272 71 74
290 11 52
297 150 88
('女', 3)
python math
30 24 90
40 89 134
65 48 81
69 0 0
72 147 65
104 143 122
140 140 119
144 125 96
145 84 112
249 85 59
283 69 34
286 129 10
289 139 62
('女', 4)
python math
32 116 5
43 60 96
48 13 147
64 126 140
92 122 49
118 125 34
139 84 58
164 59 64
167 47 109
183 96 14
187 123 26
203 6 74
274 148 10
291 37 14
293 30 21
('女', 5)
python math
12 109 27
25 6 124
55 78 63
70 30 71
90 20 52
152 60 116
240 49 27
264 137 95
275 101 69
276 49 88
('女', 6)
python math
18 65 86
26 112 25
31 86 88
42 134 37
66 25 119
85 29 123
91 109 46
98 100 132
102 27 113
141 35 22
159 108 13
185 77 116
218 120 137
219 132 95
232 110 51
241 77 57
259 61 104
('女', 7)
python math
21 147 58
28 141 69
33 53 70
51 120 64
59 89 30
74 64 145
89 36 100
96 66 82
106 77 65
127 103 9
133 31 138
138 130 33
151 83 82
161 22 127
165 47 51
170 100 23
194 118 146
199 125 132
213 64 46
216 43 99
228 140 91
243 76 103
245 9 89
250 139 81
288 115 148
('女', 8)
python math
6 102 26
88 19 44
114 133 19
124 91 77
130 141 135
166 3 114
168 136 69
172 69 147
177 37 81
195 66 69
224 18 29
233 117 63
239 68 71
248 31 71
252 141 113
267 125 136
271 102 137
279 137 110
('男', 1)
python math
1 37 8
8 8 48
23 116 43
24 79 115
29 24 112
39 112 115
68 131 138
95 86 125
101 49 0
137 59 5
176 91 75
191 90 91
196 129 59
211 78 97
215 31 27
229 62 21
237 140 66
282 5 67
299 98 101
('男', 2)
python math
4 121 114
7 55 125
14 36 142
16 80 143
36 105 101
38 51 55
45 80 17
62 23 35
105 115 106
128 86 18
149 81 34
158 80 24
169 55 100
175 138 87
182 111 62
210 69 128
227 16 34
246 29 29
253 130 117
265 8 21
284 22 66
('男', 3)
python math
3 60 55
5 85 69
9 69 42
13 91 142
17 36 50
50 116 60
54 34 144
58 102 126
61 127 97
77 16 74
81 130 54
86 92 3
93 51 83
103 118 122
108 41 109
109 84 123
111 15 32
117 18 62
126 8 76
129 145 9
204 134 5
207 61 22
212 66 146
251 132 109
254 10 146
280 89 109
287 24 24
('男', 4)
python math
2 25 106
46 24 149
49 133 134
52 46 37
60 137 40
87 64 6
112 136 73
120 133 18
190 45 25
200 38 41
202 19 121
221 29 132
236 54 100
263 40 14
270 125 103
295 33 84
('男', 5)
python math
53 114 92
63 91 105
76 14 18
97 42 43
110 83 20
113 139 46
115 93 49
125 31 85
131 51 18
150 46 90
162 78 123
208 3 130
209 106 92
214 39 67
223 7 102
257 79 44
262 109 34
266 107 72
273 145 2
('男', 6)
python math
15 147 53
20 18 24
116 124 92
119 117 60
134 36 17
135 66 19
148 102 26
154 12 18
155 36 19
181 48 51
184 72 114
188 136 82
189 120 116
193 42 21
201 144 124
225 107 140
235 83 117
294 124 90
('男', 7)
python math
10 35 134
19 112 127
22 106 60
44 132 15
67 103 40
71 57 64
78 131 122
83 12 149
94 76 54
123 53 109
136 15 64
146 12 101
153 58 110
156 109 49
180 50 73
197 25 71
205 136 80
220 103 48
244 58 55
261 75 136
269 69 86
278 66 70
292 144 149
('男', 8)
python math
0 16 70
41 91 118
56 32 30
57 85 129
75 145 99
80 56 94
100 85 100
107 146 133
121 69 21
132 66 41
160 24 125
179 16 44
231 96 6
247 81 139
255 12 13
256 64 101
258 138 27
281 66 23
285 20 118
296 142 134
298 62 127
df. dtypes
sex object
class int32
python int32
math int32
en int32
dtype: object
for name, group in df. groupby( df. dtypes, axis = 1 ) :
print ( name, group)
int32 class python math en
0 8 16 70 85
1 1 37 8 106
2 4 25 106 63
3 3 60 55 101
4 2 121 114 14
.. ... ... ... ...
295 4 33 84 66
296 8 142 134 89
297 2 150 88 130
298 8 62 127 1
299 1 98 101 80
[300 rows x 4 columns]
object sex
0 男
1 男
2 男
3 男
4 男
.. ..
295 男
296 男
297 女
298 男
299 男
[300 rows x 1 columns]
m = { 'sex' : 'category' , 'class' : 'category' , 'python' : '科目' , 'math' : '科目' , 'en' : '科目' }
for name, group in df. groupby( by = m, axis = 1 ) :
print ( name, group)
category sex class
0 男 8
1 男 1
2 男 4
3 男 3
4 男 2
.. .. ...
295 男 4
296 男 8
297 女 2
298 男 8
299 男 1
[300 rows x 2 columns]
科目 python math en
0 16 70 85
1 37 8 106
2 25 106 63
3 60 55 101
4 121 114 14
.. ... ... ...
295 33 84 66
296 142 134 89
297 150 88 130
298 62 127 1
299 98 101 80
[300 rows x 3 columns]
分组聚合
df. groupby( by= 'sex' ) [ [ 'python' , 'math' ] ] . max ( ) 3 数学和python根据性别分组,quit解的最大值
python math sex 女 150 150 男 147 149
df. groupby( by= [ 'sex' , 'class' ] ) . mean( ) . round ( 1 )
python math en sex class 女 1 71.8 78.6 94.1 2 63.2 84.6 70.6 3 94.0 75.7 66.6 4 79.5 57.4 74.7 5 63.9 73.2 63.7 6 82.8 80.2 77.2 7 85.5 83.2 67.0 8 85.3 83.9 68.1 男 1 75.0 69.1 65.5 2 71.0 74.2 54.9 3 72.4 77.5 79.4 4 67.6 73.9 79.2 5 72.5 64.8 96.9 6 85.2 65.7 65.8 7 75.5 85.5 86.8 8 72.0 80.6 72.3
df. groupby( by= [ 'sex' , 'class' ] ) . size( )
sex class
女 1 19
2 19
3 13
4 15
5 10
6 17
7 25
8 18
男 1 19
2 21
3 27
4 16
5 19
6 18
7 23
8 21
dtype: int64
df. groupby( by= [ 'sex' , 'class' ] ) . describe( ) . round ( 1 )
python math en count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max sex class 女 1 19.0 71.8 39.6 3.0 51.5 61.0 103.0 145.0 19.0 78.6 ... 115.5 150.0 19.0 94.1 48.5 0.0 80.5 103.0 129.0 144.0 2 19.0 63.2 45.2 1.0 27.0 53.0 99.0 150.0 19.0 84.6 ... 111.0 144.0 19.0 70.6 55.8 0.0 18.5 71.0 123.5 146.0 3 13.0 94.0 48.5 0.0 69.0 89.0 139.0 147.0 13.0 75.7 ... 112.0 134.0 13.0 66.6 32.7 14.0 42.0 73.0 95.0 122.0 4 15.0 79.5 46.4 6.0 42.0 84.0 122.5 148.0 15.0 57.4 ... 85.0 147.0 15.0 74.7 38.6 10.0 44.5 80.0 104.0 133.0 5 10.0 63.9 42.0 6.0 34.8 54.5 95.2 137.0 10.0 73.2 ... 93.2 124.0 10.0 63.7 48.4 5.0 30.0 56.5 93.5 142.0 6 17.0 82.8 37.2 25.0 61.0 86.0 110.0 134.0 17.0 80.2 ... 116.0 137.0 17.0 77.2 38.8 10.0 45.0 87.0 107.0 144.0 7 25.0 85.5 41.3 9.0 53.0 83.0 120.0 147.0 25.0 83.2 ... 103.0 148.0 25.0 67.0 39.9 3.0 41.0 55.0 106.0 146.0 8 18.0 85.3 47.8 3.0 44.2 96.5 131.0 141.0 18.0 83.9 ... 113.8 147.0 18.0 68.1 49.0 4.0 23.8 61.0 113.5 147.0 男 1 19.0 75.0 41.5 5.0 43.0 79.0 105.0 140.0 19.0 69.1 ... 106.5 138.0 19.0 65.5 46.4 5.0 14.0 67.0 109.5 133.0 2 21.0 71.0 39.6 8.0 36.0 80.0 105.0 138.0 21.0 74.2 ... 114.0 143.0 21.0 54.9 35.4 12.0 18.0 54.0 78.0 133.0 3 27.0 72.4 43.6 8.0 35.0 69.0 109.0 145.0 27.0 77.5 ... 115.5 146.0 27.0 79.4 39.0 5.0 59.0 86.0 101.0 147.0 4 16.0 67.6 46.8 19.0 32.0 45.5 127.0 137.0 16.0 73.9 ... 109.8 149.0 16.0 79.2 33.1 19.0 60.5 69.0 108.8 134.0 5 19.0 72.5 43.1 3.0 40.5 79.0 106.5 145.0 19.0 64.8 ... 92.0 130.0 19.0 96.9 40.0 12.0 73.0 95.0 127.5 149.0 6 18.0 85.2 45.0 12.0 43.5 92.5 123.0 147.0 18.0 65.7 ... 108.5 140.0 18.0 65.8 41.0 18.0 33.8 52.5 92.8 142.0 7 23.0 75.5 41.4 12.0 51.5 69.0 107.5 144.0 23.0 85.5 ... 116.0 149.0 23.0 86.8 38.8 8.0 64.5 83.0 112.5 150.0 8 21.0 72.0 43.8 12.0 32.0 66.0 91.0 146.0 21.0 80.6 ... 125.0 139.0 21.0 72.3 46.4 1.0 43.0 70.0 103.0 146.0
16 rows × 24 columns
分组聚合apply、transform
df
sex class python math en 0 男 8 16 70 85 1 男 1 37 8 106 2 男 4 25 106 63 3 男 3 60 55 101 4 男 2 121 114 14 ... ... ... ... ... ... 295 男 4 33 84 66 296 男 8 142 134 89 297 女 2 150 88 130 298 男 8 62 127 1 299 男 1 98 101 80
300 rows × 5 columns
df. groupby( by= [ 'sex' , 'class' ] ) [ [ 'python' , 'en' ] ] . apply ( np. mean) . round ( 1 )
C:\Users\Panda\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:3472: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'
return mean(axis=axis, dtype=dtype, out=out, **kwargs)
python en sex class 女 1 71.8 94.1 2 63.2 70.6 3 94.0 66.6 4 79.5 74.7 5 63.9 63.7 6 82.8 77.2 7 85.5 67.0 8 85.3 68.1 男 1 75.0 65.5 2 71.0 54.9 3 72.4 79.4 4 67.6 79.2 5 72.5 96.9 6 85.2 65.8 7 75.5 86.8 8 72.0 72.3
df. groupby( by= [ 'sex' , 'class' ] ) [ [ 'python' , 'en' ] ] . transform( np. mean) . round ( 1 )
python en 0 72.0 72.3 1 75.0 65.5 2 67.6 79.2 3 72.4 79.4 4 71.0 54.9 ... ... ... 295 67.6 79.2 296 72.0 72.3 297 63.2 70.6 298 72.0 72.3 299 75.0 65.5
300 rows × 2 columns
def _mean ( x) :
return np. round ( x. mean( ) , 1 )
df. groupby( by= [ 'sex' , 'class' ] ) [ [ 'python' , 'en' ] ] . transform( _mean)
python en 0 72.0 72.3 1 75.0 65.5 2 67.6 79.2 3 72.4 79.4 4 71.0 54.9 ... ... ... 295 67.6 79.2 296 72.0 72.3 297 63.2 70.6 298 72.0 72.3 299 75.0 65.5
300 rows × 2 columns
分组聚合agg
df. groupby( by = [ 'sex' , 'class' ] ) . agg( [ np. mean, np. max , np. min ] )
python math en mean amax amin mean amax amin mean amax amin sex class 女 1 71.789474 145 3 78.631579 150 17 94.052632 144 0 2 63.210526 150 1 84.631579 144 3 70.631579 146 0 3 94.000000 147 0 75.692308 134 0 66.615385 122 14 4 79.466667 148 6 57.400000 147 5 74.666667 133 10 5 63.900000 137 6 73.200000 124 27 63.700000 142 5 6 82.764706 134 25 80.235294 137 13 77.176471 144 10 7 85.520000 147 9 83.240000 148 9 66.960000 146 3 8 85.333333 141 3 83.944444 147 19 68.111111 147 4 男 1 75.000000 140 5 69.105263 138 0 65.526316 133 5 2 71.000000 138 8 74.190476 143 17 54.857143 133 12 3 72.370370 145 8 77.518519 146 3 79.444444 147 5 4 67.562500 137 19 73.937500 149 6 79.187500 134 19 5 72.473684 145 3 64.842105 130 2 96.894737 149 12 6 85.222222 147 12 65.722222 140 17 65.777778 142 18 7 75.521739 144 12 85.478261 149 15 86.782609 150 8 8 72.000000 146 12 80.571429 139 6 72.333333 146 1
df. groupby( by = [ 'sex' , 'class' ] ) . agg( [ ( '平均值' , np. mean) ,
( '最大值' , np. max ) ,
( '最小值' , np. min ) ] ) . round ( 1 )
python math en 平均值 最大值 最小值 平均值 最大值 最小值 平均值 最大值 最小值 sex class 女 1 71.8 145 3 78.6 150 17 94.1 144 0 2 63.2 150 1 84.6 144 3 70.6 146 0 3 94.0 147 0 75.7 134 0 66.6 122 14 4 79.5 148 6 57.4 147 5 74.7 133 10 5 63.9 137 6 73.2 124 27 63.7 142 5 6 82.8 134 25 80.2 137 13 77.2 144 10 7 85.5 147 9 83.2 148 9 67.0 146 3 8 85.3 141 3 83.9 147 19 68.1 147 4 男 1 75.0 140 5 69.1 138 0 65.5 133 5 2 71.0 138 8 74.2 143 17 54.9 133 12 3 72.4 145 8 77.5 146 3 79.4 147 5 4 67.6 137 19 73.9 149 6 79.2 134 19 5 72.5 145 3 64.8 130 2 96.9 149 12 6 85.2 147 12 65.7 140 17 65.8 142 18 7 75.5 144 12 85.5 149 15 86.8 150 8 8 72.0 146 12 80.6 139 6 72.3 146 1
df. groupby( by= [ 'sex' , 'class' ] ) . agg( { 'python' : np. max ,
'math' : np. min ,
'en' : np. mean} ) . round ( 1 )
python math en sex class 女 1 145 17 94.1 2 150 3 70.6 3 147 0 66.6 4 148 5 74.7 5 137 27 63.7 6 134 13 77.2 7 147 9 67.0 8 141 19 68.1 男 1 140 0 65.5 2 138 17 54.9 3 145 3 79.4 4 137 6 79.2 5 145 2 96.9 6 147 17 65.8 7 144 15 86.8 8 146 6 72.3
透视表pivot_table
df
sex class python math en 0 男 8 16 70 85 1 男 1 37 8 106 2 男 4 25 106 63 3 男 3 60 55 101 4 男 2 121 114 14 ... ... ... ... ... ... 295 男 4 33 84 66 296 男 8 142 134 89 297 女 2 150 88 130 298 男 8 62 127 1 299 男 1 98 101 80
300 rows × 5 columns
df. pivot_table( values = [ 'python' , 'en' ] ,
index= [ 'sex' , 'class' ] ,
aggfunc= 'mean' ) . round ( 1 )
en python sex class 女 1 94.1 71.8 2 70.6 63.2 3 66.6 94.0 4 74.7 79.5 5 63.7 63.9 6 77.2 82.8 7 67.0 85.5 8 68.1 85.3 男 1 65.5 75.0 2 54.9 71.0 3 79.4 72.4 4 79.2 67.6 5 96.9 72.5 6 65.8 85.2 7 86.8 75.5 8 72.3 72.0
df1 = df. pivot_table( values = [ 'python' , 'math' , 'en' ] ,
index= [ 'sex' , 'class' ] ,
aggfunc= { 'python' : [ ( '平均值' , np. mean) ] ,
'math' : [ ( '最大值' , np. max ) ,
( '最小值' , np. min ) ] ,
'en' : [ ( '标准差' , np. std) ,
( '方差' , np. var) ,
( '计数' , np. size) ] } ) . round ( 1 )
df1
en math python 方差 标准差 计数 最大值 最小值 平均值 sex class 女 1 2356.2 48.5 19 150 17 71.8 2 3115.6 55.8 19 144 3 63.2 3 1067.4 32.7 13 134 0 94.0 4 1487.4 38.6 15 147 5 79.5 5 2342.7 48.4 10 124 27 63.9 6 1508.8 38.8 17 137 13 82.8 7 1593.5 39.9 25 148 9 85.5 8 2398.1 49.0 18 147 19 85.3 男 1 2150.3 46.4 19 138 0 75.0 2 1252.5 35.4 21 143 17 71.0 3 1524.5 39.0 27 146 3 72.4 4 1097.1 33.1 16 149 6 67.6 5 1600.5 40.0 19 130 2 72.5 6 1684.1 41.0 18 140 17 85.2 7 1507.1 38.8 23 149 15 75.5 8 2151.7 46.4 21 139 6 72.0
时间序列
时间戳操作
pd. Timestamp( '2020.09.15 20' )
Timestamp('2020-09-15 20:00:00')
pd. Period( '2020.09.15' , freq= 'M' )
Period('2020-09', 'M')
index = pd. date_range( '2020.09.15' ,
freq= 'D' ,
period