import pandas as pd
import numpy as np
np. array( [ 1 , 2 , 3 ] )
array([1, 2, 3])
#常见函数
np. arange( 10 )
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
np. linspace( 2 , 3 , num= 5 , endpoint= False )
array([2. , 2.2, 2.4, 2.6, 2.8])
np. random. randn( 4 , 4 )
array([[ 0.31090058, 1.36645632, 0.86897091, 0.09761096],
[ 0.35996995, 1.0850638 , 0.68081139, -1.54973962],
[ 1.09726152, 0.85757526, 1.00096873, 1.19056036],
[-0.71615881, 0.25503605, 0.23917234, 0.53867985]])
numpy.random.randint(low, high=None, size=None, dtype=’l’)
np. random. randint( 0 , 4 , size= ( 2 , 4 ) )
array([[1, 0, 2, 2],
[2, 3, 3, 1]])
np. zeros( 4 )
array([0., 0., 0., 0.])
np. zeros( 4 ) . reshape( 2 , 2 )
array([[0., 0.],
[0., 0.]])
df = pd. DataFrame( {
'国家' : [ '中国' , '美国' , '日本' ] ,
'地区' : [ '亚洲' , '北美' , '亚洲' ] ,
'人口' : [ '13.97' , '3.28' , '1.26' ] ,
'GDP' : [ 14.34 , 21.43 , 5.08 ]
} )
df
国家 地区 人口 GDP 0 中国 亚洲 13.97 14.34 1 美国 北美 3.28 21.43 2 日本 亚洲 1.26 5.08
s = pd. Series( np. random. randn( 5 ) , index = [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
s
a -0.610498
b -1.111079
c 1.495161
d 1.233846
e 0.854973
dtype: float64
*df = pd.DataFrame(data, index, columns=None)
*利用Series组成字典
d = {
'x' : pd. Series( [ 1 , 2 , 3 ] , index= [ 'a' , 'b' , 'c' ] ) ,
'y' : pd. Series( [ 1 , 2 , 3 , 4 ] , index= [ 'a' , 'b' , 'c' , 'd' ] )
}
pd. DataFrame( d)
建立索引
data = 'https://www.gairuo.com/file/data/dataset/team.xlsx'
df = pd. read_excel( data, index_col= 'name' )
df
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Arry C 36 37 37 57 Ack A 57 60 18 84 Eorge C 93 96 71 78 Oah D 65 49 61 86 ... ... ... ... ... ... Gabriel C 48 59 87 74 Austin7 C 21 31 30 43 Lincoln4 C 98 93 1 20 Eli E 11 74 58 91 Ben E 21 43 41 74
100 rows × 5 columns
df = pd. read_excel( data)
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df. set_index( 'name' )
team Q1 Q2 Q3 Q4 name Liver E 89 21 24 64 Arry C 36 37 37 57 Ack A 57 60 18 84 Eorge C 93 96 71 78 Oah D 65 49 61 86 ... ... ... ... ... ... Gabriel C 48 59 87 74 Austin7 C 21 31 30 43 Lincoln4 C 98 93 1 20 Eli E 11 74 58 91 Ben E 21 43 41 74
100 rows × 5 columns
s = pd. Series( [ i for i in range ( 100 ) ] )
s
0 0
1 1
2 2
3 3
4 4
..
95 95
96 96
97 97
98 98
99 99
Length: 100, dtype: int64
df. set_index( s)
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
重置索引
df. reset_index( )
index name team Q1 Q2 Q3 Q4 0 0 Liver E 89 21 24 64 1 1 Arry C 36 37 37 57 2 2 Ack A 57 60 18 84 3 3 Eorge C 93 96 71 78 4 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... ... 95 95 Gabriel C 48 59 87 74 96 96 Austin7 C 21 31 30 43 97 97 Lincoln4 C 98 93 1 20 98 98 Eli E 11 74 58 91 99 99 Ben E 21 43 41 74
100 rows × 7 columns
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df. index
RangeIndex(start=0, stop=100, step=1)
df. reset_index( drop= True )
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df. index
RangeIndex(start=0, stop=100, step=1)
df. head( )
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86
dataframe.reset_index(drop=True, inplace=True)
df. tail( )
name team Q1 Q2 Q3 Q4 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
df. columns
Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
df. describe( )
Q1 Q2 Q3 Q4 count 100.000000 100.000000 100.000000 100.000000 mean 49.200000 52.550000 52.670000 52.780000 std 29.962603 29.845181 26.543677 27.818524 min 1.000000 1.000000 1.000000 2.000000 25% 19.500000 26.750000 29.500000 29.500000 50% 51.500000 49.500000 55.000000 53.000000 75% 74.250000 77.750000 76.250000 75.250000 max 98.000000 99.000000 99.000000 99.000000
df. mean( )
E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
"""Entry point for launching an IPython kernel.
Q1 49.20
Q2 52.55
Q3 52.67
Q4 52.78
dtype: float64
df. mean( 1 )
E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
"""Entry point for launching an IPython kernel.
0 49.50
1 41.75
2 54.75
3 84.50
4 65.25
...
95 67.00
96 31.25
97 53.00
98 58.50
99 44.75
Length: 100, dtype: float64
df. value_counts( )
name team Q1 Q2 Q3 Q4
Aaron A 96 75 55 8 1
Liam B 2 80 24 25 1
Matthew C 44 33 41 98 1
Mason D 80 96 26 49 1
Luke6 D 15 97 95 99 1
..
Elliott B 9 31 33 60 1
Elliot C 15 17 76 22 1
Elijah B 97 89 15 46 1
Eli E 11 74 58 91 1
Zachary E 12 71 85 93 1
Length: 100, dtype: int64
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df[ 'team' ] . value_counts( )
C 22
B 22
E 20
D 19
A 17
Name: team, dtype: int64
df. groupby( by = [ 'team' ] ) . count( )
name Q1 Q2 Q3 Q4 team A 17 17 17 17 17 B 22 22 22 22 22 C 22 22 22 22 22 D 19 19 19 19 19 E 20 20 20 20 20
diff() : 用于上一个数据和下一个数据的差值 shift()
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df[ 'Q1' ] . shift( )
0 NaN
1 89.0
2 36.0
3 57.0
4 93.0
...
95 20.0
96 48.0
97 21.0
98 98.0
99 11.0
Name: Q1, Length: 100, dtype: float64
value_counts(values,sort=True, ascending=False, normalize=False,bins=None,dropna=True)
truncate(before= , after= )
Pandas 高级操作
df. Q1> 36
0 True
1 False
2 True
3 True
4 True
...
95 True
96 False
97 True
98 False
99 False
Name: Q1, Length: 100, dtype: bool
df. loc[ : , 'Q1' : 'Q3' ]
Q1 Q2 Q3 0 89 21 24 1 36 37 37 2 57 60 18 3 93 96 71 4 65 49 61 ... ... ... ... 95 48 59 87 96 21 31 30 97 98 93 1 98 11 74 58 99 21 43 41
100 rows × 3 columns
df.query()
df. query( 'Q1>Q2>90' )
name team Q1 Q2 Q3 Q4 97 Lincoln4 C 98 93 1 20
df. query( '(Q1<50)&(Q2>40)&(Q3>90)' )
name team Q1 Q2 Q3 Q4 56 David B 21 47 99 2 72 Luke6 D 15 97 95 99
df.filter() 支持模糊匹配,正则表达式
df. filter ( items= [ 'Q1' , 'Q2' ] )
Q1 Q2 0 89 21 1 36 37 2 57 60 3 93 96 4 65 49 ... ... ... 95 48 59 96 21 31 97 98 93 98 11 74 99 21 43
100 rows × 2 columns
类型转换astype()
df. Q1. dtypes
dtype('int64')
df. Q1. astype( 'int32' )
0 89
1 36
2 57
3 93
4 65
..
95 48
96 21
97 98
98 11
99 21
Name: Q1, Length: 100, dtype: int32
转为时间类型 to_datatime()
t = pd. Series( [ '20220810' , '20220811' ] )
t
0 20220810
1 20220811
dtype: object
pd. to_datetime( t)
0 2022-08-10
1 2022-08-11
dtype: datetime64[ns]
数据排序- sort() axis= 1表示列,axis=0表示行
df. sort_index( ascending= True )
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
添加修改
replace(a,b) # 将数据中的a 换成b insert(列的具体位置,列名,列的值)# 插入新列,列的具体位置,列名,列的值
df.where()
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df.iterrows()
for index, row in df. iterrows( ) :
print ( index, row[ 'name' ] , row. Q1)
0 Liver 89
1 Arry 36
2 Ack 57
3 Eorge 93
4 Oah 65
5 Harlie 24
6 Acob 61
7 Lfie 9
8 Reddie 64
9 Oscar 77
10 Leo 17
11 Logan 9
12 Archie 83
13 Theo 51
14 Thomas 80
15 James 48
16 Joshua 63
17 Henry 91
18 William 80
19 Max 97
20 Lucas 60
21 Ethan 79
22 Arthur 44
23 Mason 80
24 Isaac 74
25 Harrison 89
26 Teddy 71
27 Finley 62
28 Daniel 50
29 Riley 35
30 Edward 57
31 Joseph 67
32 Alexander 91
33 Adam 90
34 Reggie1 30
35 Samuel 9
36 Jaxon 88
37 Sebastian 1
38 Elijah 97
39 Harley 2
40 Toby 52
41 Arlo8 48
42 Dylan 86
43 Jude 8
44 Benjamin 15
45 Rory9 8
46 Tommy 29
47 Jake3 69
48 Louie 24
49 Carter7 57
50 Jenson 66
51 Hugo0 28
52 Bobby1 50
53 Frankie 18
54 Ollie3 10
55 Zachary 12
56 David 21
57 Albie1 79
58 Lewis 4
59 Luca 5
60 Ronnie 53
61 Jackson5 6
62 Matthew 44
63 Alex 14
64 Harvey2 43
65 Reuben 70
66 Jayden6 64
67 Caleb 64
68 Hunter3 38
69 Theodore3 43
70 Nathan 87
71 Blake 78
72 Luke6 15
73 Elliot 15
74 Roman 73
75 Stanley 69
76 Dexter 73
77 Michael 89
78 Elliott 9
79 Tyler 75
80 Ryan 92
81 Ellis 34
82 Finn 4
83 Albert0 85
84 Kai 66
85 Liam 2
86 Calum 14
87 Louis2 13
88 Aaron 96
89 Ezra 16
90 Leon 38
91 Connor 62
92 Grayson7 59
93 Jamie0 39
94 Aiden 20
95 Gabriel 48
96 Austin7 21
97 Lincoln4 98
98 Eli 11
99 Ben 21
for column in df:
print ( column)
name
team
Q1
Q2
Q3
Q4
for label, ser in df. items( ) :
print ( label)
print ( ser[ : 3 ] )
name
0 Liver
1 Arry
2 Ack
Name: name, dtype: object
team
0 E
1 C
2 A
Name: team, dtype: object
Q1
0 89
1 36
2 57
Name: Q1, dtype: int64
Q2
0 21
1 37
2 60
Name: Q2, dtype: int64
Q3
0 24
1 37
2 18
Name: Q3, dtype: int64
Q4
0 64
1 57
2 84
Name: Q4, dtype: int64
apply() 函数
df. name. apply ( lambda x: x. lower( ) )
0 liver
1 arry
2 ack
3 eorge
4 oah
...
95 gabriel
96 austin7
97 lincoln4
98 eli
99 ben
Name: name, Length: 100, dtype: object
def mylen ( x) :
return len ( str ( x) )
df. applymap( mylen)
name team Q1 Q2 Q3 Q4 0 5 1 2 2 2 2 1 4 1 2 2 2 2 2 3 1 2 2 2 2 3 5 1 2 2 2 2 4 3 1 2 2 2 2 ... ... ... ... ... ... ... 95 7 1 2 2 2 2 96 7 1 2 2 2 2 97 8 1 2 2 1 2 98 3 1 2 2 2 2 99 3 1 2 2 2 2
100 rows × 6 columns
agg() # 聚合函数
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df. agg( { 'Q1' : [ 'sum' , 'min' ] , 'Q2' : [ 'min' , 'max' ] } )
Q1 Q2 sum 4920.0 NaN min 1.0 1.0 max NaN 99.0
df. Q1. max ( )
98
df. agg( a= ( 'Q1' , 'max' ) ,
b= ( 'Q2' , 'min' ) )
Pandas 分组聚合
df.groupby(by=None,axis=0,level=None, as_index: bool =True, sort: bool=True,
group_keys:bool=True,observed: bool = False,
dropna : = False)
df. groupby( 'team' ) . sum ( )
Q1 Q2 Q3 Q4 team A 1066 639 875 783 B 975 1218 1202 1136 C 1056 1194 1068 1127 D 860 1191 1241 1199 E 963 1013 881 1033
df. groupby( 'team' ) . count( )
name Q1 Q2 Q3 Q4 team A 17 17 17 17 17 B 22 22 22 22 22 C 22 22 22 22 22 D 19 19 19 19 19 E 20 20 20 20 20
df. team. value_counts( )
C 22
B 22
E 20
D 19
A 17
Name: team, dtype: int64
df. groupby( lambda x: x> 50 ) . sum ( )
Q1 Q2 Q3 Q4 False 2832 2705 2597 2687 True 2088 2550 2670 2591
数据合并与对比
df.append(self, other, ignore_insex=False, verify_integrity=False,sort=False)
pd.concat(objs:连接数据 ,axis=1, join=‘outer’, ignore_index=False, keys=None, levels=None,names=None,sort=False,copy=True)
df1 = pd. DataFrame( { 'x' : [ 1 , 2 ] , 'y' : [ 3 , 4 ] } )
df2 = pd. DataFrame( { 'x' : [ 5 , 6 ] , 'y' : [ 7 , 8 ] } )
df1
pd. concat( [ df1, df2] )
pd. concat( [ df1, df2] , axis= 1 )
z = pd. Series( [ 9 , 9 ] , name= 'z' )
z
0 9
1 9
Name: z, dtype: int64
df. groupby( [ 'team' , df. mean( 1 ) >= 60 ] ) . count( )
E:\Anaconda\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
"""Entry point for launching an IPython kernel.
name Q1 Q2 Q3 Q4 team A False 14 14 14 14 14 True 3 3 3 3 3 B False 14 14 14 14 14 True 8 8 8 8 8 C False 17 17 17 17 17 True 5 5 5 5 5 D False 10 10 10 10 10 True 9 9 9 9 9 E False 15 15 15 15 15 True 5 5 5 5 5
arrays = [ [ 1 , 1 , 2 , 2 ] , [ 'A' , 'B' , 'A' , 'B' ] ]
index = pd. MultiIndex. from_arrays( arrays, names= ( 'class' , 'team' ) )
index
MultiIndex([(1, 'A'),
(1, 'B'),
(2, 'A'),
(2, 'B')],
names=['class', 'team'])
pd. DataFrame( [ { 'Q1' : 60 , 'Q2' : 70 } ] , index= index)
Q1 Q2 class team 1 A 60 70 B 60 70 2 A 60 70 B 60 70
insex_arrays = [ [ 1 , 1 , 2 , 2 ] , [ '男' , '女' , '男' , '女' ] ]
columns_arrays = [ [ '2019' , '2019' , '2020' , '2020' ] , [ '上半年' , '下半年' , '上半年' , '下半年' ] ]
数据重现与透视
dff = pd. DataFrame(
{
'A' : [ 'a1' , 'a1' , 'a2' , 'a3' , 'a3' , 'a3' ] ,
'B' : [ 'b1' , 'b2' , 'b3' , 'b1' , 'b2' , 'b3' ] ,
'C' : [ 'c1' , 'c2' , 'c3' , 'c4' , 'c5' , 'c6' ] ,
'D' : [ 'd1' , 'd2' , 'd3' , 'd4' , 'd5' , 'd6' ]
} )
dff
A B C D 0 a1 b1 c1 d1 1 a1 b2 c2 d2 2 a2 b3 c3 d3 3 a3 b1 c4 d4 4 a3 b2 c5 d5 5 a3 b3 c6 d6
dff. pivot( index= 'A' , columns = 'B' , values = 'C' )
B b1 b2 b3 A a1 c1 c2 NaN a2 NaN NaN c3 a3 c4 c5 c6
pd.pivot_table(df, index=[], columns=[], value=, aggfunc=np.sum, fill_value=0, margins=True)
dff
A B C D 0 a1 b1 c1 d1 1 a1 b2 c2 d2 2 a2 b3 c3 d3 3 a3 b1 c4 d4 4 a3 b2 c5 d5 5 a3 b3 c6 d6
dff. D= np. arange( 1 , 7 )
dff
A B C D 0 a1 b1 c1 1 1 a1 b2 c2 2 2 a2 b3 c3 3 3 a3 b1 c4 4 4 a3 b2 c5 5 5 a3 b3 c6 6
dff. pivot_table( index= 'A' , columns= 'B' , values= 'D' , fill_value= 0 )
聚合高级操作
dff. pivot_table( index= [ 'A' , 'B' ] , columns= [ 'C' ] , values= 'D' , aggfunc= np. sum , fill_value= 0 , margins= True )
C c1 c2 c3 c4 c5 c6 All A B a1 b1 1 0 0 0 0 0 1 b2 0 2 0 0 0 0 2 a2 b3 0 0 3 0 0 0 3 a3 b1 0 0 0 4 0 0 4 b2 0 0 0 0 5 0 5 b3 0 0 0 0 0 6 6 All 1 2 3 4 5 6 21
dff1 = pd. DataFrame( {
'A' : [ 'a1' , 'a1' , 'a2' , 'a2' ] ,
'B' : [ 'b1' , 'b2' , 'b1' , 'b2' ] ,
'C' : [ 1 , 2 , 3 , 4 ] ,
'D' : [ 5 , 6 , 7 , 8 ] ,
'E' : [ 5 , 6 , 7 , 8 ]
} )
dff1
A B C D E 0 a1 b1 1 5 5 1 a1 b2 2 6 6 2 a2 b1 3 7 7 3 a2 b2 4 8 8
dff1. set_index( [ 'A' , 'B' ] , inplace= True )
dff1
C D E A B a1 b1 1 5 5 b2 2 6 6 a2 b1 3 7 7 b2 4 8 8
dff1. stack( )
A B
a1 b1 C 1
D 5
E 5
b2 C 2
D 6
E 6
a2 b1 C 3
D 7
E 7
b2 C 4
D 8
E 8
dtype: int64
s = dff1. stack( )
s. unstack( )
C D E A B a1 b1 1 5 5 b2 2 6 6 a2 b1 3 7 7 b2 4 8 8
交叉表
dff2 = pd. DataFrame(
{
'A' : [ 'a1' , 'a1' , 'a2' , 'a3' , 'a3' , 'a3' ] ,
'B' : [ 'b1' , 'b2' , 'b3' , 'b1' , 'b2' , 'b3' ] ,
'C' : [ 1 , 2 , 3 , 4 , 5 , 6 ]
} )
dff2
A B C 0 a1 b1 1 1 a1 b2 2 2 a2 b3 3 3 a3 b1 4 4 a3 b2 5 5 a3 b3 6
pd. crosstab( dff2[ 'A' ] , dff2[ 'B' ] , normalize= True )
B b1 b2 b3 A a1 0.166667 0.166667 0.000000 a2 0.000000 0.000000 0.166667 a3 0.166667 0.166667 0.166667
dff2
A B C 0 a1 b1 1 1 a1 b2 2 2 a2 b3 3 3 a3 b1 4 4 a3 b2 5 5 a3 b3 6
dff2. melt( id_vars= [ 'A' , 'B' ] )
A B variable value 0 a1 b1 C 1 1 a1 b2 C 2 2 a2 b3 C 3 3 a3 b1 C 4 4 a3 b2 C 5 5 a3 b3 C 6
dff2. melt( )
variable value 0 A a1 1 A a1 2 A a2 3 A a3 4 A a3 5 A a3 6 B b1 7 B b2 8 B b3 9 B b1 10 B b2 11 B b3 12 C 1 13 C 2 14 C 3 15 C 4 16 C 5 17 C 6
dff2. melt( value_vars= [ 'C' , 'B' ] )
variable value 0 C 1 1 C 2 2 C 3 3 C 4 4 C 5 5 C 6 6 B b1 7 B b2 8 B b3 9 B b1 10 B b2 11 B b3
dff2. melt( id_vars= [ 'A' ] , value_vars= [ 'B' ] , var_name= 'B_lable' , value_name= 'B_value' )
A B_lable B_value 0 a1 B b1 1 a1 B b2 2 a2 B b3 3 a3 B b1 4 a3 B b2 5 a3 B b3
虚拟变量
pd.get_dummies(data,prefix=None, prefix_sep=’_’,dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
data = [ 'b' , 'b' , 'a' , 'c' , 'b' ]
codes, unique = pd. factorize( data)
codes
unique
array(['b', 'a', 'c'], dtype=object)
c = pd. Series( [ [ 1 , 2 , 3 ] , 'foo' , [ ] , [ 3 , 4 ] ] )
c
0 [1, 2, 3]
1 foo
2 []
3 [3, 4]
dtype: object
c. explode( )
0 1
0 2
0 3
1 foo
2 NaN
3 3
3 4
dtype: object
c1 = pd. Series( { 'A' : [ [ 1 , 2 , 3 ] , 'foo' , [ ] , [ 3 , 4 ] ] , 'B' : range ( 4 ) } )
c1
A [[1, 2, 3], foo, [], [3, 4]]
B (0, 1, 2, 3)
dtype: object
c1. explode( )
A [1, 2, 3]
A foo
A []
A [3, 4]
B 0
B 1
B 2
B 3
dtype: object
c1. explode( 'A' )
0 [1, 2, 3]
1 foo
2 []
3 [3, 4]
4 0
5 1
6 2
7 3
dtype: object
Pandas 数据清洗
da = pd. DataFrame( {
'A' : [ 'a1' , 'a1' , 'a2' , 'a2' ] ,
'B' : [ 'b1' , 'b2' , None , 'b2' ] ,
'C' : [ 1 , 2 , 3 , 4 ] ,
'D' : [ 5 , 6 , None , 8 ] ,
'E' : [ 5 , None , 7 , 8 ]
}
)
da
A B C D E 0 a1 b1 1 5.0 5.0 1 a1 b2 2 6.0 NaN 2 a2 None 3 NaN 7.0 3 a2 b2 4 8.0 8.0
da. isna( )
A B C D E 0 False False False False False 1 False False False False True 2 False True False True False 3 False False False False False
da. isnull( )
A B C D E 0 False False False False False 1 False False False False True 2 False True False True False 3 False False False False False
da. isna( ) . sum ( )
A 0
B 1
C 0
D 1
E 1
dtype: int64
"""
1 识别重复值
df.duplicated(subset=None,keep='first')
keep : first 第一次出现值为True
last 最后一次出现值为True
FALSE 所有重复值为True
2 删除重复值
df.drop_duplicate(subset=None,
keep='first',
inplace=False,
ignore_index = False)
keep : first 保留第一次出现的重复值
last 保留最后一次出现重复值
FALSE 删除所有的重复值
"""
"\n1 识别重复值\ndf.duplicated(subset=None,keep='first')\nkeep : first 第一次出现值为True\n last 最后一次出现值为True\n FALSE 所有重复值为True\n2 删除重复值\ndf.drop_duplicate(subset=None,\n keep='first',\n inplace=False,\n ignore_index = False)\nkeep : first 保留第一次出现的重复值\n last 保留最后一次出现重复值\n FALSE 删除所有的重复值\n"
s1 = pd. Series( [ 'A' , 'Boy' , 'C' , np. nan] , dtype= "string" )
s1
0 A
1 Boy
2 C
3 <NA>
dtype: string
s1. str . lower( )
0 a
1 boy
2 c
3 <NA>
dtype: string
df
name team Q1 Q2 Q3 Q4 0 Liver E 89 21 24 64 1 Arry C 36 37 37 57 2 Ack A 57 60 18 84 3 Eorge C 93 96 71 78 4 Oah D 65 49 61 86 ... ... ... ... ... ... ... 95 Gabriel C 48 59 87 74 96 Austin7 C 21 31 30 43 97 Lincoln4 C 98 93 1 20 98 Eli E 11 74 58 91 99 Ben E 21 43 41 74
100 rows × 6 columns
df. Q1. astype( str ) . str
<pandas.core.strings.accessor.StringMethods at 0x1fdd73e7588>
s2 = pd. Series( [ '天_地_人' , '你_我_他' , np. nan, '风_水_火' ] , dtype= 'string' )
s2
0 天_地_人
1 你_我_他
2 <NA>
3 风_水_火
dtype: string
s2. str . split( '_' )
0 [天, 地, 人]
1 [你, 我, 他]
2 <NA>
3 [风, 水, 火]
dtype: object
s2. str . split( '_' ) . str [ 1 ]
0 地
1 我
2 <NA>
3 水
dtype: object
s2. str . split( '_' , expand= True )
0 1 2 0 天 地 人 1 你 我 他 2 <NA> <NA> <NA> 3 风 水 火
pd. Series( [ 'a' , 'b' , 'c' ] ) . repeat( 2 )
0 a
0 a
1 b
1 b
2 c
2 c
dtype: object
pd. Series( [ 'a' , 'b' , 'c' ] ) . str . repeat( 2 )
0 aa
1 bb
2 cc
dtype: object
pd. Series( [ 'a' , 'b' , 'c' ] ) . str . repeat( [ 1 , 2 , 3 ] )
0 a
1 bb
2 ccc
dtype: object
s3 = pd. Series( [ 'x' , 'y' , 'z' ] , dtype= 'string' )
s3
0 x
1 y
2 z
dtype: string
s3. str . cat( )
'xyz'
s3. str . cat( sep= ',' )
'x,y,z'