pandas的两种重要的数据类型DataFrame和Series
import numpy as np
import pandas as pd
import os
os. getcwd( )
'C:\\Users\\dell'
# Build a 2x5 frame holding the integers 0..9 in row-major order, then display it.
df1 = pd.DataFrame(np.arange(10).reshape(-1, 5))
df1
# Load the CSV from an absolute, machine-specific Windows path
# (the raw string keeps the backslashes literal).
df2= pd. read_csv( r'C:\Users\dell\data.csv' )
# Preview the first five rows.
df2. head( )
account name street city state postal-code Jan Feb Mar 0 211829 Kerluke, Koepp and Hilpert 34456 Sean Highway New Jaycob Texas 28752 10000 62000 35000 1 320563 Walter-Trantow 1311 Alvis Tunnel Port Khadijah NorthCarolina 38365 95000 45000 35000 2 648336 Bashirian, Kunde and Price 62184 Schamberger Underpass Apt. 231 New Lilianland Iowa 76517 91000 120000 35000 3 109996 D'Amore, Gleichner and Bode 155 Fadel Crescent Apt. 144 Hyattburgh Maine 46021 45000 120000 10000 4 121213 Bauch-Goldner 7274 Marissa Common Shanahanchester California 49681 162000 120000 35000
df2. index
df2. index. size
df2. columns
df2. columns. size
9
df2. shape
(15, 9)
# shape is a (rows, columns) tuple; report each dimension on its own line.
n_rows, n_cols = df2.shape
print("行数为", n_rows)
print("列数为", n_cols)
行数为 15
列数为 9
# Select the "name" column by its label. The key must be a quoted string:
# a bare `name` is an undefined variable and raises NameError.
df2["name"].head()
# Element of the "name" column at index label 2.
df2["name"][2]
0 Kerluke, Koepp and Hilpert
1 Walter-Trantow
2 Bashirian, Kunde and Price
3 D'Amore, Gleichner and Bode
4 Bauch-Goldner
Name: name, dtype: object
df2. name. head( )
df2. name[ 2 ]
0 Kerluke, Koepp and Hilpert
1 Walter-Trantow
2 Bashirian, Kunde and Price
3 D'Amore, Gleichner and Bode
4 Bauch-Goldner
Name: name, dtype: object
df2[ "city" ] [ [ 2 , 5 ] ]
2 New Lilianland
5 Jeremieburgh
Name: city, dtype: object
df2. loc[ 1 , "street" ]
'1311 Alvis Tunnel'
df2. iloc[ 1 , 2 ]
'1311 Alvis Tunnel'
del df2[ "street" ]
df2. head( )
account name city state postal-code Jan Feb Mar 0 211829 Kerluke, Koepp and Hilpert New Jaycob Texas 28752 10000 62000 35000 1 320563 Walter-Trantow Port Khadijah NorthCarolina 38365 95000 45000 35000 2 648336 Bashirian, Kunde and Price New Lilianland Iowa 76517 91000 120000 35000 3 109996 D'Amore, Gleichner and Bode Hyattburgh Maine 46021 45000 120000 10000 4 121213 Bauch-Goldner Shanahanchester California 49681 162000 120000 35000
# Take an explicit copy of the three columns so df3 owns its data. Dropping
# columns in place on a plain df2[[...]] selection makes pandas emit
# SettingWithCopyWarning, because df3 may still be tied to df2.
df3 = df2[["account", "name", "city"]].copy()
# Remove "name" and "city" (axis=1 selects columns), leaving only "account".
df3.drop(["name", "city"], axis=1, inplace=True)
# First five rows whose "Feb" value exceeds 46000.
df2[df2["Feb"] > 46000].head()
# "city" and "name" for the first five rows whose "Jan" value exceeds 46000.
df2.loc[df2["Jan"] > 46000, ["city", "name"]].head()
# Per-column non-null counts among rows whose city is 'New Jaycob'.
df2[df2["city"] == 'New Jaycob'].count()
account 1
name 1
city 1
state 1
postal-code 1
Jan 1
Feb 1
Mar 1
dtype: int64
# Keep only the "name" and "Feb" columns, then show the five rows with the
# smallest "Feb" values (ascending row sort is the sort_values default).
dff = df2[["name", "Feb"]]
dff.sort_values("Feb").head()
name Feb 11 Hahn-Moore 10000 1 Walter-Trantow 45000 0 Kerluke, Koepp and Hilpert 62000 7 Kovacek-Johnston 95000 8 Champlin-Morar 95000
算术运算
# Two integer frames with the same row labels but different column counts:
# df4 is 2x3 (values 0..5) and df5 is 2x5 (values 0..9).
df4 = pd.DataFrame(np.arange(6).reshape(-1, 3))
df4
df5 = pd.DataFrame(np.arange(10).reshape(-1, 5))
df5
# Element-wise addition aligns on labels; columns 3 and 4 exist only in df5,
# so the result is NaN there.
df4.add(df5)
0 1 2 3 4 0 0 2 4 NaN NaN 1 8 10 12 NaN NaN
# Add the frames, treating a value missing on one side as 10 before adding.
df6 = df4.add(df5, fill_value=10)
s1 = pd.Series(np.arange(3))
# Subtract s1 from every row; s1's index is aligned with the column labels,
# so columns without a matching label become NaN.
df6.sub(s1)
0 1 2 3 4 0 0.0 1.0 2.0 NaN NaN 1 8.0 9.0 10.0 NaN NaN
# Call the method: a bare `.sum` only displays the bound-method object and
# never computes anything. With a window of 2 rows the first row has too few
# observations, so it is NaN.
df4.rolling(2).sum()
#      0    1    2
# 0  NaN  NaN  NaN
# 1  3.0  5.0  7.0
df4. cov( )
0 1 2 0 4.5 4.5 4.5 1 4.5 4.5 4.5 2 4.5 4.5 4.5
df4. corr( )
0 1 2 0 1.0 1.0 1.0 1 1.0 1.0 1.0 2 1.0 1.0 1.0
df7= df4. T
df7
缺失值处理
df3. empty
False
np. nan+ 1
np. nan- np. nan
nan
# 2x2 frame: rows labelled "s"/"w", columns labelled "a"/"b".
A = pd.DataFrame(
    np.array([[10, 10], [20, 20]]),
    index=["s", "w"],
    columns=["a", "b"],
)
A
# stack() moves the column labels into an inner index level, giving a Series.
A.stack()
s a 10
b 10
w a 20
b 20
dtype: int32
A. mean( )
a 15.0
b 15.0
dtype: float64
A. notnull( )