pandas Series and DataFrame 拼接:
import numpy as np
import pandas as pd
def make_df ( cols, ind) :
data= { c: [ str ( c) + str ( i) for i in ind] for c in cols}
return pd. DataFrame( data, ind)
make_df( 'AB' , [ 1 , 2 ] )
a= [ 1 , 2 , 3 ]
b= [ 4 , 5 , 6 ]
c= [ 7 , 8 , 9 ]
np. concatenate( [ a, b, c] )
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
np. concatenate( [ a, b, c] , axis= 0 )
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
d= [ [ 1 , 2 ] ,
[ 3 , 4 ] ]
e= [ [ 5 , 6 ] ,
[ 7 , 8 ] ]
np. concatenate( [ d, e] , axis= 0 ) , np. concatenate( [ d, e] , axis= 1 )
(array([[1, 2],
[3, 4],
[5, 6],
[7, 8]]),
array([[1, 2, 5, 6],
[3, 4, 7, 8]]))
f= [ 9 , 10 ]
np. concatenate( [ d, f] , axis= 0 )
pandas拼接:
主要用到pd.concat()
ser1= pd. Series( [ 'A' , 'B' , 'C' ] , index= [ 1 , 2 , 3 ] )
ser2= pd. Series( [ 'D' , 'E' , 'F' ] , index= [ 4 , 5 , 6 ] )
pd. concat( [ ser1, ser2] )
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
df1= make_df( 'AB' , [ 1 , 2 ] )
df2= make_df( 'AB' , [ 3 , 4 ] )
print ( df1)
print ( df2)
print ( pd. concat( [ df1, df2] , axis= 'index' ) )
print ( pd. concat( [ df1, df2] , axis= 'columns' ) )
A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
A B A B
1 A1 B1 NaN NaN
2 A2 B2 NaN NaN
3 NaN NaN A3 B3
4 NaN NaN A4 B4
注意以上索引的处理! 下面是 行(index) 索引重复的处理:
df2. index= df1. index
print ( pd. concat( [ df1, df2] , ignore_index= True ) )
print ( pd. concat( [ df1, df2] , keys= [ "df1" , "df2" ] ) )
A B
0 A1 B1
1 A2 B2
2 A3 B3
3 A4 B4
A B
df1 1 A1 B1
2 A2 B2
df2 1 A3 B3
2 A4 B4
下面是 列(columns) 索引重复:
df5= make_df( 'ABC' , [ 1 , 2 ] )
df6= make_df( 'BCD' , [ 3 , 4 ] )
print ( df5)
print ( df6)
print ( pd. concat( [ df5, df6] ) )
A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
print ( pd. concat( [ df5, df6] , join= 'inner' ) )
print ( pd. concat( [ df5, df6] , join= 'outer' ) )
B C
1 B1 C1
2 B2 C2
3 B3 C3
4 B4 C4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
print ( pd. concat( [ df5, df5, df6] ) )
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
print ( df5. append( [ df6, df6] , ignore_index= True ) )
A B C D
0 A1 B1 C1 NaN
1 A2 B2 C2 NaN
2 NaN B3 C3 D3
3 NaN B4 C4 D4
4 NaN B3 C3 D3
5 NaN B4 C4 D4
合并数据集:
Pandas 的基本特性之一就是高性能的内存式数据连接(join)与合并(merge)操作 pd.merge() 实现的功能基于关系代数(relational algebra)的一部分。关系代数是处理关系型数据的通用理论 ,绝大部分数据库的可用操作都以此为理论基础。关系代数方法论的强大之处在于,它提出的若干简单操作规则经过组合就可以为任意数据集构建十分复杂的操作 。
pd.merge(),发现两个DataFrame的相同列(一个或多个)作为键连接:有1:1,1:many,many:many三种情况。
参数:
on=指定列作为键连接(可以是列表) left_on,right_on:指定列名,重复的可用DataFrame.drop(列名,axis=1)
去掉 left_index=True,right_index=True:将index指定为键连接,df1.join(df2)
也可实现 2,3可混用的。 函数默认丢弃原行索引 how= ‘inner’,‘outer’,‘left’,‘right’,分别对应重复列的交集,并集,使用左列值,使用右列值。 suffixes 对合并后的重复列自定义后缀,默认’_x’’_y’
gf1= pd. DataFrame( { 'employee' : ( 'Bob' , 'Lisa' , 'Jeff' , "Tessa" ) , 'wages' : ( 1200 , 1300 , 900 , 1000 ) } )
gf2= pd. DataFrame( { 'employee' : ( 'Lisa' , 'Tessa' , 'James' , 'Bob' ) , 'days' : ( 1200 , 5 , 3 , 6 ) } )
gf1, gf2
( employee wages
0 Bob 1200
1 Lisa 1300
2 Jeff 900
3 Tessa 1000,
employee days
0 Lisa 1200
1 Tessa 5
2 James 3
3 Bob 6)
pd. merge( gf1, gf2, left_on= 'wages' , right_on= 'days' )
employee_x wages employee_y days 0 Bob 1200 Lisa 1200
gf1= gf1. set_index( 'employee' )
gf2= gf2. set_index( "employee" )
gf1. join( gf2) , pd. merge( gf1, gf2, left_index= True , right_on= 'employee' )
( wages days
employee
Bob 1200 6.0
Lisa 1300 1200.0
Jeff 900 NaN
Tessa 1000 5.0,
wages days
employee
Bob 1200 6
Lisa 1300 1200
Tessa 1000 5)
pd. merge( gf1, gf2, how= 'outer' , left_index= True , right_index= True )
wages days employee Bob 1200.0 6.0 James NaN 3.0 Jeff 900.0 NaN Lisa 1300.0 1200.0 Tessa 1000.0 5.0
gh1= pd. DataFrame( [ { 'Adam' , 'Panda' , 'Jim' , 'Frau' } , { 'ice cream' , 'meat' , 'bread' , 'noodles' } ] )
gh2= pd. DataFrame( [ { 'Adam' , 'Pearson' , 'Jim' , 'Kay' } , { 'vermouth' , 'cola' , 'beer' , 'water' } ] )
gh1= gh1. T
gh2= gh2. T
gh1, gh2
( 0 1
0 Frau meat
1 Panda bread
2 Jim ice cream
3 Adam noodles,
0 1
0 Jim water
1 Pearson beer
2 Kay cola
3 Adam vermouth)
pd. merge( gh1, gh2, how= 'right' )
0 1 0 Jim water 1 Pearson beer 2 Kay cola 3 Adam vermouth
gh1. columns= [ 'name' , 'food' ]
gh2. columns= [ 'people' , 'water' ]
print ( pd. merge( gh1, gh2, how= 'left' , left_on= 'name' , right_on= 'people' ) . drop( 'people' , axis= 1 ) )
pd. merge( gh1, gh2, how= 'left' , left_on= 'name' , right_on= 'people' )
name food water
0 Frau meat NaN
1 Panda bread NaN
2 Jim ice cream water
3 Adam noodles vermouth
name food people water 0 Frau meat NaN NaN 1 Panda bread NaN NaN 2 Jim ice cream Jim water 3 Adam noodles Adam vermouth
popu= pd. read_csv( 'H:\data\state-population.csv' )
areas= pd. read_csv( 'H:\data\state-areas.csv' )
abbrevs= pd. read_csv( 'H:\data\state-abbrevs.csv' )
popu. head( ) , areas. head( ) , abbrevs. head( )
( state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0,
state area (sq. mi)
0 Alabama 52423
1 Alaska 656425
2 Arizona 114006
3 Arkansas 53182
4 California 163707,
state abbreviation
0 Alabama AL
1 Alaska AK
2 Arizona AZ
3 Arkansas AR
4 California CA)
m1= pd. merge( popu, abbrevs, how= 'outer' , left_on= 'state/region' , right_on= 'abbreviation' )
m1= m1. drop( 'abbreviation' , axis= 1 )
m1. head( )
state/region ages year population state 0 AL under18 2012 1117489.0 Alabama 1 AL total 2012 4817528.0 Alabama 2 AL under18 2010 1130966.0 Alabama 3 AL total 2010 4785570.0 Alabama 4 AL under18 2011 1125763.0 Alabama
m1. isnull( ) . any ( )
state/region False
ages False
year False
population True
state True
dtype: bool
m1[ m1[ 'population' ] . isnull( ) ] . head( )
state/region ages year population state 2448 PR under18 1990 NaN NaN 2449 PR total 1990 NaN NaN 2450 PR total 1991 NaN NaN 2451 PR under18 1991 NaN NaN 2452 PR total 1993 NaN NaN
m1. loc[ m1[ 'state' ] . isnull( ) , 'state/region' ] . unique( )
array(['PR', 'USA'], dtype=object)
m1. loc[ m1[ 'state/region' ] == 'PR' , 'state' ] = 'Puerto Rico'
m1. loc[ m1[ 'state/region' ] == 'USA' , 'state' ] = 'United States'
final= pd. merge( m1, areas, how= 'outer' )
final. head( )
state/region ages year population state area (sq. mi) 0 AL under18 2012 1117489.0 Alabama 52423.0 1 AL total 2012 4817528.0 Alabama 52423.0 2 AL under18 2010 1130966.0 Alabama 52423.0 3 AL total 2010 4785570.0 Alabama 52423.0 4 AL under18 2011 1125763.0 Alabama 52423.0
final. isnull( ) . any ( )
state/region False
ages False
year False
population True
state False
area (sq. mi) True
dtype: bool
final. loc[ final[ 'area (sq. mi)' ] . isnull( ) , 'state' ] . unique( )
array(['United States'], dtype=object)
final[ 'state' ] [ final[ 'area (sq. mi)' ] . isnull( ) ] . unique( )
array(['United States'], dtype=object)
final. dropna( inplace= True )
data2010= final. query( "year==2010&ages=='total'" )
data2010. set_index( 'state' , inplace= True )
density= data2010[ 'population' ] / data2010[ 'area (sq. mi)' ]
density. head( )
state
Alabama 91.287603
Alaska 1.087509
Arizona 56.214497
Arkansas 54.948667
California 228.051342
dtype: float64
density. sort_values( ascending= False , inplace= True )
density. head( )
density. tail( )
state
South Dakota 10.583512
North Dakota 9.537565
Montana 6.736171
Wyoming 5.768079
Alaska 1.087509
dtype: float64