数据聚合与分组运算
GroupBy技术
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
df = DataFrame( { 'key1' : [ 'a' , 'a' , 'b' , 'b' , 'a' ] ,
'key2' : [ 'one' , 'two' , 'one' , 'two' , 'one' ] ,
'data1' : np. random. randn( 5 ) ,
'data2' : np. random. randn( 5 ) } )
df
key1 key2 data1 data2 0 a one -0.074122 -0.571432 1 a two 0.347874 -0.794645 2 b one 0.399766 -0.596056 3 b two 1.209857 -0.266257 4 a one -0.001175 0.180895
grouped = df[ 'data1' ] . groupby( df[ 'key1' ] )
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped. mean( )
key1
a 0.090859
b 0.804812
Name: data1, dtype: float64
means = df[ 'data1' ] . groupby( [ df[ 'key1' ] , df[ 'key2' ] ] ) . mean( )
means
key1 key2
a one -0.037649
two 0.347874
b one 0.399766
two 1.209857
Name: data1, dtype: float64
means. unstack( )
key2 one two key1 a -0.037649 0.347874 b 0.399766 1.209857
states = np. array( [ 'Ohio' , 'California' , 'California' , 'Ohio' , 'Ohio' ] )
years = np. array( [ 2005 , 2005 , 2006 , 2005 , 2006 ] )
df[ 'data1' ] . groupby( [ states, years] ) . mean( )
California 2005 0.347874
2006 0.399766
Ohio 2005 0.567867
2006 -0.001175
Name: data1, dtype: float64
df. groupby( 'key1' ) . mean( )
data1 data2 key1 a 0.090859 -0.395061 b 0.804812 -0.431157
df. groupby( [ 'key1' , 'key2' ] ) . mean( )
data1 data2 key1 key2 a one -0.037649 -0.195268 two 0.347874 -0.794645 b one 0.399766 -0.596056 two 1.209857 -0.266257
df. groupby( [ 'key1' , 'key2' ] ) . size( )
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
对分组进行迭代
for name, group in df. groupby( 'key1' ) :
print ( name)
print ( group)
a
key1 key2 data1 data2
0 a one -0.074122 -0.571432
1 a two 0.347874 -0.794645
4 a one -0.001175 0.180895
b
key1 key2 data1 data2
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
for ( k1, k2) , group in df. groupby( [ 'key1' , 'key2' ] ) :
print ( k1, k2)
print ( group)
a one
key1 key2 data1 data2
0 a one -0.074122 -0.571432
4 a one -0.001175 0.180895
a two
key1 key2 data1 data2
1 a two 0.347874 -0.794645
b one
key1 key2 data1 data2
2 b one 0.399766 -0.596056
b two
key1 key2 data1 data2
3 b two 1.209857 -0.266257
pieces = dict ( list ( df. groupby( 'key1' ) ) )
pieces[ 'b' ]
key1 key2 data1 data2 2 b one 0.399766 -0.596056 3 b two 1.209857 -0.266257
df. dtypes
key1 object
key2 object
data1 float64
data2 float64
dtype: object
grouped = df. groupby( df. dtypes, axis= 1 )
dict ( list ( grouped) )
{dtype('float64'): data1 data2
0 -0.074122 -0.571432
1 0.347874 -0.794645
2 0.399766 -0.596056
3 1.209857 -0.266257
4 -0.001175 0.180895,
dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
选取一个或一组列
df. groupby( 'key1' ) [ 'data1' ]
df. groupby( 'key1' ) [ [ 'data2' ] ]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df. groupby( [ 'key1' , 'key2' ] ) [ [ 'data2' ] ] . mean( )
data2 key1 key2 a one -0.195268 two -0.794645 b one -0.596056 two -0.266257
s_grouped = df. groupby( [ 'key1' , 'key2' ] ) [ 'data2' ]
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped. mean( )
key1 key2
a one -0.195268
two -0.794645
b one -0.596056
two -0.266257
Name: data2, dtype: float64
通过字典或Series进行分组
people = DataFrame( np. random. randn( 5 , 5 ) ,
columns= [ 'a' , 'b' , 'c' , 'd' , 'e' ] ,
index= [ 'Joe' , 'Steve' , 'Wes' , 'Jim' , 'Travis' ] )
people. loc[ 2 : 3 , [ 'b' , 'c' ] ] = np. nan
people
C:\windows\ FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead.
people.loc[2:3,['b','c']] = np.nan#添加几个NA值
a b c d e Joe 0.309327 1.658107 1.146959 -0.123471 0.159285 Steve 1.380735 -0.703245 0.158134 -1.602958 1.455772 Wes -0.766580 NaN NaN 0.074462 1.430541 Jim -0.615666 2.578830 -0.002766 0.885567 -0.375239 Travis -0.033534 1.158113 0.637327 1.473547 0.373215
mapping = { 'a' : 'red' , 'b' : 'red' , 'c' : 'blue' , 'd' : 'blue' , 'e' : 'red' , 'f' : 'orange' }
by_column = people. groupby( mapping, axis= 1 )
by_column. sum ( )
blue red Joe 1.023488 2.126719 Steve -1.444824 2.133263 Wes 0.074462 0.663960 Jim 0.882800 1.587925 Travis 2.110874 1.497794
map_series = Series( mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people. groupby( map_series, axis= 1 ) . count( )
blue red Joe 2 3 Steve 2 3 Wes 1 2 Jim 2 3 Travis 2 3
通过函数进行分组
people. groupby( len ) . sum ( )
a b c d e 3 -1.072920 4.236937 1.144193 0.836558 1.214587 5 1.380735 -0.703245 0.158134 -1.602958 1.455772 6 -0.033534 1.158113 0.637327 1.473547 0.373215
key_list = [ 'one' , 'one' , 'one' , 'two' , 'two' ]
people. groupby( [ len , key_list] ) . min ( )
a b c d e 3 one -0.766580 1.658107 1.146959 -0.123471 0.159285 two -0.615666 2.578830 -0.002766 0.885567 -0.375239 5 one 1.380735 -0.703245 0.158134 -1.602958 1.455772 6 two -0.033534 1.158113 0.637327 1.473547 0.373215
根据索引级别分组
columns = pd. MultiIndex. from_arrays( [ [ 'US' , 'US' , 'US' , 'JP' , 'JP' ] ,
[ 1 , 3 , 5 , 1 , 3 ] ] , names= [ 'cty' , 'tenor' ] )
hier_df = DataFrame( np. random. randn( 4 , 5 ) , columns= columns)
hier_df
cty US JP tenor 1 3 5 1 3 0 0.971689 -0.207027 0.641528 1.197729 -0.800907 1 0.906871 -0.087288 0.204273 -0.009374 0.637842 2 0.649755 -0.800055 -0.057130 -1.087200 0.435762 3 -0.618737 0.325816 -0.702310 -0.519860 -0.101653
hier_df. groupby( level= 'cty' , axis= 1 ) . count( )
数据聚合
grouped = df. groupby( 'key1' )
grouped[ 'data1' ] . quantile( 0.9 )
key1
a 0.278064
b 1.128848
Name: data1, dtype: float64
def peak_to_peak ( arr) :
return arr. max ( ) - arr. min ( )
grouped. agg( peak_to_peak)
C:\windows\TFutureWarning: ['key2'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
grouped.agg(peak_to_peak)
data1 data2 key1 a 0.421996 0.975541 b 0.810090 0.329799
grouped. describe( )
data1 data2 count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max key1 a 3.0 0.090859 0.22555 -0.074122 -0.037649 -0.001175 0.173349 0.347874 3.0 -0.395061 0.511126 -0.794645 -0.683039 -0.571432 -0.195268 0.180895 b 2.0 0.804812 0.57282 0.399766 0.602289 0.804812 1.007334 1.209857 2.0 -0.431157 0.233203 -0.596056 -0.513606 -0.431157 -0.348707 -0.266257
import matplotlib. pyplot as plt
from pylab import *
img = plt. imread( '经过优化的GroupBy的方法.png' )
imshow( img)
tips = pd. read_csv( "E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\tips.csv" )
tips[ 'tip_pct' ] = tips[ 'tip' ] / tips[ 'total_bill' ]
tips[ : 6 ]
total_bill tip smoker day time size tip_pct 0 16.99 1.01 No Sun Dinner 2 0.059447 1 10.34 1.66 No Sun Dinner 3 0.160542 2 21.01 3.50 No Sun Dinner 3 0.166587 3 23.68 3.31 No Sun Dinner 2 0.139780 4 24.59 3.61 No Sun Dinner 4 0.146808 5 25.29 4.71 No Sun Dinner 4 0.186240
面向列的多函数应用
grouped = tips. groupby( [ 'day' , 'smoker' ] )
grouped_pct = grouped[ 'tip_pct' ]
grouped_pct. agg( 'mean' )
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct. agg( [ 'mean' , 'std' , peak_to_peak] )
mean std peak_to_peak day smoker Fri No 0.151650 0.028123 0.067349 Yes 0.174783 0.051293 0.159925 Sat No 0.158048 0.039767 0.235193 Yes 0.147906 0.061375 0.290095 Sun No 0.160113 0.042347 0.193226 Yes 0.187250 0.154134 0.644685 Thur No 0.160298 0.038774 0.193350 Yes 0.163863 0.039389 0.151240
grouped_pct. agg( [ ( 'foo' , 'mean' ) , ( 'bar' , np. std) ] )
foo bar day smoker Fri No 0.151650 0.028123 Yes 0.174783 0.051293 Sat No 0.158048 0.039767 Yes 0.147906 0.061375 Sun No 0.160113 0.042347 Yes 0.187250 0.154134 Thur No 0.160298 0.038774 Yes 0.163863 0.039389
functions = [ 'count' , 'mean' , 'max' ]
result = grouped[ 'tip_pct' , 'total_bill' ] . agg( functions)
result
C:\windowFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
result = grouped['tip_pct','total_bill'].agg(functions)
tip_pct total_bill count mean max count mean max day smoker Fri No 4 0.151650 0.187735 4 18.420000 22.75 Yes 15 0.174783 0.263480 15 16.813333 40.17 Sat No 45 0.158048 0.291990 45 19.661778 48.33 Yes 42 0.147906 0.325733 42 21.276667 50.81 Sun No 57 0.160113 0.252672 57 20.506667 48.17 Yes 19 0.187250 0.710345 19 24.120000 45.35 Thur No 45 0.160298 0.266312 45 17.113111 41.19 Yes 17 0.163863 0.241255 17 19.190588 43.11
result[ 'tip_pct' ]
count mean max day smoker Fri No 4 0.151650 0.187735 Yes 15 0.174783 0.263480 Sat No 45 0.158048 0.291990 Yes 42 0.147906 0.325733 Sun No 57 0.160113 0.252672 Yes 19 0.187250 0.710345 Thur No 45 0.160298 0.266312 Yes 17 0.163863 0.241255
ftuples = [ ( 'Durchschnitt' , 'mean' ) , ( 'Abweichung' , np. var) ]
grouped[ 'tip_pct' , 'total_bill' ] . agg( ftuples)
C:\windowsFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
grouped['tip_pct','total_bill'].agg(ftuples)
tip_pct total_bill Durchschnitt Abweichung Durchschnitt Abweichung day smoker Fri No 0.151650 0.000791 18.420000 25.596333 Yes 0.174783 0.002631 16.813333 82.562438 Sat No 0.158048 0.001581 19.661778 79.908965 Yes 0.147906 0.003767 21.276667 101.387535 Sun No 0.160113 0.001793 20.506667 66.099980 Yes 0.187250 0.023757 24.120000 109.046044 Thur No 0.160298 0.001503 17.113111 59.625081 Yes 0.163863 0.001551 19.190588 69.808518
grouped. agg( { 'tip' : np. max , 'size' : 'sum' } )
tip size day smoker Fri No 3.50 9 Yes 4.73 31 Sat No 9.00 115 Yes 10.00 104 Sun No 6.00 167 Yes 6.50 49 Thur No 6.70 112 Yes 5.00 40
grouped. agg( { 'tip_pct' : [ 'min' , 'max' , 'mean' , 'std' ] ,
'size' : 'sum' } )
tip_pct size min max mean std sum day smoker Fri No 0.120385 0.187735 0.151650 0.028123 9 Yes 0.103555 0.263480 0.174783 0.051293 31 Sat No 0.056797 0.291990 0.158048 0.039767 115 Yes 0.035638 0.325733 0.147906 0.061375 104 Sun No 0.059447 0.252672 0.160113 0.042347 167 Yes 0.065660 0.710345 0.187250 0.154134 49 Thur No 0.072961 0.266312 0.160298 0.038774 112 Yes 0.090014 0.241255 0.163863 0.039389 40
以“无索引”的形式返回聚合数据
tips. groupby( [ 'day' , 'smoker' ] , as_index= False ) . mean( )
day smoker total_bill tip size tip_pct 0 Fri No 18.420000 2.812500 2.250000 0.151650 1 Fri Yes 16.813333 2.714000 2.066667 0.174783 2 Sat No 19.661778 3.102889 2.555556 0.158048 3 Sat Yes 21.276667 2.875476 2.476190 0.147906 4 Sun No 20.506667 3.167895 2.929825 0.160113 5 Sun Yes 24.120000 3.516842 2.578947 0.187250 6 Thur No 17.113111 2.673778 2.488889 0.160298 7 Thur Yes 19.190588 3.030000 2.352941 0.163863
分组级运算和转换
k1_means = df. groupby( 'key1' ) . mean( ) . add_prefix( 'mean_' )
k1_means
mean_data1 mean_data2 key1 a 0.090859 -0.395061 b 0.804812 -0.431157
pd. merge( df, k1_means, left_on= 'key1' , right_index= True )
key1 key2 data1 data2 mean_data1 mean_data2 0 a one -0.074122 -0.571432 0.090859 -0.395061 1 a two 0.347874 -0.794645 0.090859 -0.395061 4 a one -0.001175 0.180895 0.090859 -0.395061 2 b one 0.399766 -0.596056 0.804812 -0.431157 3 b two 1.209857 -0.266257 0.804812 -0.431157
key = [ 'one' , 'two' , 'one' , 'two' , 'one' ]
people. groupby( key) . mean( )
a b c d e one -0.163596 1.408110 0.892143 0.474846 0.654347 two 0.382534 0.937792 0.077684 -0.358695 0.540267
people. groupby( key) . transform( np. mean)
a b c d e Joe -0.163596 1.408110 0.892143 0.474846 0.654347 Steve 0.382534 0.937792 0.077684 -0.358695 0.540267 Wes -0.163596 1.408110 0.892143 0.474846 0.654347 Jim 0.382534 0.937792 0.077684 -0.358695 0.540267 Travis -0.163596 1.408110 0.892143 0.474846 0.654347
def demean ( arr) :
return arr- arr. mean( )
demeaned = people. groupby( key) . transform( demean)
demeaned
a b c d e Joe 0.472923 0.249997 0.254816 -0.598317 -0.495062 Steve 0.998201 -1.641038 0.080450 -1.244262 0.915506 Wes -0.602985 NaN NaN -0.400384 0.776194 Jim -0.998201 1.641038 -0.080450 1.244262 -0.915506 Travis 0.130062 -0.249997 -0.254816 0.998701 -0.281132
demeaned. groupby( key) . mean( )
a b c d e one 2.775558e-17 0.000000e+00 -5.551115e-17 7.401487e-17 -1.110223e-16 two 0.000000e+00 1.110223e-16 -6.938894e-18 0.000000e+00 -5.551115e-17
apply:一般性的“拆分——应用——合并”
def top ( df, n= 5 , column= 'tip_pct' ) :
return df. sort_values( by= column) [ - n: ]
top( tips, n= 6 )
total_bill tip smoker day time size tip_pct 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 232 11.61 3.39 No Sat Dinner 2 0.291990 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips. groupby( 'smoker' ) . apply ( top)
total_bill tip smoker day time size tip_pct smoker No 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips. groupby( [ 'smoker' , 'day' ] ) . apply ( top, n= 1 , column= 'total_bill' )
total_bill tip smoker day time size tip_pct smoker day No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857 Sat 212 48.33 9.00 No Sat Dinner 4 0.186220 Sun 156 48.17 5.00 No Sun Dinner 6 0.103799 Thur 142 41.19 5.00 No Thur Lunch 5 0.121389 Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750 Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812 Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178 Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
result = tips. groupby( 'smoker' ) [ 'tip_pct' ] . describe( )
result
count mean std min 25% 50% 75% max smoker No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014 0.291990 Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 0.710345
result. unstack( 'smoker' )
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
当调用describe之类的方法时,实际上只是应用了以下两条代码的快捷方式: f = lambda x: x.describe() grouped.apply(f)
禁止分组键
tips. groupby( 'smoker' , group_keys= False ) . apply ( top)
total_bill tip smoker day time size tip_pct 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
分位数和桶分析
frame = DataFrame( { 'data1' : np. random. randn( 1000 ) ,
'data2' : np. random. randn( 1000 ) } )
factor = pd. cut( frame. data1, 4 )
factor[ : 10 ]
0 (-1.448, 0.107]
1 (-1.448, 0.107]
2 (-1.448, 0.107]
3 (-1.448, 0.107]
4 (0.107, 1.663]
5 (0.107, 1.663]
6 (0.107, 1.663]
7 (-1.448, 0.107]
8 (-1.448, 0.107]
9 (0.107, 1.663]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.01, -1.448] < (-1.448, 0.107] < (0.107, 1.663] < (1.663, 3.218]]
def get_stats ( group) :
return { 'min' : group. min ( ) , 'max' : group. max ( ) ,
'count' : group. count( ) , 'mean' : group. mean( ) }
grouped = frame. data2. groupby( factor)
grouped. apply ( get_stats) . unstack( )
min max count mean data1 (-3.01, -1.448] -2.614910 2.368046 70.0 -0.092146 (-1.448, 0.107] -2.534962 2.783160 479.0 0.009041 (0.107, 1.663] -3.073771 2.513553 398.0 -0.091291 (1.663, 3.218] -2.699080 2.373634 53.0 -0.099021
grouping = pd. qcut( frame. data1, 10 , labels= False )
grouped = frame. data2. groupby( grouping)
grouped. apply ( get_stats) . unstack( )
min max count mean data1 0 -2.614910 2.783160 100.0 0.006906 1 -2.534962 2.490249 100.0 -0.101695 2 -2.015862 2.261854 100.0 0.084059 3 -2.250966 2.509572 100.0 -0.000924 4 -2.068747 2.425219 100.0 0.119523 5 -2.913492 2.032037 100.0 -0.233505 6 -2.432055 1.983781 100.0 -0.038541 7 -2.339164 2.046824 100.0 -0.096358 8 -3.073771 2.235941 100.0 -0.091584 9 -2.699080 2.513553 100.0 -0.084895
示例:用特定于分组的值填充缺失值
s = Series( np. random. randn( 6 ) )
s[ : : 2 ] = np. nan
s
0 NaN
1 0.209858
2 NaN
3 1.379023
4 NaN
5 -0.743300
dtype: float64
s. fillna( s. mean( ) )
0 0.281860
1 0.209858
2 0.281860
3 1.379023
4 0.281860
5 -0.743300
dtype: float64
states = [ 'Ohio' , 'New York' , 'Vermont' , 'Florida' ,
'Oregon' , 'Nevada' , 'California' , 'Idaho' ]
group_key = [ 'East' ] * 4 + [ 'West' ] * 4
data = Series( np. random. randn( 8 ) , index= states)
data[ [ 'Vermont' , 'Nevada' , 'Idaho' ] ] = np. nan
data
Ohio 0.155978
New York -0.133767
Vermont NaN
Florida -0.765162
Oregon 0.682524
Nevada NaN
California 0.730390
Idaho NaN
dtype: float64
data. groupby( group_key) . mean( )
East -0.247650
West 0.706457
dtype: float64
fill_mean = lambda g: g. fillna( g. mean( ) )
data. groupby( group_key) . apply ( fill_mean)
Ohio 0.155978
New York -0.133767
Vermont -0.247650
Florida -0.765162
Oregon 0.682524
Nevada 0.706457
California 0.730390
Idaho 0.706457
dtype: float64
fill_values = { 'East' : 0.5 , 'West' : - 1 }
fill_func = lambda g: g. fillna( fill_values[ g. name] )
data. groupby( group_key) . apply ( fill_func)
Ohio 0.155978
New York -0.133767
Vermont 0.500000
Florida -0.765162
Oregon 0.682524
Nevada -1.000000
California 0.730390
Idaho -1.000000
dtype: float64
示例:随机采样和排列
抽取的一个办法:选取np.random.permutation(N)的前K个元素,其中N为完整数据的大小,K为期望的样本大小。
suits = [ 'H' , 'S' , 'C' , 'D' ]
card_val = ( list ( range ( 1 , 11 ) ) + [ 10 ] * 3 ) * 4
base_names = [ 'A' ] + list ( range ( 2 , 11 ) ) + [ 'J' , 'K' , 'Q' ]
cards = [ ]
for suit in [ 'H' , 'S' , 'C' , 'D' ] :
cards. extend( str ( num) + suit for num in base_names)
deck = Series( card_val, index= cards)
deck[ : 13 ]
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw ( deck, n= 5 ) :
return deck. take( np. random. permutation( len ( deck) ) [ : n] )
draw( deck)
7H 7
4D 4
8H 8
QC 10
4S 4
dtype: int64
get_suit = lambda card: card[ - 1 ]
deck. groupby( get_suit) . apply ( draw, n= 2 )
C AC 1
JC 10
D 5D 5
8D 8
H 10H 10
JH 10
S 9S 9
5S 5
dtype: int64
deck. groupby( get_suit, group_keys= False ) . apply ( draw, n= 2 )
10C 10
AC 1
KD 10
10D 10
3H 3
9H 9
5S 5
8S 8
dtype: int64
示例:分组加权平均数和相关系数
df = DataFrame( { 'category' : [ 'a' , 'a' , 'a' , 'a' , 'b' , 'b' , 'b' , 'b' ] ,
'data' : np. random. randn( 8 ) ,
'weights' : np. random. randn( 8 ) } )
df
category data weights 0 a 0.591317 -1.032939 1 a -0.589692 0.436704 2 a -0.128848 2.257153 3 a -0.774626 0.811910 4 b -2.050679 1.144802 5 b 1.216111 0.736471 6 b -0.801366 0.139008 7 b -1.577430 -0.576198
grouped = df. groupby( 'category' )
get_wavg = lambda g: np. average( g[ 'data' ] , weights= g[ 'weights' ] )
grouped. apply ( get_wavg)
category
a -0.723088
b -0.453212
dtype: float64
close_px = pd. read_csv( "E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv" , parse_dates= True , index_col= 0 )
close_px
AA AAPL GE IBM JNJ MSFT PEP SPX XOM 1990-02-01 4.98 7.86 2.87 16.79 4.27 0.51 6.04 328.79 6.12 1990-02-02 5.04 8.00 2.87 16.89 4.37 0.51 6.09 330.92 6.24 1990-02-05 5.07 8.18 2.87 17.32 4.34 0.51 6.05 331.85 6.25 1990-02-06 5.01 8.12 2.88 17.56 4.32 0.51 6.15 329.66 6.23 1990-02-07 5.04 7.77 2.91 17.93 4.38 0.51 6.17 333.75 6.33 ... ... ... ... ... ... ... ... ... ... 2011-10-10 10.09 388.81 16.14 186.62 64.43 26.94 61.87 1194.89 76.28 2011-10-11 10.30 400.29 16.14 185.00 63.96 27.00 60.95 1195.54 76.27 2011-10-12 10.05 402.19 16.40 186.12 64.33 26.96 62.70 1207.25 77.16 2011-10-13 10.10 408.43 16.22 186.82 64.23 27.18 62.36 1203.66 76.37 2011-10-14 10.26 422.00 16.60 190.53 64.72 27.27 62.24 1224.58 78.11
5472 rows × 9 columns
close_px[ - 4 : ]
AA AAPL GE IBM JNJ MSFT PEP SPX XOM 2011-10-11 10.30 400.29 16.14 185.00 63.96 27.00 60.95 1195.54 76.27 2011-10-12 10.05 402.19 16.40 186.12 64.33 26.96 62.70 1207.25 77.16 2011-10-13 10.10 408.43 16.22 186.82 64.23 27.18 62.36 1203.66 76.37 2011-10-14 10.26 422.00 16.60 190.53 64.72 27.27 62.24 1224.58 78.11
rets = close_px. pct_change( ) . dropna( )
spx_corr = lambda x: x. corrwith( x[ 'SPX' ] )
by_year = rets. groupby( lambda x: x. year)
by_year. apply ( spx_corr)
AA AAPL GE IBM JNJ MSFT PEP SPX XOM 1990 0.595024 0.545067 0.752187 0.738361 0.801145 0.586691 0.783168 1.0 0.517586 1991 0.453574 0.365315 0.759607 0.557046 0.646401 0.524225 0.641775 1.0 0.569335 1992 0.398180 0.498732 0.632685 0.262232 0.515740 0.492345 0.473871 1.0 0.318408 1993 0.259069 0.238578 0.447257 0.211269 0.451503 0.425377 0.385089 1.0 0.318952 1994 0.428549 0.268420 0.572996 0.385162 0.372962 0.436585 0.450516 1.0 0.395078 1995 0.291532 0.161829 0.519126 0.416390 0.315733 0.453660 0.413144 1.0 0.368752 1996 0.292344 0.191482 0.750724 0.388497 0.569232 0.564015 0.421477 1.0 0.538736 1997 0.564427 0.211435 0.827512 0.646823 0.703538 0.606171 0.509344 1.0 0.695653 1998 0.533802 0.379883 0.815243 0.623982 0.591988 0.698773 0.494213 1.0 0.369264 1999 0.099033 0.425584 0.710928 0.486167 0.517061 0.631315 0.336593 1.0 0.315383 2000 0.265359 0.440161 0.610362 0.445114 0.189765 0.538005 0.077525 1.0 0.084163 2001 0.624069 0.577152 0.794632 0.696038 0.111493 0.696447 0.133975 1.0 0.336869 2002 0.748021 0.580548 0.822373 0.716490 0.584758 0.784728 0.487211 1.0 0.759933 2003 0.690466 0.545582 0.777643 0.741775 0.562399 0.750534 0.541487 1.0 0.662775 2004 0.591485 0.374283 0.728626 0.601740 0.354690 0.588531 0.466854 1.0 0.557742 2005 0.564267 0.467540 0.675637 0.516846 0.444728 0.562374 0.489559 1.0 0.631010 2006 0.487638 0.428267 0.612388 0.598636 0.394026 0.406126 0.335054 1.0 0.518514 2007 0.642427 0.508118 0.796945 0.603906 0.568423 0.658770 0.651911 1.0 0.786264 2008 0.781057 0.681434 0.777337 0.833074 0.801005 0.804626 0.709264 1.0 0.828303 2009 0.735642 0.707103 0.713086 0.684513 0.603146 0.654902 0.541474 1.0 0.797921 2010 0.745700 0.710105 0.822285 0.783638 0.689896 0.730118 0.626655 1.0 0.839057 2011 0.882045 0.691931 0.864595 0.802730 0.752379 0.800996 0.592029 1.0 0.859975
by_year. apply ( lambda g: g[ 'AAPL' ] . corr( g[ 'MSFT' ] ) )
1990 0.408271
1991 0.266807
1992 0.450592
1993 0.236917
1994 0.361638
1995 0.258642
1996 0.147539
1997 0.196144
1998 0.364106
1999 0.329484
2000 0.275298
2001 0.563156
2002 0.571435
2003 0.486262
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
示例:面向分组的线性回归
import statsmodels. api as sm
def regress ( data, yvar, xvars) :
Y = data[ yvar]
X = data[ xvars]
X[ 'intercept' ] = 1 .
result = sm. OLS( Y, X) . fit( )
return result. params
by_year. apply ( regress, 'AAPL' , [ 'SPX' ] )
E:\python_study_files\python_pip\.venvs\lpthw\lib\site-packages\statsmodels\compat\pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import Int64Index as NumericIndex
SPX intercept 1990 1.512772 0.001395 1991 1.187351 0.000396 1992 1.832427 0.000164 1993 1.390470 -0.002657 1994 1.190277 0.001617 1995 0.858818 -0.001423 1996 0.829389 -0.001791 1997 0.749928 -0.001901 1998 1.164582 0.004075 1999 1.384989 0.003273 2000 1.733802 -0.002523 2001 1.676128 0.003122 2002 1.080795 -0.000219 2003 1.187770 0.000690 2004 1.363463 0.004201 2005 1.766415 0.003246 2006 1.645496 0.000080 2007 1.198761 0.003438 2008 0.968016 -0.001110 2009 0.879103 0.002954 2010 1.052608 0.001261 2011 0.806605 0.001514
透视表和交叉表
tips. pivot_table( index= [ 'day' , 'smoker' ] )
size tip tip_pct total_bill day smoker Fri No 2.250000 2.812500 0.151650 18.420000 Yes 2.066667 2.714000 0.174783 16.813333 Sat No 2.555556 3.102889 0.158048 19.661778 Yes 2.476190 2.875476 0.147906 21.276667 Sun No 2.929825 3.167895 0.160113 20.506667 Yes 2.578947 3.516842 0.187250 24.120000 Thur No 2.488889 2.673778 0.160298 17.113111 Yes 2.352941 3.030000 0.163863 19.190588
tips. pivot_table( [ 'tip_pct' , 'size' ] , index= [ 'time' , 'day' ] , columns= 'smoker' )
size tip_pct smoker No Yes No Yes time day Dinner Fri 2.000000 2.222222 0.139622 0.165347 Sat 2.555556 2.476190 0.158048 0.147906 Sun 2.929825 2.578947 0.160113 0.187250 Thur 2.000000 NaN 0.159744 NaN Lunch Fri 3.000000 1.833333 0.187735 0.188937 Thur 2.500000 2.352941 0.160311 0.163863
tips. pivot_table( [ 'tip_pct' , 'size' ] , index= [ 'time' , 'day' ] ,
columns= 'smoker' , margins= True )
size tip_pct smoker No Yes All No Yes All time day Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916 Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152 Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897 Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744 Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765 Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301 All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
tips. pivot_table( 'tip_pct' , index= [ 'time' , 'smoker' ] , columns= 'day' ,
aggfunc= len , margins= True )
day Fri Sat Sun Thur All time smoker Dinner No 3.0 45.0 57.0 1.0 106 Yes 9.0 42.0 19.0 NaN 70 Lunch No 1.0 NaN NaN 44.0 45 Yes 6.0 NaN NaN 17.0 23 All 19.0 87.0 76.0 62.0 244
tips. pivot_table( 'size' , index= [ 'time' , 'smoker' ] ,
columns= 'day' , aggfunc= 'sum' , fill_value= 0 )
day Fri Sat Sun Thur time smoker Dinner No 6 115 167 2 Yes 20 104 49 0 Lunch No 3 0 0 110 Yes 11 0 0 40
import matplotlib. pyplot as plt
from pylab import *
img = plt. imread( 'pivot_table的参数.png' )
imshow( img)
rows改为index,cols改为columns
交叉表:crosstab
pd. crosstab( [ tips. time, tips. day] , tips. smoker, margins= True )
smoker No Yes All time day Dinner Fri 3 9 12 Sat 45 42 87 Sun 57 19 76 Thur 1 0 1 Lunch Fri 1 6 7 Thur 44 17 61 All 151 93 244
示例:2012联邦选举委员会数据库
fec = pd. read_csv( "E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv" )
fec
C:\windowsDtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 368633403.0 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NaN NaN NaN SA17A 749073 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 724548253.0 NONE RETIRED 250.0 01-AUG-11 NaN NaN NaN SA17A 749073 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 719016467.0 NONE RETIRED 300.0 20-JUN-11 NaN NaN NaN SA17A 736166 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 1001726 C00500587 P20003281 Perry, Rick GORMAN, CHRIS D. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 5000.0 29-SEP-11 REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... NaN REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... SA17A 751678 1001727 C00500587 P20003281 Perry, Rick DUFFY, DAVID A. MR. INFO REQUESTED XX 99999 DUFFY EQUIPMENT COMPANY INC. BUSINESS OWNER 2500.0 30-SEP-11 NaN NaN NaN SA17A 751678 1001728 C00500587 P20003281 Perry, Rick GRANE, BRYAN F. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 500.0 29-SEP-11 NaN NaN NaN SA17A 751678 1001729 C00500587 P20003281 Perry, Rick TOLBERT, DARYL MR. INFO REQUESTED XX 99999 T.A.C.C. LONGWALL MAINTENANCE FOREMAN 500.0 30-SEP-11 NaN NaN NaN SA17A 751678 1001730 C00500587 P20003281 Perry, Rick ANDERSON, MARILEE MRS. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 2500.0 31-AUG-11 NaN NaN NaN SA17A 751678
1001731 rows × 16 columns
fec. loc[ 123456 ]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec. cand_nm. unique( )
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[ 2 ]
'Obama, Barack'
parties = { 'Bachmann, Michelle' : 'Republican' ,
'Cain, Herman' : 'Republican' ,
'Gingrich, Newt' : 'Republican' ,
'Huntsman, Jon' : 'Republican' ,
'Johnson, Gary Earl' : 'Republican' ,
'McCotter, Thaddeus G' : 'Republican' ,
'Obama, Barack' : 'Democrat' ,
'Paul, Ron' : 'Republican' ,
'Pawlenty, Timothy' : 'Republican' ,
'Perry, Rick' : 'Republican' ,
"Roemer, Charles E. 'Buddy' Ⅲ" : 'Republican' ,
'Romney, Mitt' : 'Republican' ,
'Santorum, Rick' : 'Republican' }
fec. cand_nm[ 123456 : 123461 ]
125611 Obama, Barack
125612 Obama, Barack
125613 Obama, Barack
125614 Obama, Barack
125615 Obama, Barack
Name: cand_nm, dtype: object
fec. cand_nm[ 123456 : 123461 ] . map ( parties)
125611 Democrat
125612 Democrat
125613 Democrat
125614 Democrat
125615 Democrat
Name: cand_nm, dtype: object
fec[ 'party' ] = fec. cand_nm. map ( parties)
fec[ 'party' ] . value_counts( )
Democrat 589127
Republican 396504
Name: party, dtype: int64
( fec. contb_receipt_amt> 0 ) . value_counts( )
True 991475
Name: contb_receipt_amt, dtype: int64
fec = fec[ fec. contb_receipt_amt> 0 ]
fec_mrbo = fec[ fec. cand_nm. isin( [ 'Obama, Barack' , 'Romney, Mitt' ] ) ]
雇主职业和雇主统计赞助信息
fec. contbr_occupation. value_counts( ) [ : 10 ]
RETIRED 233990
NOT PROVIDED 56245
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
NOT EMPLOYED 9828
Name: contbr_occupation, dtype: int64
occ_mapping = { 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED' ,
'INFORMATION REQUESTED' : 'NOT PROVIDED' ,
'INFORMATIO REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED' ,
'C.E.O' : 'CEO' }
f = lambda x: occ_mapping. get( x, x)
fec. contbr_occupation = fec. contbr_occupation. map ( f)
emp_mapping = { 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED' ,
'INFORMATION REQUESTED' : 'NOT PROVIDED' ,
'SELF' : 'SELF-EMPLOYED' ,
'SELF EMPLOYED' : 'SELF-EMPLOTED'
}
f = lambda x: emp_mapping. get( x, x)
fec. contbr_employer = fec. contbr_employer. map ( f)
by_occupation = fec. pivot_table( 'contb_receipt_amt' , index= 'contbr_occupation' , columns= 'party' , aggfunc= 'sum' )
over_2mm = by_occupation[ by_occupation. sum ( 1 ) > 2000000 ]
over_2mm
party Democrat Republican contbr_occupation ATTORNEY 11141982.97 7462058.31 C.E.O. 1690.00 2592983.11 CEO 2074284.79 1638668.41 CONSULTANT 2459912.71 2538990.45 ENGINEER 951525.55 1811937.30 EXECUTIVE 1355161.05 4136400.09 HOMEMAKER 4248875.80 13625600.78 INVESTOR 884133.00 2431258.92 LAWYER 3160478.87 391124.32 MANAGER 762883.22 1441092.37 NOT PROVIDED 4866973.96 20216287.01 OWNER 1001567.36 2406081.92 PHYSICIAN 3735124.94 3587195.24 PRESIDENT 1878509.95 4717413.76 PROFESSOR 2165071.08 294032.73 REAL ESTATE 528902.09 1624507.25 RETIRED 25305116.38 23481023.18 SELF-EMPLOYED 672393.40 1636774.54
over_2mm. plot( kind= 'barh' )
def get_top_amounts ( group, key, n= 5 ) :
totals = group. groupby( key) [ 'contb_receipt_amt' ] . sum ( )
return totals. sort_values( ascending= False ) [ n: ]
grouped = fec_mrbo. groupby( 'cand_nm' )
grouped. apply ( get_top_amounts, 'contbr_occupation' , n= 7 )
cand_nm contbr_occupation
Obama, Barack PROFESSOR 2165071.08
CEO 2074284.79
PRESIDENT 1878509.95
NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
...
Romney, Mitt INDEPENDENT PROFESSIONAL 3.00
IFC CONTRACTING SOLUTIONS 3.00
REMODELER & SEMI RETIRED 3.00
AFFORDABLE REAL ESTATE DEVELOPER 3.00
3RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, Length: 35973, dtype: float64
grouped. apply ( get_top_amounts, 'contbr_employer' , n= 10 )
cand_nm contbr_employer
Obama, Barack REFUSED 149516.07
DLA PIPER 148235.00
HARVARD UNIVERSITY 131368.94
IBM 128490.93
GOOGLE 125302.88
...
Romney, Mitt UN 3.00
UPTOWN CHEAPSKATE 3.00
WILL MERRIFIELD 3.00
INDEPENDENT PROFESSIONAL 3.00
HONOLD COMMUNICTAIONS 3.00
Name: contb_receipt_amt, Length: 95890, dtype: float64
对出资额分组
bins = np. array( [ 0 , 1 , 10 , 100 , 1000 , 10000 , 100000 , 1000000 , 10000000 ] )
labels = pd. cut( fec_mrbo. contb_receipt_amt, bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo. groupby( [ 'cand_nm' , labels] )
grouped. size( ) . unstack( 0 )
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 493 77 (1, 10] 40070 3681 (10, 100] 372280 31853 (100, 1000] 153991 43357 (1000, 10000] 22284 26186 (10000, 100000] 2 1 (100000, 1000000] 3 0 (1000000, 10000000] 4 0
bucket_sums = grouped. contb_receipt_amt. sum ( ) . unstack( 0 )
bucket_sums
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 318.24 77.00 (1, 10] 337267.62 29819.66 (10, 100] 20288981.41 1987783.76 (100, 1000] 54798531.46 22363381.69 (1000, 10000] 51753705.67 63942145.42 (10000, 100000] 59100.00 12700.00 (100000, 1000000] 1490683.08 0.00 (1000000, 10000000] 7148839.76 0.00
normed_sums = bucket_sums. div( bucket_sums. sum ( axis= 1 ) , axis= 0 )
normed_sums
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 0.805182 0.194818 (1, 10] 0.918767 0.081233 (10, 100] 0.910769 0.089231 (100, 1000] 0.710176 0.289824 (1000, 10000] 0.447326 0.552674 (10000, 100000] 0.823120 0.176880 (100000, 1000000] 1.000000 0.000000 (1000000, 10000000] 1.000000 0.000000
normed_sums[ : 2 ] . plot( kind= 'barh' , stacked= True )
根据州统计赞助信息
grouped = fec_mrbo. groupby( [ 'cand_nm' , 'contbr_st' ] )
totals = grouped. contb_receipt_amt. sum ( ) . unstack( 0 ) . fillna( 0 )
totals = totals[ totals. sum ( 1 ) > 100000 ]
totals[ : 10 ]
cand_nm Obama, Barack Romney, Mitt contbr_st AK 281840.15 86204.24 AL 543123.48 527303.51 AR 359247.28 105556.00 AZ 1506476.98 1888436.23 CA 23824984.24 11237636.60 CO 2132429.49 1506714.12 CT 2068291.26 3499475.45 DC 4373538.80 1025137.50 DE 336669.14 82712.00 FL 7318178.58 8338458.81
percent = totals. div( totals. sum ( 1 ) , axis= 0 )
percent[ : 10 ]
cand_nm Obama, Barack Romney, Mitt contbr_st AK 0.765778 0.234222 AL 0.507390 0.492610 AR 0.772902 0.227098 AZ 0.443745 0.556255 CA 0.679498 0.320502 CO 0.585970 0.414030 CT 0.371476 0.628524 DC 0.810113 0.189887 DE 0.802776 0.197224 FL 0.467417 0.532583
from mpl_toolkits. basemap import Basemap, cm
import numpy as np
from matplotlib import rcParams
from matplotlib. collections import LineCollection
import matplotlib. pyplot as plt
import pyshp
import dbflib
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
5 import matplotlib.pyplot as plt
6 #from shapelib import ShapeFile
----> 7 import pyshp
8 import dbflib
ModuleNotFoundError: No module named 'pyshp'