《利用Python进行数据分析》示例数据
请结合提供的示例数据,分析代码的功能,并进行数据分析与可视化拓展
1.MoviesLens 1M数据集
GroupLens实验室提供了一些从MoviesLens用户那里收集的20世纪90年代末到21世纪初的电影评分数据的集合。这些数据提供了电影的评分、流派、年份和观众数据(年龄、邮编、性别、职业)。 MovisLens1M数据集包含6000个用户对4000部电影的100万个评分。数据分布在三个表格之中:分别包含评分、用户信息和电影信息。
unames = [ "user_id" , "gender" , "age" , "occupation" , "zip" ]
users = pd. read_table( "datasets/movielens/users.dat" , sep= "::" ,
header= None , names= unames, engine= "python" )
rnames = [ "user_id" , "movie_id" , "rating" , "timestamp" ]
ratings = pd. read_table( "datasets/movielens/ratings.dat" , sep= "::" ,
header= None , names= rnames, engine= "python" )
mnames = [ "movie_id" , "title" , "genres" ]
movies = pd. read_table( "datasets/movielens/movies.dat" , sep= "::" ,
header= None , names= mnames, engine= "python" )
users. head( 5 )
ratings. head( 5 )
movies. head( 5 )
ratings
user_id movie_id rating timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968 3 1 3408 4 978300275 4 1 2355 5 978824291 ... ... ... ... ... 1000204 6040 1091 1 956716541 1000205 6040 1094 5 956704887 1000206 6040 562 5 956704746 1000207 6040 1096 4 956715648 1000208 6040 1097 4 956715569
1000209 rows × 4 columns
data = pd. merge( pd. merge( ratings, users) , movies)
data
data. iloc[ 0 ]
user_id 1
movie_id 1193
rating 5
timestamp 978300760
gender F
age 1
occupation 10
zip 48067
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 0, dtype: object
mean_ratings = data. pivot_table( "rating" , index= "title" ,
columns= "gender" , aggfunc= "mean" )
mean_ratings. head( 5 )
gender F M title $1,000,000 Duck (1971) 3.375000 2.761905 'Night Mother (1986) 3.388889 3.352941 'Til There Was You (1997) 2.675676 2.733333 'burbs, The (1989) 2.793478 2.962085 ...And Justice for All (1979) 3.828571 3.689024
ratings_by_title = data. groupby( "title" ) . size( )
ratings_by_title. head( )
active_titles = ratings_by_title. index[ ratings_by_title >= 250 ]
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
mean_ratings = mean_ratings. loc[ active_titles]
mean_ratings
gender F M title 'burbs, The (1989) 2.793478 2.962085 10 Things I Hate About You (1999) 3.646552 3.311966 101 Dalmatians (1961) 3.791444 3.500000 101 Dalmatians (1996) 3.240000 2.911215 12 Angry Men (1957) 4.184397 4.328421 ... ... ... Young Guns (1988) 3.371795 3.425620 Young Guns II (1990) 2.934783 2.904025 Young Sherlock Holmes (1985) 3.514706 3.363344 Zero Effect (1998) 3.864407 3.723140 eXistenZ (1999) 3.098592 3.289086
1216 rows × 2 columns
mean_ratings = mean_ratings. rename( index= { "Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)" :
"Seven Samurai (Shichinin no samurai) (1954)" } )
使用rename( ) 函数来重命名mean_ratings DataFrame的索引。通过index参数指定要重命名的索引名称和对应的新名称。
在这个例子中,将"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)" 的索引名称更改为
"Seven Samurai (Shichinin no samurai) (1954)" 。
top_female_ratings = mean_ratings. sort_values( "F" , ascending= False )
top_female_ratings. head( )
gender F M title Close Shave, A (1995) 4.644444 4.473795 Wrong Trousers, The (1993) 4.588235 4.478261 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589 Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075 Schindler's List (1993) 4.562602 4.491415
mean_ratings[ "diff" ] = mean_ratings[ "M" ] - mean_ratings[ "F" ]
sorted_by_diff = mean_ratings. sort_values( "diff" )
sorted_by_diff. head( )
gender F M diff title Dirty Dancing (1987) 3.790378 2.959596 -0.830782 Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359 Grease (1978) 3.975265 3.367041 -0.608224 Little Women (1994) 3.870588 3.321739 -0.548849 Steel Magnolias (1989) 3.901734 3.365957 -0.535777
sorted_by_diff[ : : - 1 ] . head( )
gender F M diff title Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351 Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359 Dumb & Dumber (1994) 2.697987 3.336595 0.638608 Longest Day, The (1962) 3.411765 4.031447 0.619682 Cable Guy, The (1996) 2.250000 2.863787 0.613787
rating_std_by_title = data. groupby( "title" ) [ "rating" ] . std( )
rating_std_by_title = rating_std_by_title. loc[ active_titles]
rating_std_by_title. head( )
title
'burbs, The (1989) 1.107760
10 Things I Hate About You (1999) 0.989815
101 Dalmatians (1961) 0.982103
101 Dalmatians (1996) 1.098717
12 Angry Men (1957) 0.812731
Name: rating, dtype: float64
rating_std_by_title. sort_values( ascending= False ) [ : 10 ]
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Name: rating, dtype: float64
movies[ "genres" ] . head( )
movies[ "genres" ] . head( ) . str . split( "|" )
movies[ "genre" ] = movies. pop( "genres" ) . str . split( "|" )
movies. head( )
movie_id title genre 0 1 Toy Story (1995) [Animation, Children's, Comedy] 1 2 Jumanji (1995) [Adventure, Children's, Fantasy] 2 3 Grumpier Old Men (1995) [Comedy, Romance] 3 4 Waiting to Exhale (1995) [Comedy, Drama] 4 5 Father of the Bride Part II (1995) [Comedy]
movies_exploded = movies. explode( "genre" )
movies_exploded[ : 10 ]
movie_id title genre 0 1 Toy Story (1995) Animation 0 1 Toy Story (1995) Children's 0 1 Toy Story (1995) Comedy 1 2 Jumanji (1995) Adventure 1 2 Jumanji (1995) Children's 1 2 Jumanji (1995) Fantasy 2 3 Grumpier Old Men (1995) Comedy 2 3 Grumpier Old Men (1995) Romance 3 4 Waiting to Exhale (1995) Comedy 3 4 Waiting to Exhale (1995) Drama
ratings_with_genre = pd. merge( pd. merge( movies_exploded, ratings) , users)
ratings_with_genre. iloc[ 0 ]
genre_ratings = ( ratings_with_genre. groupby( [ "genre" , "age" ] )
[ "rating" ] . mean( )
. unstack( "age" ) )
genre_ratings[ : 10 ]
age 1 18 25 35 45 50 56 genre Action 3.506385 3.447097 3.453358 3.538107 3.528543 3.611333 3.610709 Adventure 3.449975 3.408525 3.443163 3.515291 3.528963 3.628163 3.649064 Animation 3.476113 3.624014 3.701228 3.740545 3.734856 3.780020 3.756233 Children's 3.241642 3.294257 3.426873 3.518423 3.527593 3.556555 3.621822 Comedy 3.497491 3.460417 3.490385 3.561984 3.591789 3.646868 3.650949 Crime 3.710170 3.668054 3.680321 3.733736 3.750661 3.810688 3.832549 Documentary 3.730769 3.865865 3.946690 3.953747 3.966521 3.908108 3.961538 Drama 3.794735 3.721930 3.726428 3.782512 3.784356 3.878415 3.933465 Fantasy 3.317647 3.353778 3.452484 3.482301 3.532468 3.581570 3.532700 Film-Noir 4.145455 3.997368 4.058725 4.064910 4.105376 4.175401 4.125932
2.美国1880-2010年的婴儿名字
美国社会保障局(SSA)提供了从1880年至现在的婴儿姓名频率的数据。可以使用这些数据做很多事情: 根据给定的名字对婴儿名字随时间的比例进行可视化 确定一个名字的相对排位 确定每年最受欢迎的名字,或者流行程度最高或最低的名字
!head - n 10 datasets/ babynames/ yob1880. txt
'head' 不是内部或外部命令,也不是可运行的程序
或批处理文件。
names1880 = pd. read_csv( "datasets/babynames/yob1880.txt" ,
names= [ "name" , "sex" , "births" ] )
names1880
name sex births 0 Mary F 7065 1 Anna F 2604 2 Emma F 2003 3 Elizabeth F 1939 4 Minnie F 1746 ... ... ... ... 1995 Woodie M 5 1996 Worthy M 5 1997 Wright M 5 1998 York M 5 1999 Zachariah M 5
2000 rows × 3 columns
names1880. groupby( "sex" ) [ "births" ] . sum ( )
sex
F 90993
M 110493
Name: births, dtype: int64
pieces = [ ]
for year in range ( 1880 , 2011 ) :
path = f"datasets/babynames/yob { year} .txt"
frame = pd. read_csv( path, names= [ "name" , "sex" , "births" ] ) 3 调用pd. read_csv( ) 函数来读取位于生成的路径的CSV文件。它为DataFrame的列分配了列名"name" 、"sex" 和"births" 。
frame[ "year" ] = year
pieces. append( frame)
names = pd. concat( pieces, ignore_index= True )
names
name sex births year 0 Mary F 7065 1880 1 Anna F 2604 1880 2 Emma F 2003 1880 3 Elizabeth F 1939 1880 4 Minnie F 1746 1880 ... ... ... ... ... 1690779 Zymaire M 5 2010 1690780 Zyonne M 5 2010 1690781 Zyquarius M 5 2010 1690782 Zyran M 5 2010 1690783 Zzyzx M 5 2010
1690784 rows × 4 columns
total_births = names. pivot_table( "births" , index= "year" ,
columns= "sex" , aggfunc= sum )
total_births. tail( )
total_births. plot( title= "Total births by sex and year" )
<AxesSubplot:title={'center':'Total births by sex and year'}, xlabel='year'>
def add_prop ( group) :
group[ "prop" ] = group[ "births" ] / group[ "births" ] . sum ( )
return group
names = names. groupby( [ "year" , "sex" ] , group_keys= False ) . apply ( add_prop)
names
name sex births year prop 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188 ... ... ... ... ... ... 1690779 Zymaire M 5 2010 0.000003 1690780 Zyonne M 5 2010 0.000003 1690781 Zyquarius M 5 2010 0.000003 1690782 Zyran M 5 2010 0.000003 1690783 Zzyzx M 5 2010 0.000003
1690784 rows × 5 columns
names. groupby( [ "year" , "sex" ] ) [ "prop" ] . sum ( )
year sex
1880 F 1.0
M 1.0
1881 F 1.0
M 1.0
1882 F 1.0
...
2008 M 1.0
2009 F 1.0
M 1.0
2010 F 1.0
M 1.0
Name: prop, Length: 262, dtype: float64
def get_top1000 ( group) :
return group. sort_values( "births" , ascending= False ) [ : 1000 ]
grouped = names. groupby( [ "year" , "sex" ] )
top1000 = grouped. apply ( get_top1000)
top1000. head( )
name sex births year prop year sex 1880 F 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188
top1000 = top1000. reset_index( drop= True )
top1000. head( )
name sex births year prop 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188
boys = top1000[ top1000[ "sex" ] == "M" ]
girls = top1000[ top1000[ "sex" ] == "F" ]
total_births = top1000. pivot_table( "births" , index= "year" ,
columns= "name" ,
aggfunc= sum )
total_births. info( )
subset = total_births[ [ "John" , "Harry" , "Mary" , "Marilyn" ] ]
subset. plot( subplots= True , figsize= ( 12 , 10 ) ,
title= "Number of births per year" )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB
array([<AxesSubplot:xlabel='year'>, <AxesSubplot:xlabel='year'>,
<AxesSubplot:xlabel='year'>, <AxesSubplot:xlabel='year'>],
dtype=object)
plt. figure( )
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
table = top1000. pivot_table( "prop" , index= "year" ,
columns= "sex" , aggfunc= sum )
table. plot( title= "Sum of table1000.prop by year and sex" ,
yticks= np. linspace( 0 , 1.2 , 13 ) )
<AxesSubplot:title={'center':'Sum of table1000.prop by year and sex'}, xlabel='year'>
df = boys[ boys[ "year" ] == 2010 ]
df
name sex births year prop 260877 Jacob M 21875 2010 0.011523 260878 Ethan M 17866 2010 0.009411 260879 Michael M 17133 2010 0.009025 260880 Jayden M 17030 2010 0.008971 260881 William M 16870 2010 0.008887 ... ... ... ... ... ... 261872 Camilo M 194 2010 0.000102 261873 Destin M 194 2010 0.000102 261874 Jaquan M 194 2010 0.000102 261875 Jaydan M 194 2010 0.000102 261876 Maxton M 193 2010 0.000102
1000 rows × 5 columns
prop_cumsum = df[ "prop" ] . sort_values( ascending= False ) . cumsum( )
prop_cumsum[ : 10 ]
prop_cumsum. searchsorted( 0.5 )
116
df = boys[ boys. year == 1900 ]
in1900 = df. sort_values( "prop" , ascending= False ) . prop. cumsum( )
in1900. searchsorted( 0.5 ) + 1
25
def get_quantile_count ( group, q= 0.5 ) :
group = group. sort_values( "prop" , ascending= False )
return group. prop. cumsum( ) . searchsorted( q) + 1
diversity = top1000. groupby( [ "year" , "sex" ] ) . apply ( get_quantile_count)
diversity = diversity. unstack( )
fig = plt. figure( )
<Figure size 1000x600 with 0 Axes>
diversity. head( )
diversity. plot( title= "Number of popular names in top 50%" )
<AxesSubplot:title={'center':'Number of popular names in top 50%'}, xlabel='year'>
def get_last_letter ( x) :
return x[ - 1 ]
last_letters = names[ "name" ] . map ( get_last_letter)
last_letters. name = "last_letter"
table = names. pivot_table( "births" , index= last_letters,
columns= [ "sex" , "year" ] , aggfunc= sum )
subtable = table. reindex( columns= [ 1910 , 1960 , 2010 ] , level= "year" )
subtable. head( )
sex F M year 1910 1960 2010 1910 1960 2010 last_letter a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0 b NaN 694.0 450.0 411.0 3912.0 38859.0 c 5.0 49.0 946.0 482.0 15476.0 23125.0 d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0 e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
subtable. sum ( )
letter_prop = subtable / subtable. sum ( )
letter_prop
sex F M year 1910 1960 2010 1910 1960 2010 last_letter a 0.273390 0.341853 0.381240 0.005031 0.002440 0.014980 b NaN 0.000343 0.000256 0.002116 0.001834 0.020470 c 0.000013 0.000024 0.000538 0.002482 0.007257 0.012181 d 0.017028 0.001844 0.001482 0.113858 0.122908 0.023387 e 0.336941 0.215133 0.178415 0.147556 0.083853 0.067959 ... ... ... ... ... ... ... v NaN 0.000060 0.000117 0.000113 0.000037 0.001434 w 0.000020 0.000031 0.001182 0.006329 0.007711 0.016148 x 0.000015 0.000037 0.000727 0.003965 0.001851 0.008614 y 0.110972 0.152569 0.116828 0.077349 0.160987 0.058168 z 0.002439 0.000659 0.000704 0.000170 0.000184 0.001831
26 rows × 6 columns
import matplotlib. pyplot as plt
fig, axes = plt. subplots( 2 , 1 , figsize= ( 10 , 8 ) )
letter_prop[ "M" ] . plot( kind= "bar" , rot= 0 , ax= axes[ 0 ] , title= "Male" )
letter_prop[ "F" ] . plot( kind= "bar" , rot= 0 , ax= axes[ 1 ] , title= "Female" ,
legend= False )
<AxesSubplot:title={'center':'Female'}, xlabel='last_letter'>
plt. subplots_adjust( hspace= 0.25 )
<Figure size 1000x600 with 0 Axes>
letter_prop = table / table. sum ( )
dny_ts = letter_prop. loc[ [ "d" , "n" , "y" ] , "M" ] . T
dny_ts. head( )
last_letter d n y year 1880 0.083055 0.153213 0.075760 1881 0.083247 0.153214 0.077451 1882 0.085340 0.149560 0.077537 1883 0.084066 0.151646 0.079144 1884 0.086120 0.149915 0.080405
plt. close( "all" )
fig = plt. figure( )
<Figure size 1000x600 with 0 Axes>
dny_ts. plot( )
<AxesSubplot:xlabel='year'>
all_names = pd. Series( top1000[ "name" ] . unique( ) )
lesley_like = all_names[ all_names. str . contains( "Lesl" ) ]
lesley_like
632 Leslie
2294 Lesley
4262 Leslee
4728 Lesli
6103 Lesly
dtype: object
filtered = top1000[ top1000[ "name" ] . isin( lesley_like) ]
filtered. groupby( "name" ) [ "births" ] . sum ( )
name
Leslee 1082
Lesley 35022
Lesli 929
Leslie 370429
Lesly 10067
Name: births, dtype: int64
table = filtered. pivot_table( "births" , index= "year" ,
columns= "sex" , aggfunc= "sum" )
table = table. div( table. sum ( axis= "columns" ) , axis= "index" )
table. tail( )
sex F M year 2006 1.0 NaN 2007 1.0 NaN 2008 1.0 NaN 2009 1.0 NaN 2010 1.0 NaN
fig = plt. figure( )
<Figure size 1000x600 with 0 Axes>
table. plot( style= { "M" : "k-" , "F" : "k--" } )
<AxesSubplot:xlabel='year'>
3.美国农业部视频数据库
美国农业部提供了食物营养信息数据库。每种事务都有一些识别属性以及两份营养元素和营养比例的列表。这种形式的数据不适合分析,所以需要做一些工作将数据转换成更好的形式。
import json
db = json. load( open ( "datasets/usda_food/database.json" ) )
len ( db)
6636
db[ 0 ] . keys( )
db[ 0 ] [ "nutrients" ] [ 0 ]
nutrients = pd. DataFrame( db[ 0 ] [ "nutrients" ] )
nutrients. head( 7 )
value units description group 0 25.18 g Protein Composition 1 29.20 g Total lipid (fat) Composition 2 3.06 g Carbohydrate, by difference Composition 3 3.28 g Ash Other 4 376.00 kcal Energy Energy 5 39.28 g Water Composition 6 1573.00 kJ Energy Energy
info_keys = [ "description" , "group" , "id" , "manufacturer" ]
info = pd. DataFrame( db, columns= info_keys)
info. head( )
info. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 description 6636 non-null object
1 group 6636 non-null object
2 id 6636 non-null int64
3 manufacturer 5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB
pd. value_counts( info[ "group" ] ) [ : 10 ]
Vegetables and Vegetable Products 812
Beef Products 618
Baked Products 496
Breakfast Cereals 403
Legumes and Legume Products 365
Fast Foods 365
Lamb, Veal, and Game Products 345
Sweets 341
Fruits and Fruit Juices 328
Pork Products 328
Name: group, dtype: int64
nutrients = [ ]
for rec in db:
fnuts = pd. DataFrame( rec[ "nutrients" ] )
fnuts[ "id" ] = rec[ "id" ]
nutrients. append( fnuts)
nutrients = pd. concat( nutrients, ignore_index= True )
nutrients
value units description group id 0 25.180 g Protein Composition 1008 1 29.200 g Total lipid (fat) Composition 1008 2 3.060 g Carbohydrate, by difference Composition 1008 3 3.280 g Ash Other 1008 4 376.000 kcal Energy Energy 1008 ... ... ... ... ... ... 389350 0.000 mcg Vitamin B-12, added Vitamins 43546 389351 0.000 mg Cholesterol Other 43546 389352 0.072 g Fatty acids, total saturated Other 43546 389353 0.028 g Fatty acids, total monounsaturated Other 43546 389354 0.041 g Fatty acids, total polyunsaturated Other 43546
389355 rows × 5 columns
nutrients. duplicated( ) . sum ( )
nutrients = nutrients. drop_duplicates( )
col_mapping = { "description" : "food" ,
"group" : "fgroup" }
info = info. rename( columns= col_mapping, copy= False )
info. info( )
col_mapping = { "description" : "nutrient" ,
"group" : "nutgroup" }
nutrients = nutrients. rename( columns= col_mapping, copy= False )
nutrients
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 food 6636 non-null object
1 fgroup 6636 non-null object
2 id 6636 non-null int64
3 manufacturer 5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB
value units nutrient nutgroup id 0 25.180 g Protein Composition 1008 1 29.200 g Total lipid (fat) Composition 1008 2 3.060 g Carbohydrate, by difference Composition 1008 3 3.280 g Ash Other 1008 4 376.000 kcal Energy Energy 1008 ... ... ... ... ... ... 389350 0.000 mcg Vitamin B-12, added Vitamins 43546 389351 0.000 mg Cholesterol Other 43546 389352 0.072 g Fatty acids, total saturated Other 43546 389353 0.028 g Fatty acids, total monounsaturated Other 43546 389354 0.041 g Fatty acids, total polyunsaturated Other 43546
375176 rows × 5 columns
ndata = pd. merge( nutrients, info, on= "id" )
ndata. info( )
ndata. iloc[ 30000 ]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 375176 entries, 0 to 375175
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 value 375176 non-null float64
1 units 375176 non-null object
2 nutrient 375176 non-null object
3 nutgroup 375176 non-null object
4 id 375176 non-null int64
5 food 375176 non-null object
6 fgroup 375176 non-null object
7 manufacturer 293054 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 25.8+ MB
value 0.04
units g
nutrient Glycine
nutgroup Amino Acids
id 6158
food Soup, tomato bisque, canned, condensed
fgroup Soups, Sauces, and Gravies
manufacturer
Name: 30000, dtype: object
fig = plt. figure( )
<Figure size 1000x600 with 0 Axes>
result = ndata. groupby( [ "nutrient" , "fgroup" ] ) [ "value" ] . quantile( 0.5 )
result[ "Zinc, Zn" ] . sort_values( ) . plot( kind= "barh" )
<AxesSubplot:ylabel='fgroup'>
by_nutrient = ndata. groupby( [ "nutgroup" , "nutrient" ] )
def get_maximum ( x) :
return x. loc[ x. value. idxmax( ) ]
max_foods = by_nutrient. apply ( get_maximum) [ [ "value" , "food" ] ]
max_foods[ "food" ] = max_foods[ "food" ] . str [ : 50 ]
max_foods. loc[ "Amino Acids" ] [ "food" ]
nutrient
Alanine Gelatins, dry powder, unsweetened
Arginine Seeds, sesame flour, low-fat
Aspartic acid Soy protein isolate
Cystine Seeds, cottonseed flour, low fat (glandless)
Glutamic acid Soy protein isolate
Glycine Gelatins, dry powder, unsweetened
Histidine Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINAL RE
Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Lysine Seal, bearded (Oogruk), meat, dried (Alaska Native
Methionine Fish, cod, Atlantic, dried and salted
Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Proline Gelatins, dry powder, unsweetened
Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Tryptophan Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTERNAT
Name: food, dtype: object
4.2012年联邦选举委员会数据库
美国联邦选举委员会公布了有关政治运动贡献的数据。这些数据包括捐赠者姓名、职业和雇主、地址和缴费金额。你可以尝试做一下的分析: 按职业和雇主的捐赠统计 按捐赠金额统计 按州进行统计
fec = pd. read_csv( "datasets/fec/P00000001-ALL.csv" , low_memory= False )
fec. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cmte_id 1001731 non-null object
1 cand_id 1001731 non-null object
2 cand_nm 1001731 non-null object
3 contbr_nm 1001731 non-null object
4 contbr_city 1001712 non-null object
5 contbr_st 1001727 non-null object
6 contbr_zip 1001620 non-null object
7 contbr_employer 988002 non-null object
8 contbr_occupation 993301 non-null object
9 contb_receipt_amt 1001731 non-null float64
10 contb_receipt_dt 1001731 non-null object
11 receipt_desc 14166 non-null object
12 memo_cd 92482 non-null object
13 memo_text 97770 non-null object
14 form_tp 1001731 non-null object
15 file_num 1001731 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB
fec. iloc[ 123456 ]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec[ "cand_nm" ] . unique( )
unique_cands
unique_cands[ 2 ]
'Obama, Barack'
parties = { "Bachmann, Michelle" : "Republican" ,
"Cain, Herman" : "Republican" ,
"Gingrich, Newt" : "Republican" ,
"Huntsman, Jon" : "Republican" ,
"Johnson, Gary Earl" : "Republican" ,
"McCotter, Thaddeus G" : "Republican" ,
"Obama, Barack" : "Democrat" ,
"Paul, Ron" : "Republican" ,
"Pawlenty, Timothy" : "Republican" ,
"Perry, Rick" : "Republican" ,
"Roemer, Charles E. 'Buddy' III" : "Republican" ,
"Romney, Mitt" : "Republican" ,
"Santorum, Rick" : "Republican" }
fec[ "cand_nm" ] [ 123456 : 123461 ]
fec[ "cand_nm" ] [ 123456 : 123461 ] . map ( parties)
fec[ "party" ] = fec[ "cand_nm" ] . map ( parties)
fec[ "party" ] . value_counts( )
Democrat 593746
Republican 407985
Name: party, dtype: int64
( fec[ "contb_receipt_amt" ] > 0 ) . value_counts( )
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[ fec[ "contb_receipt_amt" ] > 0 ]
fec_mrbo = fec[ fec[ "cand_nm" ] . isin( [ "Obama, Barack" , "Romney, Mitt" ] ) ]
fec[ "contbr_occupation" ] . value_counts( ) [ : 10 ]
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
Name: contbr_occupation, dtype: int64
occ_mapping = {
"INFORMATION REQUESTED PER BEST EFFORTS" : "NOT PROVIDED" ,
"INFORMATION REQUESTED" : "NOT PROVIDED" ,
"INFORMATION REQUESTED (BEST EFFORTS)" : "NOT PROVIDED" ,
"C.E.O." : "CEO"
}
def get_occ ( x) :
return occ_mapping. get( x, x)
fec[ "contbr_occupation" ] = fec[ "contbr_occupation" ] . map ( get_occ)
emp_mapping = {
"INFORMATION REQUESTED PER BEST EFFORTS" : "NOT PROVIDED" ,
"INFORMATION REQUESTED" : "NOT PROVIDED" ,
"SELF" : "SELF-EMPLOYED" ,
"SELF EMPLOYED" : "SELF-EMPLOYED" ,
}
def get_emp ( x) :
return emp_mapping. get( x, x)
fec[ "contbr_employer" ] = fec[ "contbr_employer" ] . map ( get_emp)
by_occupation = fec. pivot_table( "contb_receipt_amt" ,
index= "contbr_occupation" ,
columns= "party" , aggfunc= "sum" )
over_2mm = by_occupation[ by_occupation. sum ( axis= "columns" ) > 2000000 ]
over_2mm
party Democrat Republican contbr_occupation ATTORNEY 11141982.97 7477194.43 CEO 2074974.79 4211040.52 CONSULTANT 2459912.71 2544725.45 ENGINEER 951525.55 1818373.70 EXECUTIVE 1355161.05 4138850.09 HOMEMAKER 4248875.80 13634275.78 INVESTOR 884133.00 2431768.92 LAWYER 3160478.87 391224.32 MANAGER 762883.22 1444532.37 NOT PROVIDED 4866973.96 20565473.01 OWNER 1001567.36 2408286.92 PHYSICIAN 3735124.94 3594320.24 PRESIDENT 1878509.95 4720923.76 PROFESSOR 2165071.08 296702.73 REAL ESTATE 528902.09 1625902.25 RETIRED 25305116.38 23561244.49 SELF-EMPLOYED 672393.40 1640252.54
plt. figure( )
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
over_2mm. plot( kind= "barh" )
<AxesSubplot:ylabel='contbr_occupation'>
def get_top_amounts ( group, key, n= 5 ) :
totals = group. groupby( key) [ "contb_receipt_amt" ] . sum ( )
return totals. nlargest( n)
grouped = fec_mrbo. groupby( "cand_nm" )
grouped. apply ( get_top_amounts, "contbr_occupation" , n= 7 )
grouped. apply ( get_top_amounts, "contbr_employer" , n= 10 )
cand_nm contbr_employer
Obama, Barack RETIRED 22694358.85
SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64
bins = np. array( [ 0 , 1 , 10 , 100 , 1000 , 10000 ,
100_000 , 1_000_000 , 10_000_000 ] )
labels = pd. cut( fec_mrbo[ "contb_receipt_amt" ] , bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo. groupby( [ "cand_nm" , labels] )
grouped. size( ) . unstack( level= 0 )
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 493 77 (1, 10] 40070 3681 (10, 100] 372280 31853 (100, 1000] 153991 43357 (1000, 10000] 22284 26186 (10000, 100000] 2 1 (100000, 1000000] 3 0 (1000000, 10000000] 4 0
plt. figure( )
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
bucket_sums = grouped[ "contb_receipt_amt" ] . sum ( ) . unstack( level= 0 )
normed_sums = bucket_sums. div( bucket_sums. sum ( axis= "columns" ) ,
axis= "index" )
normed_sums
normed_sums[ : - 2 ] . plot( kind= "barh" )
<AxesSubplot:ylabel='contb_receipt_amt'>
grouped = fec_mrbo. groupby( [ "cand_nm" , "contbr_st" ] )
totals = grouped[ "contb_receipt_amt" ] . sum ( ) . unstack( level= 0 ) . fillna( 0 )
totals = totals[ totals. sum ( axis= "columns" ) > 100000 ]
totals. head( 10 )
cand_nm Obama, Barack Romney, Mitt contbr_st AK 281840.15 86204.24 AL 543123.48 527303.51 AR 359247.28 105556.00 AZ 1506476.98 1888436.23 CA 23824984.24 11237636.60 CO 2132429.49 1506714.12 CT 2068291.26 3499475.45 DC 4373538.80 1025137.50 DE 336669.14 82712.00 FL 7318178.58 8338458.81
percent = totals. div( totals. sum ( axis= "columns" ) , axis= "index" )
percent. head( 10 )
cand_nm Obama, Barack Romney, Mitt contbr_st AK 0.765778 0.234222 AL 0.507390 0.492610 AR 0.772902 0.227098 AZ 0.443745 0.556255 CA 0.679498 0.320502 CO 0.585970 0.414030 CT 0.371476 0.628524 DC 0.810113 0.189887 DE 0.802776 0.197224 FL 0.467417 0.532583