记录几道 pandas 练习题 (notes on a few pandas practice problems)
import pandas as pd

# Load the 2002-2018 Shanghai vehicle license-plate auction records
# and preview the first rows.
df = pd.read_csv('2002年-2018年上海机动车拍照拍卖.csv')
df.head()
Date Total number of license issued lowest price avg price Total number of applicants 0 2-Jan 1400 13600 14735 3718 1 2-Feb 1800 13100 14057 4590 2 2-Mar 2000 14300 14662 5190 3 2-Apr 2300 16000 16334 4806 4 2-May 2350 17800 18357 4665
# Winning rate for each month: licenses issued over number of applicants.
issued = df['Total number of license issued']
applicants = df['Total number of applicants']
df['中标率'] = issued / applicants
df.sort_values(by='Date')
Date Total number of license issued lowest price avg price Total number of applicants 中标率 98 10-Apr 8500 41000 41637 17313 0.490961 102 10-Aug 9000 39800 40169 16855 0.533966 106 10-Dec 9000 10400 15970 11224 0.801853 96 10-Feb 7500 38300 38620 18810 0.398724 95 10-Jan 8000 37800 38311 18975 0.421607 ... ... ... ... ... ... ... 85 9-Mar 6000 26600 27552 18575 0.323015 87 9-May 7200 28500 29100 16471 0.437132 93 9-Nov 8000 34900 35317 21902 0.365263 92 9-Oct 8000 33900 34402 22006 0.363537 91 9-Sep 8500 27200 29500 14906 0.570240
203 rows × 6 columns
# First month whose winning rate fell below 5%.
# Vectorized boolean mask instead of the original iterrows()+break loop:
# same first-match-in-index-order semantics, no per-row Python iteration.
low = df[df['中标率'] < 0.05]
if not low.empty:
    print(low['Date'].iloc[0])
15-May
# Split 'Date' (e.g. '2-Jan') into a 4-digit year and a month name
# using the vectorized string accessor instead of per-row apply().
parts = df['Date'].str.split('-')
df['年份'] = parts.str[0].astype(int) + 2000
df['月份'] = parts.str[1]
# Put year/month first, then the four numeric columns; the 1:-3 slice
# skips 'Date' at the front and 中标率/年份/月份 at the end.
newcolumns = ['年份', '月份'] + list(df.columns[1:-3])
df1 = df.reindex(columns=newcolumns).copy()
df1.head()
年份 月份 Total number of license issued lowest price avg price Total number of applicants 0 2002 Jan 1400 13600 14735 3718 1 2002 Feb 1800 13100 14057 4590 2 2002 Mar 2000 14300 14662 5190 3 2002 Apr 2300 16000 16334 4806 4 2002 May 2350 17800 18357 4665
import numpy as np  # kept: the original cell imported numpy here

# Yearly statistics of the lowest winning price: max, mean and 75th percentile.
# String aggregation names ('max', 'mean') replace the original
# [max, np.mean] — passing builtin/np callables to agg is deprecated in
# newer pandas and emits a FutureWarning; the resulting column names
# ('max', 'mean') are unchanged.
by_year = df1['lowest price '].groupby(df1['年份'])
grouped = by_year.agg(['max', 'mean'])
grouped['0.75分位数'] = by_year.quantile(q=0.75)
grouped
max mean 0.75分位数 年份 2002 30800 20316.666667 24300.0 2003 38500 31983.333333 36300.0 2004 44200 29408.333333 38400.0 2005 37900 31908.333333 35600.0 2006 39900 37058.333333 39525.0 2007 53800 45691.666667 48950.0 2008 37300 29945.454545 34150.0 2009 36900 31333.333333 34150.0 2010 44900 38008.333333 41825.0 2011 53800 47958.333333 51000.0 2012 68900 61108.333333 65325.0 2013 90800 79125.000000 82550.0 2014 74600 73816.666667 74000.0 2015 85300 80575.000000 83450.0 2016 88600 85733.333333 87475.0 2017 93500 90616.666667 92350.0 2018 89000 87825.000000 88150.0
# Build a hierarchical index from the year plus the four numeric columns,
# leaving 月份 as the only remaining data column.
new_index_columns = ['年份', *df.columns[1:-3]]
df2 = df1.set_index(new_index_columns)
df2.head()
月份 年份 Total number of license issued lowest price avg price Total number of applicants 2002 1400 13600 14735 3718 Jan 1800 13100 14057 4590 Feb 2000 14300 14662 5190 Mar 2300 16000 16334 4806 Apr 2350 17800 18357 4665 May
# Find months where the lowest price and the average price moved in
# opposite directions compared with the previous month.
df3 = df1[ [ '年份' , '月份' , 'lowest price ' , 'avg price' ] ] . copy( )
# Shifted self-join: iloc[1:].reset_index() realigns row i of the left frame
# to original row i+1, so after the join row i pairs the NEXT month's prices
# (unsuffixed) with the CURRENT month's prices ('_lastmonth' suffix).
# '年份' does not collide and therefore comes from the current (earlier) row.
df3 = df3. iloc[ 1 : ] . reset_index( ) [ [ '月份' , 'lowest price ' , 'avg price' ] ] . join( df3, rsuffix= '_lastmonth' , how= 'outer' )
# Negative product <=> the two month-over-month deltas have opposite signs.
# NOTE(review): the displayed '年份' is the earlier month's year while '月份'
# is the later month's name, so Dec->Jan transitions print e.g. "2004 Jan"
# meaning Jan 2005 — confirm this labeling is intended.
df3[ ( ( df3[ 'lowest price ' ] - df3[ 'lowest price _lastmonth' ] ) * ( df3[ 'avg price' ] - df3[ 'avg price_lastmonth' ] ) ) < 0 ] [ [ '年份' , '月份' ] ]
年份 月份 20 2003 Oct 21 2003 Nov 28 2004 Jun 35 2004 Jan 36 2005 Feb 43 2005 Sep 51 2006 May 55 2006 Sep 59 2006 Jan 60 2007 Feb 70 2007 Dec 127 2012 Oct
df4 = df1.copy()
# Mean issuance of the previous two months, aligned to the current row:
# rolling(2).mean() then shift(1) replaces the original manual reindex trick
# (same values, clearer intent).
df4['mean-value-of-last-two-months'] = (
    df4['Total number of license issued'].rolling(2).mean().shift(1)
)
# BUG FIX: fillna returns a NEW frame; the original `df4.fillna(0)` discarded
# the result, so the leading NaNs were never actually filled. Assign it back.
df4 = df4.fillna(0)
df4['发行增益'] = df4['Total number of license issued'] - df4['mean-value-of-last-two-months']
# idxmax/idxmin return index LABELS, so use .loc rather than .iloc — the
# original only worked because the default RangeIndex makes labels equal
# positions.
df4.loc[[df4['发行增益'].idxmax(), df4['发行增益'].idxmin()]]
年份 月份 Total number of license issued lowest price avg price Total number of applicants mean-value-of-last-two-months 发行增益 72 2008 Jan 16000 8100 23370 20539 7500.0 8500.0 74 2008 Apr 9000 37300 37659 37072 12650.0 -3650.0
# Load the 2007-2019 Russian air-cargo traffic dataset and preview it.
airport_data = pd.read_csv('2007年-2019年俄罗斯货运航班运载量.csv')
airport_data.head()
Airport name Year January February March April May June July August September October November December Whole year Airport coordinates 0 Abakan 2019 44.70 66.21 72.7 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.5 89.8 917.57 (Decimal('91.399735'), Decimal('53.751351')) 1 Aikhal 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('111.543324'), Decimal('65.957161')) 2 Loss 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('125.398355'), Decimal('58.602489')) 3 Amderma 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('61.577429'), Decimal('69.759076')) 4 Anadyr (Carbon) 2019 81.63 143.01 260.9 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.0 199.0 1746.76 (Decimal('177.738273'), Decimal('64.713433'))
# Summary of columns, dtypes and non-null counts.
airport_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3711 entries, 0 to 3710
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Airport name 3711 non-null object
1 Year 3711 non-null int64
2 January 3711 non-null float64
3 February 3711 non-null float64
4 March 3711 non-null float64
5 April 3711 non-null float64
6 May 3711 non-null float64
7 June 3711 non-null float64
8 July 3711 non-null float64
9 August 3711 non-null float64
10 September 3711 non-null float64
11 October 3711 non-null float64
12 November 3711 non-null float64
13 December 3711 non-null float64
14 Whole year 3711 non-null float64
15 Airport coordinates 3711 non-null object
dtypes: float64(13), int64(1), object(2)
memory usage: 464.0+ KB
# Total cargo carried per year across all airports.
airport_data['Whole year'].groupby(airport_data['Year']).sum()
Year
2007 659438.23
2008 664682.46
2009 560809.77
2010 693033.98
2011 818691.71
2012 846388.03
2013 792337.08
2014 729457.12
2015 630208.97
2016 679370.15
2017 773662.28
2018 767095.28
2019 764606.27
Name: Whole year, dtype: float64
# Number of airport records reported each year.
airport_data['Airport name'].groupby(airport_data['Year']).count()
Year
2007 292
2008 292
2009 292
2010 292
2011 292
2012 292
2013 292
2014 292
2015 292
2016 292
2017 292
2018 248
2019 251
Name: Airport name, dtype: int64
# Fraction of airports with zero annual cargo, for each year 2010-2015.
# Series.between is inclusive on both ends, matching the original &-mask;
# (x == 0).mean() equals len(x[x == 0]) / len(x).
airport_data1 = airport_data[airport_data.Year.between(2010, 2015)]
airport_data1.groupby('Year').apply(lambda g: (g['Whole year'] == 0).mean())
Year
2010 0.767123
2011 0.770548
2012 0.770548
2013 0.770548
2014 0.770548
2015 0.770548
dtype: float64
# Drop airports that reported zero annual cargo in five or more years.
airport_data2 = airport_data.copy().set_index('Airport name')
# Boolean Series per airport: True when >= 5 years had a zero total.
has_5_zero_years = airport_data2.groupby('Airport name')['Whole year'].apply(
    lambda s: (s == 0).sum() >= 5
)
airport_data2.drop(has_5_zero_years[has_5_zero_years].index).head()
Year January February March April May June July August September October November December Whole year Airport coordinates Airport name Abakan 2019 44.70 66.21 72.70 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.50 89.80 917.57 (Decimal('91.399735'), Decimal('53.751351')) Anadyr (Carbon) 2019 81.63 143.01 260.90 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.00 199.00 1746.76 (Decimal('177.738273'), Decimal('64.713433')) Anapa (Vitjazevo) 2019 45.92 53.15 54.00 54.72 52.00 67.45 172.31 72.57 70.00 63.00 69.00 82.10 856.22 (Decimal('37.341511'), Decimal('45.003748')) Arkhangelsk (Talagy) 2019 85.61 118.70 131.39 144.82 137.95 140.18 128.56 135.68 124.75 139.60 210.27 307.10 1804.61 (Decimal('40.714892'), Decimal('64.596138')) Astrakhan (Narimanovo) 2019 51.75 61.08 65.60 71.84 71.38 63.95 164.86 79.46 85.21 87.23 79.06 99.16 980.58 (Decimal('47.999896'), Decimal('46.287344'))
# For 2016, rank each airport's twelve monthly volumes:
# 1 = busiest month, 12 = slowest (ties keep sort_values order).
airport_data3 = airport_data[airport_data.Year == 2016]
month = airport_data3.columns[2:14]
Rank = pd.DataFrame(index=airport_data3.index, columns=month)
for row_idx in airport_data3.index:
    ordered = airport_data3.loc[row_idx, month].sort_values(ascending=False)
    for position, month_name in enumerate(ordered.index, start=1):
        Rank.loc[row_idx, month_name] = position
# Column sums of ranks: a smaller total means the month is busier overall.
print(Rank.apply(lambda col: col.sum()))
January 3406
February 3076
March 2730
April 2432
May 2276
June 2047
July 1854
August 1527
September 1269
October 1009
November 728
December 422
dtype: int64
# Load the US cumulative COVID-19 deaths and confirmed-cases tables
# (county-level rows, one column per date) and preview the deaths table.
df_dead = pd.read_csv('美国死亡数.csv')
df_confirm = pd.read_csv('美国确证数.csv')
df_dead.head()
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ ... 2020/4/17 2020/4/18 2020/4/19 2020/4/20 2020/4/21 2020/4/22 2020/4/23 2020/4/24 2020/4/25 2020/4/26 0 84001001 US USA 840 1001 Autauga Alabama US 32.539527 -86.644082 ... 2 2 2 1 1 2 2 2 2 2 1 84001003 US USA 840 1003 Baldwin Alabama US 30.727750 -87.722071 ... 2 2 2 3 3 3 3 3 3 3 2 84001005 US USA 840 1005 Barbour Alabama US 31.868263 -85.387129 ... 0 0 0 0 0 0 0 0 0 0 3 84001007 US USA 840 1007 Bibb Alabama US 32.996421 -87.125115 ... 0 0 0 0 0 0 0 0 0 0 4 84001009 US USA 840 1009 Blount Alabama US 33.982109 -86.567906 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 108 columns
# Preview the confirmed-cases table.
df_confirm.head(n=5)
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ ... 2020/4/17 2020/4/18 2020/4/19 2020/4/20 2020/4/21 2020/4/22 2020/4/23 2020/4/24 2020/4/25 2020/4/26 0 84001001 US USA 840 1001 Autauga Alabama US 32.539527 -86.644082 ... 26 25 26 28 30 32 33 36 36 37 1 84001003 US USA 840 1003 Baldwin Alabama US 30.727750 -87.722071 ... 103 109 112 117 123 132 143 147 147 161 2 84001005 US USA 840 1005 Barbour Alabama US 31.868263 -85.387129 ... 15 18 20 22 28 29 30 32 32 33 3 84001007 US USA 840 1007 Bibb Alabama US 32.996421 -87.125115 ... 24 26 28 32 32 34 33 34 34 38 4 84001009 US USA 840 1009 Blount Alabama US 33.982109 -86.567906 ... 20 20 21 22 26 29 31 31 31 34
5 rows × 107 columns
# Inspect all column names (metadata columns followed by one column per day).
df_dead.columns.tolist()
['UID',
'iso2',
'iso3',
'code3',
'FIPS',
'Admin2',
'Province_State',
'Country_Region',
'Lat',
'Long_',
'Combined_Key',
'Population',
'2020/1/22',
'2020/1/23',
'2020/1/24',
'2020/1/25',
'2020/1/26',
'2020/1/27',
'2020/1/28',
'2020/1/29',
'2020/1/30',
'2020/1/31',
'2020/2/1',
'2020/2/2',
'2020/2/3',
'2020/2/4',
'2020/2/5',
'2020/2/6',
'2020/2/7',
'2020/2/8',
'2020/2/9',
'2020/2/10',
'2020/2/11',
'2020/2/12',
'2020/2/13',
'2020/2/14',
'2020/2/15',
'2020/2/16',
'2020/2/17',
'2020/2/18',
'2020/2/19',
'2020/2/20',
'2020/2/21',
'2020/2/22',
'2020/2/23',
'2020/2/24',
'2020/2/25',
'2020/2/26',
'2020/2/27',
'2020/2/28',
'2020/2/29',
'2020/3/1',
'2020/3/2',
'2020/3/3',
'2020/3/4',
'2020/3/5',
'2020/3/6',
'2020/3/7',
'2020/3/8',
'2020/3/9',
'2020/3/10',
'2020/3/11',
'2020/3/12',
'2020/3/13',
'2020/3/14',
'2020/3/15',
'2020/3/16',
'2020/3/17',
'2020/3/18',
'2020/3/19',
'2020/3/20',
'2020/3/21',
'2020/3/22',
'2020/3/23',
'2020/3/24',
'2020/3/25',
'2020/3/26',
'2020/3/27',
'2020/3/28',
'2020/3/29',
'2020/3/30',
'2020/3/31',
'2020/4/1',
'2020/4/2',
'2020/4/3',
'2020/4/4',
'2020/4/5',
'2020/4/6',
'2020/4/7',
'2020/4/8',
'2020/4/9',
'2020/4/10',
'2020/4/11',
'2020/4/12',
'2020/4/13',
'2020/4/14',
'2020/4/15',
'2020/4/16',
'2020/4/17',
'2020/4/18',
'2020/4/19',
'2020/4/20',
'2020/4/21',
'2020/4/22',
'2020/4/23',
'2020/4/24',
'2020/4/25',
'2020/4/26']
# Pearson correlation between county population and cumulative deaths
# on the final date in the table.
df_dead[['Population', '2020/4/26']].corr(method='pearson')
Population 2020/4/26 Population 1.000000 0.403844 2020/4/26 0.403844 1.000000
# Per-state share of counties with zero confirmed cases on 2020/4/1.
# (s == 0).mean() equals len(s[s == 0]) / len(s).
df_confirm1 = df_confirm[['Admin2', 'Province_State', '2020/4/1']]
df_confirm1.groupby('Province_State')['2020/4/1'].apply(lambda s: (s == 0).mean())
Province_State
Alabama 0.119403
Alaska 0.793103
Arizona 0.000000
Arkansas 0.293333
California 0.137931
Colorado 0.218750
Connecticut 0.000000
Delaware 0.000000
District of Columbia 0.000000
Florida 0.164179
Georgia 0.125786
Hawaii 0.200000
Idaho 0.386364
Illinois 0.480392
Indiana 0.108696
Iowa 0.404040
Kansas 0.609524
Kentucky 0.441667
Louisiana 0.062500
Maine 0.250000
Maryland 0.041667
Massachusetts 0.142857
Michigan 0.192771
Minnesota 0.367816
Mississippi 0.060976
Missouri 0.391304
Montana 0.625000
Nebraska 0.752688
Nevada 0.470588
New Hampshire 0.100000
New Jersey 0.000000
New Mexico 0.424242
New York 0.080645
North Carolina 0.180000
North Dakota 0.547170
Ohio 0.181818
Oklahoma 0.376623
Oregon 0.277778
Pennsylvania 0.104478
Rhode Island 0.000000
South Carolina 0.065217
South Dakota 0.560606
Tennessee 0.115789
Texas 0.452756
Utah 0.482759
Vermont 0.142857
Virginia 0.270677
Washington 0.128205
West Virginia 0.472727
Wisconsin 0.319444
Wyoming 0.347826
Name: 2020/4/1, dtype: float64
# Date columns start at position 11; keep them as a list for reuse below.
day = df_confirm.columns[11:].tolist()
# Ten counties that appeared earliest / grew fastest: sort lexicographically
# over all date columns, descending.
(df_confirm
 .set_index(['Province_State', 'UID'])[day]
 .sort_values(by=day, ascending=False)
 .head(10))
2020/1/22 2020/1/23 2020/1/24 2020/1/25 2020/1/26 2020/1/27 2020/1/28 2020/1/29 2020/1/30 2020/1/31 ... 2020/4/17 2020/4/18 2020/4/19 2020/4/20 2020/4/21 2020/4/22 2020/4/23 2020/4/24 2020/4/25 2020/4/26 Province_State UID Washington 84053033 1 1 1 1 1 1 1 1 1 1 ... 4902 4902 5174 5174 5293 5379 5532 5637 5739 5863 Illinois 84017031 0 0 1 1 1 1 1 1 1 2 ... 19391 20395 21272 22101 23181 24546 25811 27616 29058 30574 California 84006037 0 0 0 0 1 1 1 1 1 1 ... 11400 12021 12341 13823 15153 16447 17537 18545 19133 19567 84006059 0 0 0 0 1 1 1 1 1 1 ... 1501 1556 1636 1676 1691 1753 1827 1845 1969 2074 Arizona 84004013 0 0 0 0 1 1 1 1 1 1 ... 2404 2491 2589 2636 2738 2846 2970 3116 3234 3359 California 84006085 0 0 0 0 0 0 0 0 0 1 ... 1870 1870 1870 1922 1922 1962 1987 2018 2040 2084 Massachusetts 84025025 0 0 0 0 0 0 0 0 0 0 ... 7272 7696 8074 8074 8669 9060 9739 10724 11218 11543 California 84006069 0 0 0 0 0 0 0 0 0 0 ... 44 44 44 44 44 45 47 47 47 47 84006073 0 0 0 0 0 0 0 0 0 0 ... 2087 2158 2213 2268 2325 2491 2643 2826 2943 2943 84006067 0 0 0 0 0 0 0 0 0 0 ... 896 914 925 940 954 971 987 1019 1019 1037
10 rows × 96 columns
# Daily new deaths per state: today's cumulative total minus yesterday's.
# DEAD CODE REMOVED: the original computed df_dead3 and a renamed head()
# twice (L317-L318) and discarded the first results before recomputing them.
df_dead3 = df_dead.set_index('Province_State').groupby(level=0)[day].sum()
# Relabel yesterday's columns with today's dates so the subtraction aligns
# each day with the previous one.
prev_day = df_dead3.iloc[:, :-1].rename(columns=dict(zip(day[:-1], day[1:])))
df_dead3 = df_dead3.iloc[:, 1:] - prev_day
df_dead3 = df_dead3.sort_values(by=day[1:], ascending=False)
df_dead3.head()
2020/1/23 2020/1/24 2020/1/25 2020/1/26 2020/1/27 2020/1/28 2020/1/29 2020/1/30 2020/1/31 2020/2/1 ... 2020/4/17 2020/4/18 2020/4/19 2020/4/20 2020/4/21 2020/4/22 2020/4/23 2020/4/24 2020/4/25 2020/4/26 Province_State Washington 0 0 0 0 0 0 0 0 0 0 ... 24 10 23 7 21 22 25 11 15 12 California 0 0 0 0 0 0 0 0 0 0 ... 81 103 35 48 57 139 112 88 68 36 Florida 0 0 0 0 0 0 0 0 0 0 ... 57 23 26 48 45 26 94 59 9 20 New Jersey 0 0 0 0 0 0 0 0 0 0 ... 322 231 294 134 24 630 276 257 231 24 South Dakota 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 1 0 1
5 rows × 95 columns
# Date of the deadliest single day for each state.
df_dead3.idxmax(axis='columns')
Province_State
Washington 2020/4/6
California 2020/4/22
Florida 2020/4/23
New Jersey 2020/4/22
South Dakota 2020/4/6
New York 2020/4/7
Georgia 2020/4/7
Kansas 2020/4/11
Colorado 2020/4/24
Louisiana 2020/4/14
Virginia 2020/4/22
Oregon 2020/4/9
Indiana 2020/4/21
Kentucky 2020/4/16
South Carolina 2020/4/9
Nevada 2020/4/12
Texas 2020/4/23
Illinois 2020/4/18
Connecticut 2020/4/20
Missouri 2020/4/21
Michigan 2020/4/22
Pennsylvania 2020/4/21
Wisconsin 2020/4/5
Maryland 2020/4/18
Vermont 2020/4/9
Oklahoma 2020/4/21
Mississippi 2020/4/21
Ohio 2020/4/22
Massachusetts 2020/4/25
Tennessee 2020/4/2
District of Columbia 2020/4/14
Arizona 2020/4/18
Minnesota 2020/4/26
Utah 2020/4/17
New Hampshire 2020/4/3
Arkansas 2020/4/9
Hawaii 2020/4/11
North Carolina 2020/4/17
Alabama 2020/4/21
Iowa 2020/4/24
New Mexico 2020/4/22
Idaho 2020/4/10
Delaware 2020/4/11
Maine 2020/4/7
North Dakota 2020/4/21
Nebraska 2020/4/23
Montana 2020/3/30
Alaska 2020/4/7
West Virginia 2020/4/23
Rhode Island 2020/4/3
Wyoming 2020/4/13
dtype: object