记录几道pandas练习题
import pandas as pd
df = pd. read_csv( '2002年-2018年上海机动车拍照拍卖.csv' )
df. head( )
Date
Total number of license issued
lowest price
avg price
Total number of applicants
0
2-Jan
1400
13600
14735
3718
1
2-Feb
1800
13100
14057
4590
2
2-Mar
2000
14300
14662
5190
3
2-Apr
2300
16000
16334
4806
4
2-May
2350
17800
18357
4665
df[ '中标率' ] = df[ 'Total number of license issued' ] / df[ 'Total number of applicants' ]
df. sort_values( [ 'Date' ] )
Date
Total number of license issued
lowest price
avg price
Total number of applicants
中标率
98
10-Apr
8500
41000
41637
17313
0.490961
102
10-Aug
9000
39800
40169
16855
0.533966
106
10-Dec
9000
10400
15970
11224
0.801853
96
10-Feb
7500
38300
38620
18810
0.398724
95
10-Jan
8000
37800
38311
18975
0.421607
...
...
...
...
...
...
...
85
9-Mar
6000
26600
27552
18575
0.323015
87
9-May
7200
28500
29100
16471
0.437132
93
9-Nov
8000
34900
35317
21902
0.365263
92
9-Oct
8000
33900
34402
22006
0.363537
91
9-Sep
8500
27200
29500
14906
0.570240
203 rows × 6 columns
for index, row in df. iterrows( ) :
if row[ '中标率' ] < 0.05 :
print ( row[ 'Date' ] )
break
15-May
df[ '年份' ] = df[ 'Date' ] . apply ( lambda x: 2000 + int ( x. split( '-' ) [ 0 ] ) )
df[ '月份' ] = df[ 'Date' ] . apply ( lambda x: x. split( '-' ) [ 1 ] )
newcolumns = [ '年份' , '月份' ] + list ( df. columns[ 1 : - 3 ] )
df1 = df. reindex( columns= newcolumns) . copy( )
df1. head( )
年份
月份
Total number of license issued
lowest price
avg price
Total number of applicants
0
2002
Jan
1400
13600
14735
3718
1
2002
Feb
1800
13100
14057
4590
2
2002
Mar
2000
14300
14662
5190
3
2002
Apr
2300
16000
16334
4806
4
2002
May
2350
17800
18357
4665
import numpy as np
grouped = df1[ 'lowest price ' ] . groupby( df1[ '年份' ] ) . agg( [ max , np. mean] )
grouped[ '0.75分位数' ] = df1[ 'lowest price ' ] . groupby( df1[ '年份' ] ) . quantile( q= 0.75 )
grouped
max
mean
0.75分位数
年份
2002
30800
20316.666667
24300.0
2003
38500
31983.333333
36300.0
2004
44200
29408.333333
38400.0
2005
37900
31908.333333
35600.0
2006
39900
37058.333333
39525.0
2007
53800
45691.666667
48950.0
2008
37300
29945.454545
34150.0
2009
36900
31333.333333
34150.0
2010
44900
38008.333333
41825.0
2011
53800
47958.333333
51000.0
2012
68900
61108.333333
65325.0
2013
90800
79125.000000
82550.0
2014
74600
73816.666667
74000.0
2015
85300
80575.000000
83450.0
2016
88600
85733.333333
87475.0
2017
93500
90616.666667
92350.0
2018
89000
87825.000000
88150.0
new_index_columns = [ '年份' ] + list ( df. columns[ 1 : -