# A demo of processing a CSV file with pandas.
# Original dataset: https://data.world/rajanand/rainfall-in-india
import sys
import pandas as pd
# Number of rows (presumably one per year) a subdivision must have to be kept.
REQUIRED_YEARS = 115
# Upper bound on how many complete subdivisions are kept.
MAX_SUBDIVISIONS = 10
# Fixed side output holding the per-subdivision average monthly value.
AVG_OUTFILE = 'rainfall-avg.csv'
# Columns of the raw data that are not used downstream.
UNUSED_COLUMNS = ['YEAR', 'Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']


def load_rainfall(path):
    """Read the CSV at *path* and return it without unused columns or gaps.

    The columns listed in ``UNUSED_COLUMNS`` are dropped, then every row
    containing at least one missing value is discarded.
    """
    frame = pd.read_csv(path, encoding="utf-8")
    frame = frame.drop(UNUSED_COLUMNS, axis=1)
    return frame.dropna(how='any')


def select_complete_subdivisions(frame, required_rows=REQUIRED_YEARS,
                                 limit=MAX_SUBDIVISIONS):
    """Return the rows of the first *limit* subdivisions with a full record.

    Groups are visited in sorted ``SUBDIVISION`` order; a group is kept only
    when it has exactly *required_rows* rows.  The kept groups are stacked
    under a fresh 0..n-1 index.  When no group qualifies, an empty frame
    with the same columns is returned.
    """
    chosen = []
    # Group by a scalar key: a one-element list key makes pandas >= 2.0
    # yield 1-tuples as group names, which breaks label lookups.
    for _, group in frame.groupby('SUBDIVISION'):
        if len(group) != required_rows:
            continue
        chosen.append(group)
        if len(chosen) == limit:
            break
    if not chosen:
        return frame.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(chosen, ignore_index=True)


def normalize_rainfall(frame):
    """Return a copy of *frame* with its value columns scaled by the monthly max.

    Assumes the column layout left by ``load_rainfall``: ``SUBDIVISION``
    first, then twelve monthly columns (positions 1-12), then ``ANNUAL``
    (position 13).  The scale is the largest value found in the monthly
    columns; the lower bound is fixed at 0.  ``ANNUAL`` is divided by the
    same scale so it stays comparable with the monthly values.
    """
    scaled = frame.copy()
    month_cols = scaled.columns[1:13]
    value_cols = scaled.columns[1:14]
    lower = 0
    upper = scaled[month_cols].max().max()
    scaled[value_cols] = (scaled[value_cols] - lower) / (upper - lower)
    return scaled


def monthly_average_by_subdivision(frame):
    """Return a one-column frame ``avgmonth``: mean ``ANNUAL`` / 12 per subdivision."""
    annual_mean = frame.groupby('SUBDIVISION')['ANNUAL'].mean()
    return (annual_mean / 12).to_frame('avgmonth')


def main(argv=None):
    """Run the pipeline: ``INFILE`` -> normalized monthly CSV at ``OUTFILE``.

    *argv* defaults to ``sys.argv[1:]``; its first two items are the input
    and output paths.  Besides ``OUTFILE`` the per-subdivision averages are
    written to ``rainfall-avg.csv`` in the current directory.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(f"usage: {sys.argv[0]} INFILE OUTFILE")
    infile, outfile = args[0], args[1]

    selected = select_complete_subdivisions(load_rainfall(infile))
    if selected.empty:
        sys.exit(f"no subdivision with exactly {REQUIRED_YEARS} complete rows in {infile}")

    scaled = normalize_rainfall(selected)
    # Sanity check: the largest normalized monthly value should print as 1.0.
    print(scaled.iloc[:, 1:13].max().max())

    monthly_average_by_subdivision(scaled).to_csv(AVG_OUTFILE, index=False)
    # Only the twelve monthly columns go to the main output file.
    scaled.drop(['SUBDIVISION', 'ANNUAL'], axis=1).to_csv(outfile, index=False)


if __name__ == "__main__":
    main()