将每一行按任职起止年份展开,重复输出N次(每个年份一行)
from __future__ import division
import pandas as pd
import numpy as np
# Expand each row of auditors.csv into one output row per integer value in
# [appointment date, resignation date), and write the result to final.csv.
df = pd.read_csv('auditors.csv')
df = df.drop(df.columns[0], axis=1)  # discard the first column
df = df[~df.isna().all(axis=1)]  # remove all-NaN rows
# Forward-fill remaining gaps from the row above. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in pandas 2.x.
df = df.ffill()
df.to_csv('filled_auditors.csv')

lst = []
for _, row in df.iterrows():  # walk the table row by row
    values = list(row)
    # Skip placeholder rows that carry the "no data" marker instead of data.
    if "No data fulfill your filter criteria" in values:
        continue
    # Everything except the last three columns is copied unchanged.
    # NOTE(review): assumes the last three columns are appointment date,
    # resignation date and current/previous, in that order - confirm.
    kept = values[:-3]
    first = int(row["ADV\nAppointment date"])
    last = int(row["ADV\nResignation date"])
    # range() excludes its end, so the resignation value itself yields no row.
    for year in range(first, last):
        lst.append(kept + [year, row['ADV\nCurrent or previous']])

# Output schema: the kept columns, the expanded value under 'data', then the
# name of the last original column.
cols = list(df.columns[:-3]) + ['data'] + list(df.columns[-1:])
final = pd.DataFrame(lst, columns=cols)
final.to_csv('final.csv')
合并 fin.csv 与 Ticker.csv(按公司名称对齐)
from __future__ import division
import pandas as pd
import numpy as np
# Join the Ticker table onto the fin table, matching rows by company name,
# and save the combined table to fqwer.csv.
join_key = ['Company name Latin alphabet']

firm = pd.read_csv('F:RA/fin.csv').set_index(join_key)
tick = pd.read_csv('F:RA/Ticker.csv').set_index(join_key)

# DataFrame.join aligns on the index and defaults to a left join, so every
# row of `firm` is kept whether or not `tick` has a matching company name.
data = firm.join(tick)
data.to_csv('fqwer.csv')
遍历所有行,按年份提取每个审计机构对应的股票代码,每年输出一个文件
import pandas as pd
# For every year 1983..2019, write "<year>.csv" holding one row per auditor
# ('ADV Full name') with that auditor's ticker symbols for the year spread
# across the columns.
data = pd.read_csv('auditors and ticker1.csv')
data = data.dropna(how='any', axis=0)  # keep only fully populated rows
data_group = data.groupby(by='ADV Full name')
auditor_list = list(data_group.groups.keys())
print(auditor_list)

for year in range(1983, 2020):
    # Select this year's rows once instead of rescanning the whole table row
    # by row (and auditor by auditor) for every year.
    rows_for_year = data[data['data'] == year]

    # auditor -> tickers, in row order; keys appear in first-seen order.
    tickers_by_auditor = {}
    for auditor, symbol in zip(rows_for_year['ADV Full name'],
                               rows_for_year['Ticker symbol']):
        tickers_by_auditor.setdefault(auditor, []).append(symbol)

    # orient='index' makes each auditor a row; shorter ticker lists are
    # padded with missing values. A file is written even for empty years.
    final = pd.DataFrame.from_dict(tickers_by_auditor, orient='index')
    final.to_csv(str(year) + '.csv')
已知月收益率,计算年度收益率
import pandas as pd
import numpy as np
# Compute a yearly compounded gross return (product of 1 + monthly return)
# per company from a table of monthly returns, restricted to the tickers
# listed in the auditors/ticker table, and write it to cum_return.csv.
ticker = pd.read_csv('F:/RA/auditors and ticker1.csv')
# A set gives O(1) membership; missing and empty symbols are never matched.
tickers = set(ticker['Ticker symbol'].dropna())
tickers.discard('')

# Non-numeric codes that appear in the 'Returns' column; such rows are skipped.
lst = ['B', 'C']

returns = pd.read_csv('F:/RA/US stock monthly returns 1963-2018.csv')
wanted = returns['Ticker Symbol'].isin(tickers) & ~returns['Returns'].isin(lst)
company = returns[wanted].copy()  # copy so the new columns do not alias `returns`

# NOTE(review): assumes 'Names Date' starts with a four-digit year - confirm.
company['year'] = company['Names Date'].astype(str).str[:4]
company['value'] = company['Returns'].astype(float) + 1

# Compound within each (year, company). Grouping by year alone would multiply
# the returns of every company in that year together.
company['cum_return'] = (
    company.groupby(['year', 'Company Name'])['value'].cumprod()
)

# The last row of each (year, company) carries the full-year product.
# NOTE(review): relies on rows being in chronological order within a company.
company.drop_duplicates(subset=['year', 'Company Name'], keep='last', inplace=True)
company.to_csv('cum_return.csv')
判断是否为NaN
# Element-wise NaN test via numpy. `return_1y` is not defined anywhere in this
# snippet and must already exist; the result is not stored here.
np.isnan(return_1y)