# 确定股票池 (Determine the stock pool)
import http.client
import os
import urllib
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import pandas as pd
# Shanghai Stock Exchange codes: every integer from 600000 up to (but not
# including) 606000, stored as strings.
shanghaicode = [str(code) for code in range(600000, 606000)]
# Shenzhen Stock Exchange codes: 0 through 4999 rendered as zero-padded
# six-character strings ('000000' ... '004999').
shenzhencode = [f"{code:06d}" for code in range(5000)]
# 爬取数据 (Crawl the data)
def get_data(num, max_retries=5):
    """Download the income-statement CSV for one stock code.

    The file is fetched from ``quotes.money.163.com`` and stored as
    ``利润表_multi/<num>.csv`` (the directory is created if missing).

    Args:
        num: Stock code; it is converted with ``str()`` to build both the
            URL and the output file name.
        max_retries: Maximum number of download attempts before giving up.
            Transient failures (timeouts, connection errors) are retried;
            an HTTP 404 is treated as final and is not retried.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    path = '利润表_multi/' + str(num) + '.csv'

    # Check the local cache *before* touching the network so that codes that
    # were already downloaded cost nothing on a re-run.
    if os.path.exists(path):
        print(path + " already existed!!!")
        return

    os.makedirs(os.path.dirname(path), exist_ok=True)
    url = 'http://quotes.money.163.com/service/lrb_' + str(num) + '.html'

    # Bounded retry: an unbounded loop would spin forever on a permanent
    # failure (for example, no network at all).
    for _ in range(max_retries):
        try:
            with urllib.request.urlopen(url, timeout=2) as response:
                content = response.read()
            with open(path, 'wb') as f:
                f.write(content)
        except urllib.error.HTTPError as e:
            # Compare the status code rather than the message text, which
            # depends on the reason phrase sent by the server.
            if e.code == 404:
                print(f"{num} : {e}")
                return
            print(e)
        except (OSError, http.client.HTTPException) as e:
            # URLError, socket timeouts and file-system errors are all
            # OSError subclasses; HTTPException covers truncated responses.
            print(e)
        else:
            print(num)
            # Pause after a successful download before this worker moves on.
            sleep(1)
            return

    print(f"{num} : giving up after {max_retries} attempts")
# 多线程运作 (Run the downloads with multiple threads)
# Download the Shenzhen pool first, then the Shanghai pool. Each pool gets its
# own 10-worker executor; leaving the ``with`` block waits for every queued
# download to finish before the next pool starts.
# The iterator returned by ``map`` is never consumed, so exceptions raised
# inside ``get_data`` are not re-raised here.
for code_pool in (shenzhencode, shanghaicode):
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(get_data, code_pool)
# 读取本地数据 (Read the local data)
def generatefile(path):
    """Return the names of the files located directly inside ``path``.

    Only the top-level directory is inspected. Walking the whole tree and
    keeping the last directory's file list would return the contents of a
    subdirectory whenever one exists, and callers join the returned names
    onto ``path`` itself.

    Args:
        path: Directory to list.

    Returns:
        A list of file names (not full paths), in the order reported by
        ``os.walk``. The list is empty when ``path`` has no files or cannot
        be listed (for example, it does not exist).
    """
    # The first tuple produced by os.walk describes ``path`` itself; the
    # default covers the case where the walk yields nothing at all.
    _, _, filenames = next(os.walk(path), (path, [], []))
    return filenames
# Screen the downloaded income statements: keep every file whose net profit
# grew by at least ``ratio`` in each of the four most recent period-over-period
# comparisons, then write the matching file names to a spreadsheet.

# Minimum growth rate (30%) required for every one of the four comparisons.
ratio = 0.3

datapath = '利润表_multi/'
datalist = generatefile(datapath)
invest = []
for data in datalist:
    try:
        path = datapath + data
        temp = pd.read_csv(path, encoding='gbk', header=None)
        # Transpose the sheet so each original CSV row becomes a column, then
        # promote the first transposed row to the column header.
        temp = pd.DataFrame(temp.values.T, index=temp.columns, columns=temp.index)
        temp.columns = temp.loc[0]
        temp = temp[1:]   # drop the row that was just used as the header
        temp = temp[:-1]  # drop the last row (presumably an empty trailing column in the CSV — TODO confirm)
        # NOTE(review): the original had a disabled step here that applied a
        # `convert_date` helper to 报告日期; without it the grouping below uses
        # the raw 报告日期 values — confirm that is the intended granularity.
        # `.copy()` makes the column assignment below act on an independent
        # frame instead of a slice of `temp`.
        temp = temp[['报告日期', '净利润(万元)']].copy()
        # A non-numeric cell raises here; the file is then reported and skipped.
        temp['净利润(万元)'] = temp['净利润(万元)'].astype(int)
        temp_g = pd.DataFrame(temp.groupby('报告日期').sum())
        # Drop the last group; the original author's note says this removes
        # 2021 (presumably the incomplete latest period — TODO confirm).
        temp_g = temp_g[:-1]
        temp_g.reset_index(inplace=True)
        # groupby sorts its keys ascending, so reversing puts the largest
        # 报告日期 (presumably the most recent) first.
        temp_g = temp_g['净利润(万元)'].values[::-1]

        # Four comparisons need at least five periods of data.
        if len(temp_g) < 5:
            continue

        growth_anu = []
        for year, ex_year in zip(temp_g[:4], temp_g[1:5]):
            if ex_year == 0:
                # Growth from a zero base is undefined; treat the stock as
                # not qualifying instead of producing inf/nan.
                break
            # Divide by the magnitude of the base so that a deepening loss
            # (e.g. -100 -> -200) is negative growth, not +100%.
            growth_anu.append((year - ex_year) / abs(ex_year))

        if len(growth_anu) == 4 and all(growth >= ratio for growth in growth_anu):
            invest.append(data)
    except Exception as e:
        # Best-effort batch job: report the failing file and keep going.
        print(data + f':{e}')

invest = pd.DataFrame(invest)
# NOTE(review): writing the legacy '.xls' format relies on the xlwt engine,
# which newer pandas releases no longer ship — confirm the installed pandas
# version supports it, or switch the extension to '.xlsx'.
invest.to_excel('连续4年增长30%.xls')