Python+selenium+chromedriver下载深交所股票定期报告

最新推荐文章于 2023-10-31 10:59:05 发布

xya_2007

最新推荐文章于 2023-10-31 10:59:05 发布

阅读量405

点赞数

文章标签： python selenium 开发语言

本文链接：https://blog.csdn.net/xya_2007/article/details/130680027

版权

最近开始用Python做股票投资研究，开发了一个深交所股票定期报告下载程序。程序自动下载深交所所有股票的历史定期报告到本地，并可以增量更新新发布的报告。

如需要完整源代码请评论区留下联系方式。

一、运行环境搭建

1、安装Anaconda运行环境：

从清华镜像网站下载WINDOWS版本：Index of /anaconda/archive/ | 清华大学开源软件镜像站 | Tsinghua Open Source Mirror

2、安装PANDAS和selenium

pip install pandas

pip install selenium

3、程序需要安装Google Chrome浏览器，并下载与浏览器版本对应的驱动程序（chromedriver）：

下载地址：

http://chromedriver.storage.googleapis.com/index.html

或

CNPM Binaries Mirror

下载后将chromedriver.exe拷贝到一个文件夹下，并将该文件夹加入系统PATH环境变量：

二、程序代码：

（1）从深交所网站下载股票基本信息列表，获取所有股票代码：

options = webdriver.ChromeOptions()

prefs = {'profile.default_content_settings.popups': 0,

'download.default_directory': os.path.join(os.getcwd(),'data')}

options.add_experimental_option('prefs', prefs)

browser = webdriver.Chrome(options=options)

browser.implicitly_wait(5)

url = 'http://www.szse.cn/market/product/stock/list/index.html'

browser.get(url)

filepath = './data/A股列表.xlsx'

if os.path.exists(filepath):

os.remove(filepath)

download = browser.find_element_by_xpath("//a[@class='btn-default-excel btn-default-hasicon']")

download.click()

sleep(5)

browser.close()

df = pd.read_excel(filepath, converters={'A股代码':str})

res = [f'sz{code}' for code in list(df['A股代码'])]

（2）根据股票代码查找定期报告文件地址：

#根据股票代码、名称、报告年份等信息生成文件路径，并判断文件是否已经下载

def get_stock_file_name(self, code, name, year, desc):

name = name.replace(' ', '').replace(' ', '')

desc = desc.replace(' ', '').replace(' ', '').replace('*', '')

filepath = os.path.join(self.report_data_path, self.exchange_ins[code[0:2]][0])

if not os.path.exists(filepath):

os.mkdir(filepath)

filepath = os.path.join(filepath,code + '_' + name.replace('*',''))

if not os.path.exists(filepath):

os.mkdir(filepath)

filepath = os.path.join(filepath,year)

if not os.path.exists(filepath):

os.mkdir(filepath)

filepath = os.path.join(filepath,desc + '.pdf')

if not os.path.exists(filepath):

return False, filepath

else:

return True, filepath

#解析一个页面中的报告地址信息

def get_sz_stock_data_page(self, url, codes, start_date = None, end_date = None, page_size = 50, page_num = 1):

if start_date is None:

start_date = '1991-01-01'

if end_date is None:

end_date = datetime.now().strftime(format = '%Y-%m-%d')

#查询指定时间段报表

data = {"seDate":[start_date,end_date],"stock":codes,"channelCode":["fixed_disc"],"pageSize":page_size,"pageNum":page_num}

encoded_data = json.dumps(data).encode("utf-8")

retry = 0

while retry < 5:

try:

http = urllib3.PoolManager()

r = http.request(

"POST",

'http://www.szse.cn/api/disc/announcement/annList',

body = encoded_data,

headers = {

'content-type':'application/json'

}

)

if r.status == 200:

response = r.data

else:

response = None

r.release_conn()

retry = 5

return response

except:

retry = retry + 1

print('获取股票%s报告列表异常, 5秒后重试，重试次数%d' % (str(codes), retry))

sleep(5)

return None

#获取一支股票的报告信息

def get_all_report_of_sz_stock(self, code):

rand = random.random()

url = f"http://www.szse.cn/api/disc/announcement/annList?random={rand}"

response = self.get_sz_stock_data_page(url, codes = [code[2:]])

#print(response)

if len(response) > 0:

j = json.loads(response)

items = j['announceCount']

print(code + ('共有%d条定期报告' % items))

#设置每页50条

page_size = 50

#计算总页数

total_pages = (items + page_size-1)//page_size

page_num = 0

while page_num < total_pages:

data = j['data']

count = j['announceCount']

for d in data:

name = d['title']

sec_name = d['secName'][0]

desc = name.replace(sec_name,'').replace(':', '').replace('：','')

years = re.findall('[0-9]{4}年', name)

#print(name, desc, years, sec_name)

year = ''

if len(years) > 0 :

year = years[0]

else:

years = re.findall('[0-9]{4}半年', name)

if len(years) > 0:

year = years[0].replace('半', '')

url = 'http://disc.static.szse.cn/download' + d['attachPath']

exist,filepath = self.get_stock_file_name(code, sec_name, year, desc)

self.file_lock.wait()

self.file_lock.clear()

if len(self.downloadfiles[self.downloadfiles['保存路径']==filepath]) >0:

temp = self.downloadfiles[self.downloadfiles['保存路径']==filepath]

if not exist:

self.que.put([url, filepath, temp.index[0]])

self.downloadfiles.loc[temp.index[0], '是否完成下载'] = '否'

else:

self.downloadfiles.loc[temp.index[0], '是否完成下载'] = '是'

else:

ind = len(self.downloadfiles)

self.downloadfiles.loc[ind, '下载地址'] = url

self.downloadfiles.loc[ind, '保存路径'] = filepath

if not exist:

self.que.put([url, filepath,ind])

self.downloadfiles.loc[ind, '是否完成下载'] = '否'

else:

self.downloadfiles.loc[ind, '是否完成下载'] = '是'

self.file_lock.set()

page_num = page_num + 1

if page_num < total_pages:

#请求网页数据，直到成功为止

while True:

response = self.get_sz_stock_data_page(url, codes = [code[2:]], page_num = (page_num + 1))

if len(response) > 0:

j = json.loads(response)

break

（3）根据已经获取的报告文件URL，采用多线程方式下载报告文件到本地：

#获取深交所所有股票的报告文件地址信息

def get_all_report_of_sz_market(self):

codes = sorted(self.codes)

#codes =['sz000016']

for code in codes:

if code.startswith('sz'):

self.get_all_report_of_sz_stock(code)

self.finished = True

self.file_lock.wait()

self.file_lock.clear()

pd.DataFrame(columns=['数据由XXX整理']).to_csv(self.catalog_path, index=False, encoding='gbk')

self.downloadfiles.to_csv(self.catalog_path, index=False, mode='a', encoding='gbk')

self.file_lock.set()

#下载单个报告文件

def download_one_file(self, url, filepath, method='GET'):

retry = 0

while retry < 5:

try:

http = urllib3.PoolManager()

response = http.request(method, url)

if response.status == 200:

with open(filepath, 'wb') as f:

f.write(response.data)

else:

print('download statue: %d' % response.status)

response.release_conn()

retry = 5

except:

retry = retry + 1

print('下载文件%s异常，5秒后重试,重试次数%d' % (filepath, retry))

sleep(5)

#报表下载线程函数

def download_thread(self):

count = 0

while (not self.finished) or (self.que.qsize()>0):

if (self.que.qsize()>0):

try:

print('queue size %d' % self.que.qsize())

info = self.que.get()

url = info[0]

filepath = info[1]

print(f'开始下载文件：{url}')

print(f'保存路径:{filepath}')

self.download_one_file(url, filepath)

count = count + 1

self.file_lock.wait()

self.file_lock.clear()

if len(info) > 0:

self.downloadfiles.loc[info[2], '是否完成下载'] = '是'

else:

if len(self.downloadfiles[self.downloadfiles['保存路径']==filepath]) >0:

self.downloadfiles.loc[self.downloadfiles['下载地址'].str.contains(url), '是否完成下载'] = '是'

if count >= 10:

# pd.DataFrame(columns=['数据由谢允安整理']).to_csv(self.catalog_path, index=False, encoding='gbk')

# self.downloadfiles.to_csv(self.catalog_path, index=False, mode='a', encoding='gbk')

count = 0

self.file_lock.set()

sleep(random.randint(3,5))

self.que.task_done()

except:

traceback.print_exc()

else:

sleep(10)

#下载启动函数，下载所有股票报表，下载文件多线程同时下载

def start_download_sz(self):

prepare_thread = threading.Thread(target=self.get_all_report_of_sz_market)

prepare_thread.start()

for i in range(self.num_threads):

t = threading.Thread(target=self.download_thread)

#t.daemon = True

t.start()

prepare_thread.join()

xya_2007

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Python+selenium+chromedriver下载深交所股票定期报告

self.downloadfiles.loc[self.downloadfiles['下载地址'].str.contains(url), '是否完成下载'] = '是'pd.DataFrame(columns=['数据由XXX整理']).to_csv(self.catalog_path, index=False, encoding='gbk')self.downloadfiles.loc[temp.index[0], '是否完成下载'] = '是'
复制链接

扫一扫