深交所的股票信息读取比较简单:
看上图,爬虫读取到下载按钮的链接之后发起请求,得到XLS文件后直接解析就可以了。
这里放出深交所爬虫模块的代码:
# -*- coding: utf-8 -*-
# 深圳交易所爬虫
import os
import pandas as pd
import requests
#读取最新深交所股票列表
def get_stock_list():
cache_file_path = "./sotck_file.xlsx"
url = "https://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1110&TABKEY=tab1"
response = requests.get(url)
open(cache_file_path, "wb").write(response.content)
file = pd.read_excel(cache_file_path, dtype={'A股代码': str})
stocks = []
for index, row in file.iterrows():
# 处理每一行的数据
stocks.append(row)
os.remove(cache_file_path)
return stocks
爬虫模块向控制模块返回数据后由控制模块作格式统一处理:
import A_SH_basic
from MySQL import ExecInsert, ExecSelect
import A_SZ_basic
from Tools import CustomException
LIST = "L" #上市状态:上市
DELISTED = "D"#上市状态:退市
PAUSED = "P" #上市状态:暂停上市
SZSE = "SZSE" #交易所:深交所
SSE = "SSE" #交易所:上交所
#更新A股股票列表
def update_A_stock_list(SZ=False,SH=False,BJ=False):
database = "stock_a"
if SZ == True:
# 执行更新前先读取历史数据,若某条待更新数据与历史数据一致则跳过该条数据
select_sql = "select * from stock_list where exchange = 'SZSE'"
insert_sql = "insert into stock_list(stock_code,stock_name,province,industry,industry_2,enname,market,exchange,list_status,list_date,delist_date,total_share,float_share) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
update_sql = "update stock_list set stock_name=%s,province=%s,industry=%s,industry_2=%s,enname=%s,market=%s,exchange=%s,list_status=%s,list_date=%s,delist_date=%s,total_share=%s,float_share=%s where stock_code=%s"
select_result = ExecSelect(database, select_sql) # 读取查询结果
# 调用深交所爬虫读取上市股票列表
stocks = A_SZ_basic.get_stock_list()
insert_rows = []
update_rows = []
for stock in stocks:
exist_flag = False
# 遍历历史数据查询该股票是否在历史数据中
for history in select_result:
if history[0] == stock["A股代码"]:
exist_flag = True
break
# 该股票在历史数据中查得到,说明是旧股票,检查是否有需要更新的地方,有则放入update_rows等待写入,无则丢弃
if exist_flag:
if history[1] == stock["A股简称"] and history[2] == stock["省 份"] and history[3] == stock["所属行业"].split(" ")[0] and history[5] == stock["英文名称"] and history[6] == stock["板块"] and history[7] == SZSE and history[8] == LIST and history[9] == stock["A股上市日期"] and history[11] == stock["A股总股本"].replace(',', '') and history[12] == stock["A股流通股本"].replace(',', ''):
pass
else:
stock_code = stock["A股代码"]
stock_name = stock["A股简称"]
province = stock["省 份"]
industry = stock["所属行业"].split(" ")[0] # 一级行业
industry_2 = None # 深交所数据无二级行业
enname = stock["英文名称"]
market = stock["板块"] # 市场类型(主板、创业板、科创板、CDR)
exchange = SZSE # 交易所
list_status = LIST # 上市状态
list_date = stock["A股上市日期"]
delist_date = None # 上市股暂无退市日期
total_share = stock["A股总股本"].replace(',', '')
float_share = stock["A股流通股本"].replace(',', '')
update_rows.append(
(stock_name, province, industry, industry_2, enname, market, exchange,
list_status, list_date, delist_date, total_share, float_share, stock_code))
# 如果该股票在历史数据中查不到,说明是新股票,放入insert_rows等待写入
else:
stock_code = stock["A股代码"]
stock_name = stock["A股简称"]
province = stock["省 份"]
industry = stock["所属行业"].split(" ")[0] # 一级行业
industry_2 = None # 深交所数据无二级行业
enname = stock["英文名称"]
market = stock["板块"] # 市场类型(主板、创业板、科创板、CDR)
exchange = SZSE # 交易所
list_status = LIST # 上市状态
list_date = stock["A股上市日期"]
delist_date = None # 上市股暂无退市日期
total_share = stock["A股总股本"].replace(',', '')
float_share = stock["A股流通股本"].replace(',', '')
insert_rows.append((stock_code, stock_name, province, industry, industry_2, enname, market, exchange,
list_status, list_date, delist_date, total_share, float_share))
# 写入数据库中不存在的新股票
if len(insert_rows) > 0:
result = ExecInsert(database, insert_sql, insert_rows)
if result == 'success':
print("写入深交所上市股票成功.")
else:
raise CustomException("写入深交所上市股票时发生数据库异常:" + result)
# 更新数据库中存在的股票信息
if len(update_rows) > 0:
result = ExecInsert(database, update_sql, update_rows)
if result == 'success':
print("更新深交所上市股票成功.")
else:
raise CustomException("更新深交所上市股票时发生数据库异常:" + result)
print("深交所上市股票更新结束.")
至此,我们实现了第一步:本地化存储两市5000多家股票的基本信息:
stock_list的字段结构如下: