任务描述
在每天的盘前以及盘后的时间爬取上交所和深交所发布的当如的行情数据。
上交所数据源: http://www.sse.com.cn/market/price/report/
深交所数据源: http://www.szse.cn/market/trend/index.html
网站更新规律
上交所在一天之内,实时更新当天的行情数据。
深交所在当天收盘后才会更新当天的数据。
上交所行情
上交所的行情数据可在一个 js 请求中获得。
主要注意的是 js 请求参数的生成:
def get_params(self, begin):
_timestamp = int(time.time()*1000)
data = {
'callback': 'jQuery1124010159023820477953_1589359759776',
'select': 'code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate,cpxxsubtype,cpxxprodusta',
'order': '',
'begin': begin,
'end': begin + 25,
"_": _timestamp, # 当前的一个时间戳
}
param = urlencode(data)
return param
其中, jQuery1124010159023820477953_1589359759776 这一部分是固定的,服务器不会对其进行校验。_ 则是当前的时间戳。
另外,在页面中显示的类型,在接口数据中是以字符的形式给出的,我们需要知道字符与实际含义的对应关系。全局搜索 ”主板A股“ 等关键词,很容易在 js 代码中的得到如下映射关系:
self.sub_type_map = {
"ASH": "主板A股",
"BSH": "主板B股",
"KSH": "科创板",
}
这一对应关系。
主体代码部分如下:
import datetime
import json
import re
import sys
import time
from urllib.parse import urlencode
import requests
sys.path.append('./../')
from exchange_report.base import ReportBase, logger
class SHReport(ReportBase):
"""上交所行情"""
def __init__(self):
super(SHReport, self).__init__()
self.url = 'http://yunhq.sse.com.cn:32041//v1/sh1/list/exchange/equity?'
self.headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'yfx_c_g_u_id_10000042=_ck20020212032112531630665331275; seecookie=%u5149%u4E91%u79D1%u6280%2C%u5149%u4E91%u79D1%u6280%20%u878D%u8D44%u878D%u5238%2C%u878D%u8D44%u878D%u5238%2C688466; VISITED_MENU=%5B%228307%22%2C%228451%22%2C%228312%22%2C%228814%22%2C%228815%22%2C%229807%22%2C%229808%22%2C%228817%22%2C%228431%22%2C%228619%22%2C%228454%22%5D; yfx_f_l_v_t_10000042=f_t_1580616201243__r_t_1589359605854__v_t_1589359759134__r_c_18',
'Host': 'yunhq.sse.com.cn:32041',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/market/price/report/',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}
self.sub_type_map = {
"ASH": "主板A股",
"BSH": "主板B股",
"KSH": "科创板",
}
self.table_name = 'see_dailyquote'
self.fields = ['SecuCode', 'InnerCode', 'SecuAbbr', 'TradingDay', 'Open', 'High', 'Low',
'Last', 'PrevClose', 'ChgRate', 'Volume', 'Amount', 'RiseFall', 'AmpRate', 'CPXXSubType']
def get_params(self, begin):
_timestamp = int(time.time()*1000)
data = {
'callback': 'jQuery1124010159023820477953_1589359759776',
'select': 'code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate,cpxxsubtype,cpxxprodusta',
'order': '',
'begin': begin,
'end': begin + 25,
"_": _timestamp, # 当前的一个时间戳
}
param = urlencode(data)
return param
def _create_table(self):
sql = '''
CREATE TABLE IF NOT EXISTS `{}` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
`SecuCode` varchar(10) DEFAULT NULL COMMENT '证券代码',
`InnerCode` int(11) NOT NULL COMMENT '证券内部编码',
`SecuAbbr` varchar(100) DEFAULT NULL COMMENT '证券简称',
`TradingDay` datetime NOT NULL COMMENT '交易日',
`Open` decimal(10,4) DEFAULT NULL COMMENT '今开盘(元)',
`High` decimal(10,4) DEFAULT NULL COMMENT '最高价(元)',
`Low` decimal(10,4) DEFAULT NULL COMMENT '最低价(元)',
`Last` decimal(10,4) DEFAULT NULL COMMENT '最新价(元)',
`PrevClose` decimal(10,4) DEFAULT NULL COMMENT '前收价(元)',
`ChgRate` decimal(10,4) DEFAULT NULL COMMENT '涨跌幅(%)',
`Volume` decimal(20,0) DEFAULT NULL COMMENT '成交量(股)',
`Amount` decimal(19,4) DEFAULT NULL COMMENT '成交金额(元)',
`RiseFall` decimal(10,4) DEFAULT NULL COMMENT '涨跌',
`AmpRate` decimal(10,4) DEFAULT NULL COMMENT '振幅',
`CPXXSubType` varchar(20) DEFAULT NULL COMMENT '版块类型',
`CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
`UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY `IX_QT_DailyQuote` (`InnerCode`,`TradingDay`),
UNIQUE KEY `PK_QT_DailyQuote` (`ID`),
KEY `IX_QT_DailyQuote_TradingDay` (`TradingDay`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='交易所行情';
'''.format(self.table_name)
client = self._init_pool(self.spider_cfg)
client.insert(sql)
client.dispose()
def select_count(self):
"""获取数据库中当前的插入信息"""
client = self._init_pool(self.spider_cfg)
_today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
sql = '''
select count(*) as total from {} where TradingDay = '{}';
'''.format(self.table_name, _today)
ret = client.select_one(sql).get("total")
msg = "上交所 {} 入库 {} 条 ".format(_today, ret)
return msg
def start(self):
self._create_table()
begin = 0
while True:
url = self.url+self.get_params(begin)
logger.info(url)
resp = requests.get(url, headers=self.headers)
if resp.status_code == 200:
ret = resp.text
ret = re.findall("jQuery\d{22}_\d{13}\((.*)\)", ret)[0]
py_datas = json.loads(ret)
_date = py_datas.get("date")
_time = py_datas.get("time")
trade_day = datetime.datetime.strptime(str(_date), "%Y%m%d")
_total = py_datas.get("total")
_list = py_datas.get("list")
# code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,
# change,amp_rate,cpxxsubtype,cpxxprodusta
if not _list:
# 没有数据的时候 _list 的结果为空
break
client = self._init_pool(self.spider_cfg)
for one in _list:
item = dict()
(item['SecuCode'], # 证券代码 code
item['SecuAbbr'], # 证券简称 name
item['Open'], # 开盘 open
item['High'], # 最高 high
item['Low'], # 最低 low
item['Last'], # 最新 last
item['PrevClose'], # 前收 prev_close
item['ChgRate'], # 涨跌幅 chg_rate(%)
item['Volume'], # 成交量(股) volume, 网页上显示的是 手, 1 手等于 100 股
item['Amount'], # 成交额(元) amount, 网页上是万元
item['TradePhase'], # tradephase
item['RiseFall'], # 涨跌 change --> FIX change 是 mysql 关键字
item['AmpRate'], # 振幅 amp_rate
item['CPXXSubType'],
item['CPXXProdusta']
) = one
# 将类型版块进行转换
item['CPXXSubType'] = self.sub_type_map.get(item['CPXXSubType'])
# TODO 获取聚源让内部编码这一步会拖慢速度
inner_code = self.get_inner_code(item['SecuCode'])
if not inner_code:
raise Exception("No InnerCode.")
item['InnerCode'] = inner_code
item['TradingDay'] = str(trade_day)
# 去掉不需要的字段
item.pop('CPXXProdusta')
item.pop('TradePhase')
# print(item)
self._save(client, item, self.table_name, self.fields)
client.dispose()
else:
raise
begin += 25 # 每页获取 25 个
if __name__ == "__main__":
SHReport().start()
深交所行情
深交所行情有两种获取形式:一种是通过直接解析页面的形式;另外一种是下载文件,再去解析文件。
我这次采用了第二种获取形式。
对于 xlsx 文件的解析,我们可以用到 xlrd 这个库。下载文件可用 urlretrieve 库。详细的用法不再这里赘述。
主体代码部分如下:
import datetime
import os
import random
import urllib
from urllib.request import urlretrieve
import xlrd
from exchange_report.base import ReportBase, logger
class SZReport(ReportBase):
"""深交所行情爬虫"""
def __init__(self):
super(SZReport, self).__init__()
self.fields = ['TradingDay', 'SecuCode', 'InnerCode', 'SecuAbbr', 'PrevClose', 'Close',
'RiseFall', 'Amount', 'PERatio']
self.table_name = 'szse_dailyquote'
self.base_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1815_stock&TABKEY=tab1&txtBeginDate={}&txtEndDate={}&radioClass=00%2C20%2C30&txtSite=all&random={}'
self._today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
self.check_day = self._today - datetime.timedelta(days=1)
def _create_table(self):
sql = '''
CREATE TABLE IF NOT EXISTS `{}` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
`SecuCode` varchar(10) DEFAULT NULL COMMENT '证券代码',
`InnerCode` int(11) NOT NULL COMMENT '证券内部编码',
`SecuAbbr` varchar(100) DEFAULT NULL COMMENT '证券简称',
`TradingDay` datetime NOT NULL COMMENT '交易日',
`PrevClose` decimal(10,4) DEFAULT NULL COMMENT '前收价(元)',
`Close` decimal(10,4) DEFAULT NULL COMMENT '今收(元)',
`Amount` decimal(19,4) DEFAULT NULL COMMENT '成交金额(元)',
`RiseFall` decimal(10,4) DEFAULT NULL COMMENT '涨跌',
`PERatio` decimal(10,4) DEFAULT NULL COMMENT '市盈率',
`CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
`UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY `IX_QT_DailyQuote` (`InnerCode`,`TradingDay`),
UNIQUE KEY `PK_QT_DailyQuote` (`ID`),
KEY `IX_QT_DailyQuote_TradingDay` (`TradingDay`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='交易所行情';
'''.format(self.table_name)
client = self._init_pool(self.spider_cfg)
client.insert(sql)
client.dispose()
def load_xlsx(self, dt: datetime.datetime):
"""
下载某一天的明细文件
:param dt: eg.20200506
:return:
"""
dt = dt.strftime("%Y-%m-%d")
url = self.base_url.format(dt, dt, random.random())
dirname, filename = os.path.split(os.path.abspath(__file__))
file_path = os.path.join(dirname, "./data_dir/{}.xlsx".format(dt))
try:
urlretrieve(url, file_path, self.callbackfunc)
except urllib.error.HTTPError:
logger.warning("不存在这一天的数据{}".format(dt))
except TimeoutError:
logger.warning("超时 {} ".format(dt))
except Exception as e:
logger.warning("下载失败 : {}".format(e))
raise Exception
else:
return file_path
def start(self):
self._create_table()
# 在当天收盘前只能拿到前一天的数据
self.load_xlsx(self.check_day)
self.read_xlsx(self.check_day)
# 在当天收盘后可以拿到今天最新的数据
self.load_xlsx(self._today)
self.read_xlsx(self._today)
def _re_amount(self, amount: str):
"""14,044,875.30 """
return float(amount.replace(',', ''))
def select_count(self):
"""获取数据库中当前的插入信息"""
client = self._init_pool(self.spider_cfg)
sql = '''
select count(*) as total from {} where TradingDay = '{}';
'''.format(self.table_name, self.check_day)
ret = client.select_one(sql).get("total")
sql2 = '''
select count(*) as total from {} where TradingDay = '{}';
'''.format(self.table_name, self._today)
ret2 = client.select_one(sql2).get("total")
msg = "深交所 {} 入库 {} 条 ; {} 入库 {} 条".format(self.check_day, ret, self._today, ret2)
return msg
def read_xlsx(self, dt: datetime.datetime):
dt = dt.strftime("%Y-%m-%d")
dirname, filename = os.path.split(os.path.abspath(__file__))
file_path = os.path.join(dirname, "./data_dir/{}.xlsx".format(dt))
wb = xlrd.open_workbook(file_path)
# sheet_names = wb.sheet_names()
ws = wb.sheet_by_name('股票行情')
_rows = ws.nrows
# print(">>> ", _rows)
if _rows < 10:
logger.warning("{} 当天无数据".format(dt))
return
client = self._init_pool(self.spider_cfg)
for idx in range(1, _rows):
_line = ws.row_values(idx)
# print(_line)
item = dict()
item['TradingDay'] = _line[0] # 交易日期
secu_code = _line[1]
item['SecuCode'] = secu_code # 证券代码
inner_code = self.get_inner_code(secu_code)
if not inner_code:
raise
item['InnerCode'] = inner_code
item['SecuAbbr'] = _line[2] # 证券简称
item['PrevClose'] = float(_line[3]) # 前收
item['Close'] = float(_line[4]) # 今收
item['RiseFall'] = float(_line[5]) # 升跌(%)
amount = _line[6]
item['Amount'] = self._re_amount(amount) # 成交金额(元)
item['PERatio'] = self._re_amount(_line[7]) # 市盈率
self._save(client, item, self.table_name, self.fields)
self.rm_file(file_path)
client.dispose()
if __name__ == "__main__":
SZReport().start()
完整项目代码
https://github.com/furuiyang0715/JustSimpleSpider/tree/master/exchange_report
个人QQ:2564493603
欢迎各位同行交流。