系列文章目录
待补充
前言
在搭建量化交易系统的过程中,我们需要先获取交易数据,本文来实现日交易数据的爬取。首先实现一个获取交易数据的基础类;其次,基于该基础类实现东财(东方财富)交易数据的获取
提示:以下是本篇文章正文内容,下面案例可供参考
一、基础类实现
TradeDaily的基础类
import json
from typing import Optional
class TradeDaily:
    """Base class for daily trade-data crawlers.

    Subclasses override ``tradeDailyData`` to fetch and persist one trading
    day; ``get_data`` is a shared helper that decodes a JSON response body.
    """

    def tradeDailyData(self, tradeDay, repoistory=None,
                       repoistoryParam: Optional[dict] = None,
                       crawlerParam: Optional[dict] = None):
        """Fetch and store the data of one trading day (no-op in the base class).

        Everything after ``tradeDay`` is optional so that the base signature
        also accepts the one-argument call used by overrides that receive
        their collaborators through the constructor.

        Args:
            tradeDay: the trading day to process.
            repoistory: storage backend; unused by the base implementation.
            repoistoryParam: options for the storage backend; unused here.
            crawlerParam: options for the crawler; unused here.

        Returns:
            None.
        """
        return None

    def get_data(self, response):
        """Decode the JSON document carried in ``response.text``.

        Args:
            response: any object exposing the body as a ``text`` string.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: if the body is not valid JSON.
        """
        return json.loads(response.text)
二、实现日交易数据爬取
1.实现日交易数据爬取
import requests
import jsonpath
import pandas as pd
import math
from crawler.base.tradeDaily import TradeDaily
'''
https://data.eastmoney.com/bbsj/lrb.html #利润表
https://data.eastmoney.com/stockcomment/ #千股千评
'''
class EmPcTradeDaily(TradeDaily):
def __init__(self, repoistoryOb, reParam: dict, crParam: dict):
    """Keep the collaborators that the crawl needs later.

    Args:
        repoistoryOb: storage backend; ``tradeDailyData`` calls its
            ``saveData`` method.
        reParam: options passed to ``saveData`` as ``repoistoryParam``.
        crParam: crawler options, stored as ``crawlerParam``.
    """
    # Crawler-side settings.
    self.crawlerParam = crParam
    # Storage-side collaborator and its settings.
    self.repoistoryParam = reParam
    self.repoistoryObject = repoistoryOb
'''
f2:最新价 f3:涨跌幅 f4:涨跌额 f5:成交量 f6:成交额 f7:振幅 f8:换手率 f9:市盈率(动态) f10:量比
f12:股票代码 f13:市场(90、) f14:股票名称 f15:最高价 f16:最低价 f17:今天开盘价格 f18:昨日收盘价
无f19 f20:总市值 f21:流通市值 f23:市净率 f62:主力净流出
'''
def get_detail_response(self, page_num, timeout=10):
    """Request one page of the Eastmoney quote list.

    The query string asks for 200 rows per page (``pz=200``) ordered by
    ``f12`` and for the ``f*`` fields named in its ``fields`` parameter.

    Args:
        page_num: 1-based page number, sent as the ``pn`` parameter.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``requests`` response object, not yet decoded.
    """
    url = (
        'http://29.push2.eastmoney.com/api/qt/clist/get?pn=' + str(page_num)
        + '&pz=200&po=0&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f12'
        '&fs=m%3A0+t%3A6%2Cm%3A0+t%3A80%2Cm%3A1+t%3A2%2Cm%3A1+t%3A23%2Cm%3A0+t%3A81+s%3A2048'
        '&stat=1%5E'
        '&fields=f1%2Cf2%2Cf3%2Cf4%2Cf5%2Cf6%2Cf7%2Cf8%2Cf9%2Cf10%2Cf12%2Cf13%2Cf14%2Cf15%2Cf16%2Cf17%2Cf18%2Cf20%2Cf21%2Cf23%2Cf24%2Cf25%2Cf22%2Cf11%2Cf62%2Cf128%2Cf136%2Cf115%2Cf152'
        '&_=1649293499568'
    )
    return requests.get(url, timeout=timeout)
'''
"SECURITY_CODE":"600532",
"SECUCODE":"600532.SH",
"SECURITY_NAME_ABBR":"未来股份",
"ORG_CODE":"10002612",
"TRADE_MARKET":"069001001001",
"BOARD_CODE":"016011",
"BOARD_NAME":"煤炭行业",
"ORIG_BOARD_CODE":"437",
"TOTAL_MARKET_CAP":7586166084,
"NOTLIMITED_MARKETCAP_A":7586166084,
"CLOSE_PRICE":14.7,
"CHANGE_RATE":0,
"TOTAL_SHARES":516065720,
"FREE_SHARES_A":516065720,
"PE_TTM":-8374.35511122,
"PE_LAR":346.36570245,
"PB_MRQ":4.06484132,
"PCF_OCF_LAR":11.16401607,
"PCF_OCF_TTM":7.8371549,
"PS_TTM":3.64534103,
"PEG_CAR":1899.40065664655,
"TRADE_DATE":"2022-05-27 00:00:00"
'''
def get_industry_response(self, date_str, timeout=10):
    """Request the RPT_VALUEANALYSIS_DET report for one trading day.

    Args:
        date_str: trading day, inserted verbatim into the
            ``TRADE_DATE='...'`` filter of the query string.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``requests`` response object, not yet decoded.
    """
    # The published URL read `columns=ALL"eColumns=`: an HTML-entity mangling
    # of `&quoteColumns=` that glued two parameters into one invalid value.
    url = (
        'https://datacenter-web.eastmoney.com/api/data/v1/get'
        '?reportName=RPT_VALUEANALYSIS_DET&columns=ALL&quoteColumns=&source=WEB&client=WEB'
        '&sortColumns=PE_TTM&sortTypes=1'
        '&filter=(TRADE_DATE%3D%27' + date_str + '%27)&_=1653800608054'
    )
    response = requests.get(url, timeout=timeout)
    print('请求URL='+response.url)
    return response
def get_stock_info_detail(self) -> pd.DataFrame:
    """Download every page of the quote list and return it as one DataFrame.

    Pages are requested through ``self.get_detail_response`` and decoded with
    ``self.get_data`` until the number of rows collected reaches the
    ``total`` reported in the payload. Counting the rows actually received,
    instead of assuming 200 rows per page, keeps the result complete when
    the server returns fewer rows per page than were requested.

    Returns:
        DataFrame with the columns stock_code, stock_name, close, pct_chg,
        change, volume, amount, high, low, open, pre_close, turnover_rate
        and quantity_ratio. ``'-'`` values are replaced with 0 and fully
        duplicated rows are dropped.

    Raises:
        ValueError: if the first page carries no rows.
        KeyError: if a page lacks one of the expected fields.
    """
    rename_columns = {
        'f12': 'stock_code', 'f14': 'stock_name', 'f2': 'close', 'f3': 'pct_chg',
        'f4': 'change', 'f5': 'volume', 'f6': 'amount', 'f15': 'high', 'f16': 'low',
        'f17': 'open', 'f18': 'pre_close', 'f8': 'turnover_rate', 'f9': 'stock_pe_ttm',
        'f23': 'stock_pb_mrq', 'f10': 'quantity_ratio',
    }
    filter_columns = [
        'stock_code', 'stock_name', 'close', 'pct_chg', 'change', 'volume', 'amount',
        'high', 'low', 'open', 'pre_close', 'turnover_rate', 'quantity_ratio',
    ]

    frames = []
    fetched = 0
    page_num = 1
    while True:
        response = self.get_detail_response(page_num)
        payload = self.get_data(response).get('data') or {}
        rows = payload.get('diff') or []
        if not rows:
            if page_num == 1:
                raise ValueError('quote list API returned no rows on the first page')
            # An empty later page means the listing is exhausted.
            break
        page = pd.DataFrame(rows).rename(columns=rename_columns)
        frames.append(page.loc[:, filter_columns])
        fetched += len(rows)
        # `total` is re-read on every page so a changing listing is honoured.
        if fetched >= payload['total']:
            break
        page_num += 1

    # One concat at the end instead of re-copying the accumulated frame per page.
    stock_detail_info = pd.concat(frames)
    # The original note attributes '-' placeholders to suspended and delisted
    # stocks; they are replaced with 0, the rows themselves are kept.
    stock_detail_info = stock_detail_info.replace('-', 0)
    print('股票明细数量:', len(stock_detail_info))
    stock_detail_info = stock_detail_info.drop_duplicates()
    print('股票明细数量:', len(stock_detail_info))
    return stock_detail_info
'''
"SECURITY_CODE":"600532",
"SECUCODE":"600532.SH",
"SECURITY_NAME_ABBR":"未来股份",
"ORG_CODE":"10002612",
"TRADE_MARKET":"069001001001",
"BOARD_CODE":"016011",
"BOARD_NAME":"煤炭行业",
"ORIG_BOARD_CODE":"437",
"TOTAL_MARKET_CAP":7586166084,
"NOTLIMITED_MARKETCAP_A":7586166084,
"CLOSE_PRICE":14.7,
"CHANGE_RATE":0,
"TOTAL_SHARES":516065720,
"FREE_SHARES_A":516065720,
"PE_TTM":-8374.35511122,
"PE_LAR":346.36570245,
"PB_MRQ":4.06484132,
"PCF_OCF_LAR":11.16401607,
"PCF_OCF_TTM":7.8371549,
"PS_TTM":3.64534103,
"PEG_CAR":1899.40065664655,
"TRADE_DATE":"2022-05-27 00:00:00"
'''
def save_industry_Info(self, tradeDay) -> pd.DataFrame:
    """Fetch the per-stock valuation records of one trading day.

    Despite the name nothing is stored here: the report is requested through
    ``self.get_industry_response``, decoded with ``self.get_data`` and
    reshaped into a small DataFrame.

    Args:
        tradeDay: trading day passed on to ``get_industry_response``.

    Returns:
        DataFrame with the columns board_code, stock_code, market_code and
        stock_pe_ttm, without fully duplicated rows. ``market_code`` is the
        two characters at positions 7-8 of ``SECUCODE`` (``'SH'`` for
        ``'600532.SH'``).

    Raises:
        ValueError: if the response carries no records for ``tradeDay``.
    """
    # NOTE(review): only one response is requested; if the endpoint paginates,
    # later pages are not fetched - confirm against the API.
    page_num = 1
    rename_cols = {'SECURITY_CODE': 'stock_code', 'BOARD_CODE': 'board_code', 'PE_TTM': 'stock_pe_ttm'}
    print('查询行业开始处理{0}日,第{1}页处理数据'.format(tradeDay, page_num))
    response = self.get_industry_response(tradeDay)
    data = self.get_data(response)

    # A missing/null `result` (presumably what the API sends when it has no
    # data for the day - TODO confirm) used to surface as an opaque TypeError.
    result = data.get('result') or {}
    data_list = result.get('data')
    if not data_list:
        raise ValueError('no valuation data returned for trade day {0}'.format(tradeDay))

    industry_info = pd.DataFrame(data_list).rename(columns=rename_cols)
    industry_info['market_code'] = industry_info['SECUCODE'].str[7:9]
    industry_info = industry_info.loc[:, ['board_code', 'stock_code', 'market_code', 'stock_pe_ttm']]
    industry_info = industry_info.drop_duplicates()
    print('行业数量:', len(industry_info))
    return industry_info
def get_stock_median(self, stock_info: pd.DataFrame) -> pd.DataFrame:
    """Compute the median and mean P/E of every board.

    Only rows with a strictly positive ``stock_pe_ttm`` take part. Both
    statistics are rounded to two decimals.

    Args:
        stock_info: DataFrame with at least the columns ``board_code`` and
            ``stock_pe_ttm``.

    Returns:
        DataFrame with the columns board_code, median_pe and average_pe, one
        row per board, ordered by board_code. It is empty (same columns)
        when no row has a positive P/E.
    """
    positive = stock_info[stock_info['stock_pe_ttm'] > 0]
    if positive.empty:
        median_info = pd.DataFrame(columns=['board_code', 'median_pe', 'average_pe'])
    else:
        # Group by the key itself, not by a one-element list: iterating
        # groupby(['board_code']) yields 1-tuple keys on pandas 2.x, which put
        # tuples into board_code and broke the later merge on that column.
        # astype(float) keeps the aggregation working for an object-typed column.
        pe_values = positive['stock_pe_ttm'].astype(float)
        median_info = (
            pe_values.groupby(positive['board_code'])
            .agg(median_pe='median', average_pe='mean')
            .round(2)
            .reset_index()
        )
    # Group keys are unique, so the former de-duplication pass was a no-op.
    print('行业中位市盈率数量:', len(median_info))
    return median_info
def tradeDailyData(self, tradeDay):
    """Crawl one trading day and hand the merged result to the repository.

    Steps: load the quote list and stamp it with ``trade_date``; load the
    per-stock industry records; attach the per-board P/E statistics to them;
    left-join everything onto the quotes by ``stock_code``; pass the result
    to ``self.repoistoryObject.saveData``.

    Args:
        tradeDay: trading day, written into the ``trade_date`` column and
            passed to ``save_industry_Info``.
    """
    quotes = self.get_stock_info_detail().assign(trade_date=tradeDay)

    industry = self.save_industry_Info(tradeDay)
    board_stats = self.get_stock_median(industry)
    # Left joins: every industry row, and then every quote row, is kept even
    # when the other side has no match.
    industry_with_stats = industry.merge(board_stats, how='left', on='board_code')
    merged = quotes.merge(industry_with_stats, how='left', on='stock_code')

    print('开始插入数据库')
    self.repoistoryObject.saveData(merged, repoistoryParam=self.repoistoryParam)
2.东财日交易数据数据库脚本
-- quantdata.trade_daily_empc definition
-- One row per (stock_code, trade_date), as enforced by the primary key.
-- Numeric columns are DECIMAL(M,D) instead of FLOAT(M,D): FLOAT is single
-- precision (about 7 significant digits), so large values such as `amount`
-- and `volume` were stored inexactly, and the FLOAT(M,D) syntax is deprecated
-- as of MySQL 8.0.17. M and D are unchanged, so the accepted range is the same.
-- Two column COMMENTs that did not describe their column were corrected
-- (`pb_percentile`, `stock_ps_percentile`).
CREATE TABLE `trade_daily_empc` (
  `stock_name` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '股票名称',
  `stock_code` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '股票指数代码',
  `trade_date` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '交易日',
  `open` decimal(10,2) DEFAULT NULL COMMENT '开盘点位',
  `high` decimal(10,2) DEFAULT NULL COMMENT '最高点位',
  `low` decimal(10,2) DEFAULT NULL COMMENT '最低点位',
  `close` decimal(10,2) DEFAULT NULL COMMENT '收盘点位',
  `pre_close` decimal(10,2) DEFAULT NULL COMMENT '昨日收盘点',
  `change` decimal(10,2) DEFAULT NULL COMMENT '涨跌点',
  `pct_chg` decimal(10,2) DEFAULT NULL COMMENT '涨跌幅(%)',
  `volume` decimal(20,2) DEFAULT NULL COMMENT '成交量(手)',
  `amount` decimal(20,2) DEFAULT NULL COMMENT '成交额(千)',
  `quantity_ratio` decimal(50,2) DEFAULT NULL COMMENT '量比',
  `turnover_rate` decimal(10,2) DEFAULT NULL COMMENT '换手率',
  `industry_name` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '行业名称',
  `stock_pe_ttm` decimal(20,2) DEFAULT NULL COMMENT '市盈率TTM',
  `stock_pb_mrq` decimal(20,2) DEFAULT NULL COMMENT '市净率',
  `stock_pe_percentile` decimal(10,4) DEFAULT NULL COMMENT '市盈率百分位',
  `stock_pb_percentile` decimal(10,4) DEFAULT NULL COMMENT '市净率百分位',
  `board_code` varchar(20) DEFAULT NULL COMMENT '行业编号',
  `market_code` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '市场',
  `average_pe` decimal(10,4) DEFAULT NULL COMMENT '行业平均市盈率',
  `average_pb` decimal(10,4) DEFAULT NULL COMMENT '平均市净率',
  `pe_percentile` decimal(10,4) DEFAULT NULL COMMENT '行业市盈率百分比',
  `pb_percentile` decimal(10,4) DEFAULT NULL COMMENT '行业市净率百分比',
  `current_year_percent` decimal(10,4) DEFAULT NULL COMMENT '年初至今涨幅',
  `market_capital` decimal(20,4) DEFAULT NULL COMMENT '市值',
  `dividend_yield` decimal(10,4) DEFAULT NULL COMMENT '股息率',
  `stock_ps_percentile` decimal(10,4) DEFAULT NULL COMMENT '市销率百分位',
  `median_pe` decimal(10,4) DEFAULT NULL COMMENT '市盈率中位数',
  PRIMARY KEY (`stock_code`,`trade_date`),
  -- Secondary index for queries that select a whole trading day.
  KEY `inx_tdn_trade_date` (`trade_date`) USING BTREE COMMENT '按日期查询'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='股票日交易PC';
3.调用
代码如下,调用时传入日期可实现获取交易数据,并落入mysql库中
def empctradeDaily(self, tradeDay):
    """Crawl the Eastmoney daily data of ``tradeDay`` into MySQL.

    Builds an ``EmPcTradeDaily`` crawler backed by a ``MysqlRepoistory`` that
    writes to the ``trade_daily_empc`` table, with no crawler options, and
    runs it for the given day.

    Args:
        tradeDay: trading day forwarded to ``tradeDailyData``.
    """
    # Web crawl for the data, MySQL for the storage.
    crawler = EmPcTradeDaily(
        MysqlRepoistory(),
        {'table_name': 'trade_daily_empc'},
        None,
    )
    crawler.tradeDailyData(tradeDay=tradeDay)
总结
以上就是今天要讲的内容,本文主要介绍了爬取交易数据,下章介绍获取其他交易数据