第2章 爬虫系列-爬取交易数据

系列文章目录

待补充


前言

    在构建量化交易系统的过程中,我们需要先获取交易数据。本章实现日交易数据的爬取:首先实现一个获取交易数据的基础类;其次,基于该基础类实现东财(东方财富)交易数据的获取。


提示:以下是本篇文章正文内容,下面案例可供参考

一、基础类实现

TradeDaily的基础类

import json

class TradeDaily:
    """Base class for daily trade-data crawlers.

    Subclasses override ``tradeDailyData``; ``get_data`` is a shared helper
    that decodes a JSON response body.
    """

    def tradeDailyData(self, tradeDay, repoistory, repoistoryParam: dict, crawlerParam: dict):
        """Hook for subclasses; the base implementation does nothing.

        Returns:
            None.
        """
        return None

    def get_data(self, response):
        """Parse the body of *response* as JSON.

        Args:
            response: Any object exposing the body as a ``text`` string
                attribute.

        Returns:
            The decoded JSON value.
        """
        return json.loads(response.text)

二、实现日交易数据爬取

1.实现日交易数据爬取

import requests
import jsonpath
import pandas as pd
import math
from crawler.base.tradeDaily import TradeDaily

'''
https://data.eastmoney.com/bbsj/lrb.html   #利润表
https://data.eastmoney.com/stockcomment/   #千股千评

'''
class EmPcTradeDaily(TradeDaily):
    """Daily trade-data crawler for the Eastmoney PC web endpoints.

    Downloads the paged quote list and the per-stock valuation/industry
    records, joins them on ``stock_code`` and passes the result to a
    repository object for storage.
    """

    # Seconds to wait for the server. Without a timeout ``requests`` can block
    # indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, repoistoryOb, reParam: dict, crParam: dict):
        """Store the collaborators used by ``tradeDailyData``.

        Args:
            repoistoryOb: Object exposing ``saveData(frame, repoistoryParam=...)``.
            reParam: Options forwarded to ``saveData`` as ``repoistoryParam``.
            crParam: Crawler options; stored but not read anywhere in this class.
        """
        self.repoistoryObject = repoistoryOb
        self.repoistoryParam = reParam
        self.crawlerParam = crParam

    def _fetch(self, url):
        """GET *url* with a timeout and raise on an HTTP error status."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail here instead of feeding an error page to the JSON parser.
        response.raise_for_status()
        return response

    def get_detail_response(self, page_num):
        """Request one page of the quote list.

        Field meanings, per the original author's notes (not verified here):
        f2 latest price, f3 percent change, f4 change amount, f5 volume,
        f6 turnover amount, f7 amplitude, f8 turnover rate, f9 P/E (dynamic),
        f10 volume ratio, f12 stock code, f13 market, f14 stock name,
        f15 high, f16 low, f17 today's open, f18 previous close (no f19),
        f20 total market cap, f21 circulating market cap, f23 P/B,
        f62 main-force net outflow.

        Args:
            page_num: 1-based page number sent as the ``pn`` query parameter.

        Returns:
            The ``requests`` response object.
        """
        return self._fetch('http://29.push2.eastmoney.com/api/qt/clist/get?pn=' + str(
            page_num) + '&pz=200&po=0&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f12&fs=m%3A0+t%3A6%2Cm%3A0+t%3A80%2Cm%3A1+t%3A2%2Cm%3A1+t%3A23%2Cm%3A0+t%3A81+s%3A2048&stat=1%5E&fields=f1%2Cf2%2Cf3%2Cf4%2Cf5%2Cf6%2Cf7%2Cf8%2Cf9%2Cf10%2Cf12%2Cf13%2Cf14%2Cf15%2Cf16%2Cf17%2Cf18%2Cf20%2Cf21%2Cf23%2Cf24%2Cf25%2Cf22%2Cf11%2Cf62%2Cf128%2Cf136%2Cf115%2Cf152&_=1649293499568')

    def get_industry_response(self, date_str):
        """Request the valuation/industry report for one trade date.

        A record looks like this (sample kept from the original notes):
            "SECURITY_CODE": "600532", "SECUCODE": "600532.SH",
            "BOARD_CODE": "016011", "PE_TTM": -8374.35511122,
            "TRADE_DATE": "2022-05-27 00:00:00", ...

        Args:
            date_str: Trade date string inserted verbatim into the
                ``TRADE_DATE`` filter of the query.

        Returns:
            The ``requests`` response object.
        """
        response = self._fetch(
            'https://datacenter-web.eastmoney.com/api/data/v1/get?reportName=RPT_VALUEANALYSIS_DET&columns=ALL&quoteColumns=&source=WEB&client=WEB&sortColumns=PE_TTM&sortTypes=1&filter=(TRADE_DATE%3D%27'+date_str+'%27)&_=1653800608054')
        print('请求URL='+response.url)
        return response

    def get_stock_info_detail(self) -> pd.DataFrame:
        """Download every page of the quote list into one DataFrame.

        Returns:
            A frame with the columns listed in ``filter_columns`` below,
            '-' placeholders replaced by 0 and duplicate rows removed.

        Raises:
            ValueError: If the first page contains no rows.
        """
        rename_columns = {
            'f12': 'stock_code', 'f14': 'stock_name', 'f2': 'close',
            'f3': 'pct_chg', 'f4': 'change', 'f5': 'volume', 'f6': 'amount',
            'f15': 'high', 'f16': 'low', 'f17': 'open', 'f18': 'pre_close',
            'f8': 'turnover_rate', 'f9': 'stock_pe_ttm', 'f23': 'stock_pb_mrq',
            'f10': 'quantity_ratio',
        }
        # stock_pe_ttm / stock_pb_mrq are renamed above but not kept here, so
        # they are dropped from the result.
        filter_columns = ['stock_code', 'stock_name', 'close', 'pct_chg', 'change',
                          'volume', 'amount', 'high', 'low', 'open', 'pre_close',
                          'turnover_rate', 'quantity_ratio']

        frames = []
        fetched = 0
        page_num = 1
        while True:
            response = self.get_detail_response(page_num)
            payload = self.get_data(response).get('data') or {}
            rows = payload.get('diff') or []
            if not rows:
                # Null/empty page: nothing more to read.
                break
            page = pd.DataFrame(rows).rename(columns=rename_columns)
            frames.append(page.loc[:, filter_columns])
            fetched += len(rows)
            # Count the rows actually returned instead of assuming the server
            # honours the requested page size (pz=200); a smaller page would
            # otherwise cause rows to be skipped silently.
            if fetched >= payload.get('total', 0):
                break
            page_num += 1

        if not frames:
            raise ValueError('quote list request returned no rows')

        # One concat at the end avoids re-copying the accumulated frame on
        # every page.
        stock_detail_info = pd.concat(frames)
        # The original note says suspended/delisted stocks carry '-'
        # placeholders; they are replaced with 0, rows are not removed.
        stock_detail_info = stock_detail_info.replace('-', 0)
        print('股票明细数量:', len(stock_detail_info))
        stock_detail_info.drop_duplicates(inplace=True)
        print('股票明细数量:', len(stock_detail_info))
        return stock_detail_info

    def save_industry_Info(self, tradeDay) -> pd.DataFrame:
        """Fetch per-stock industry and P/E information for *tradeDay*.

        Despite the name, nothing is saved here; the frame is only returned.

        NOTE(review): only the first response page is read. If the endpoint
        pages its results, later pages are not fetched - confirm against the
        API behaviour.

        Args:
            tradeDay: Trade date string passed to ``get_industry_response``.

        Returns:
            A de-duplicated frame with the columns ``board_code``,
            ``stock_code``, ``market_code`` and ``stock_pe_ttm``.

        Raises:
            ValueError: If the response contains no result rows.
        """
        page_num = 1
        rename_cols = {'SECURITY_CODE': 'stock_code', 'BOARD_CODE': 'board_code',
                       'PE_TTM': 'stock_pe_ttm'}
        print('查询行业开始处理{0}日,第{1}页处理数据'.format(tradeDay,page_num))
        response = self.get_industry_response(tradeDay)
        result = self.get_data(response).get('result') or {}
        data_list = result.get('data') or []
        if not data_list:
            raise ValueError('no industry data returned for trade day {0}'.format(tradeDay))
        industry_info = pd.DataFrame(data_list).rename(columns=rename_cols)
        # SECUCODE looks like "600532.SH" in the sample record, so characters
        # 7-8 are the suffix after the dot.
        industry_info['market_code'] = industry_info['SECUCODE'].str[7:9]
        industry_info = industry_info.loc[:, ['board_code', 'stock_code', 'market_code', 'stock_pe_ttm']]
        industry_info.drop_duplicates(inplace=True)
        print('行业数量:', len(industry_info))
        return industry_info

    def get_stock_median(self, stock_info: pd.DataFrame) -> pd.DataFrame:
        """Compute the median and mean P/E per board from positive-P/E rows.

        Args:
            stock_info: Frame with ``board_code`` and ``stock_pe_ttm`` columns.

        Returns:
            A frame with the columns ``board_code``, ``median_pe`` and
            ``average_pe`` (both rounded to 2 decimals), one row per board.
        """
        positive_pe = stock_info[stock_info['stock_pe_ttm'] > 0]
        # Group by a scalar key: grouping by the one-element list
        # ['board_code'] yields tuple keys when iterating on pandas >= 2.0,
        # which would prevent the later merge on board_code from matching.
        new_rows = [
            [
                board_code,
                round(group['stock_pe_ttm'].median(), 2),
                round(group['stock_pe_ttm'].mean(), 2),
            ]
            for board_code, group in positive_pe.groupby('board_code')
        ]
        median_info = pd.DataFrame(data=new_rows,
                                   columns=['board_code', 'median_pe', 'average_pe'])
        print('行业中位市盈率数量:', len(median_info))
        median_info.drop_duplicates(inplace=True)
        print('行业中位市盈率数量:', len(median_info))
        return median_info

    def tradeDailyData(self, tradeDay, repoistory=None, repoistoryParam=None, crawlerParam=None):
        """Crawl, join and store the trade data for *tradeDay*.

        The three optional parameters mirror the base-class signature so the
        method can also be called through ``TradeDaily``; when omitted, the
        values given to the constructor are used.

        Args:
            tradeDay: Trade date string; written to the ``trade_date`` column
                and used to query the industry report.
            repoistory: Optional repository overriding the constructor's one.
            repoistoryParam: Optional options overriding the constructor's.
            crawlerParam: Accepted for signature compatibility; not used.
        """
        repo = self.repoistoryObject if repoistory is None else repoistory
        repo_param = self.repoistoryParam if repoistoryParam is None else repoistoryParam

        stock_detail_info = self.get_stock_info_detail()
        stock_detail_info = stock_detail_info.assign(trade_date=tradeDay)
        industry_info = self.save_industry_Info(tradeDay)
        median_info = self.get_stock_median(industry_info)
        industry_median_info = pd.merge(industry_info, median_info, how='left', on='board_code')
        # Left join keeps every quoted stock even without an industry record.
        info = pd.merge(stock_detail_info, industry_median_info, how='left', on='stock_code')
        print('开始插入数据库')
        repo.saveData(info, repoistoryParam=repo_param)

2.东财日交易数据数据库脚本

-- quantdata.trade_daily_empc definition

-- Daily per-stock trade data crawled from the Eastmoney PC endpoints.
-- One row per (stock_code, trade_date).
-- Numeric columns use DECIMAL(M,D) with the same M,D as before: the
-- FLOAT(M,D) syntax is deprecated since MySQL 8.0.17, and single-precision
-- FLOAT cannot store large amounts / market caps exactly.
CREATE TABLE `trade_daily_empc` (
  `stock_name` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '股票名称',
  `stock_code` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '股票指数代码',
  `trade_date` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '交易日',
  `open` decimal(10,2) DEFAULT NULL COMMENT '开盘点位',
  `high` decimal(10,2) DEFAULT NULL COMMENT '最高点位',
  `low` decimal(10,2) DEFAULT NULL COMMENT '最低点位',
  `close` decimal(10,2) DEFAULT NULL COMMENT '收盘点位',
  `pre_close` decimal(10,2) DEFAULT NULL COMMENT '昨日收盘点',
  `change` decimal(10,2) DEFAULT NULL COMMENT '涨跌点',
  `pct_chg` decimal(10,2) DEFAULT NULL COMMENT '涨跌幅(%)',
  `volume` decimal(20,2) DEFAULT NULL COMMENT '成交量(手)',
  `amount` decimal(20,2) DEFAULT NULL COMMENT '成交额(千)',
  `quantity_ratio` decimal(50,2) DEFAULT NULL COMMENT '量比',
  `turnover_rate` decimal(10,2) DEFAULT NULL COMMENT '换手率',
  `industry_name` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '行业名称',
  `stock_pe_ttm` decimal(20,2) DEFAULT NULL COMMENT '市盈率TTM',
  `stock_pb_mrq` decimal(20,2) DEFAULT NULL COMMENT '市净率',
  `stock_pe_percentile` decimal(10,4) DEFAULT NULL COMMENT '市盈率百分位',
  `stock_pb_percentile` decimal(10,4) DEFAULT NULL COMMENT '市净率百分位',
  `board_code` varchar(20) DEFAULT NULL COMMENT '行业编号',
  `market_code` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '市场',
  `average_pe` decimal(10,4) DEFAULT NULL COMMENT '行业平均市盈率',
  `average_pb` decimal(10,4) DEFAULT NULL COMMENT '平均市净率',
  `pe_percentile` decimal(10,4) DEFAULT NULL COMMENT '行业市盈率百分比',
  -- Label aligned with pe_percentile; it previously repeated the average_pb label.
  `pb_percentile` decimal(10,4) DEFAULT NULL COMMENT '行业市净率百分比',
  `current_year_percent` decimal(10,4) DEFAULT NULL COMMENT '年初至今涨幅',
  `market_capital` decimal(20,4) DEFAULT NULL COMMENT '市值',
  `dividend_yield` decimal(10,4) DEFAULT NULL COMMENT '股息率',
  -- Label aligned with the other stock_*_percentile columns.
  `stock_ps_percentile` decimal(10,4) DEFAULT NULL COMMENT '市销率百分位',
  `median_pe` decimal(10,4) DEFAULT NULL COMMENT '市盈率中位数',
  PRIMARY KEY (`stock_code`,`trade_date`),
  -- Secondary index for queries by trade date.
  KEY `inx_tdn_trade_date` (`trade_date`) USING BTREE COMMENT '按日期查询'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='股票日交易PC';

3.调用

代码如下。调用时传入交易日期,即可获取当日交易数据并写入 MySQL 数据库:

    def empctradeDaily(self, tradeDay):
        """Crawl the Eastmoney PC trade data for *tradeDay* and store it.

        Builds an ``EmPcTradeDaily`` crawler backed by a ``MysqlRepoistory``
        that targets the ``trade_daily_empc`` table, then runs it.
        """
        storage_options = {'table_name': 'trade_daily_empc'}
        # Crawl over the network, persist through the MySQL repository.
        crawler = EmPcTradeDaily(
            repoistoryOb=MysqlRepoistory(),
            reParam=storage_options,
            crParam=None,
        )
        crawler.tradeDailyData(tradeDay=tradeDay)


总结

以上就是今天要讲的内容。本文主要介绍了如何爬取日交易数据,下一章将介绍如何获取其他交易数据。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值