Scrapy 使用

一:使用

import scrapy
from stock.items import StockItem, StockDetailItem


class ThsSpider(scrapy.Spider):
    """Crawl the 10jqka (同花顺) stock ranking pages.

    ``parse`` handles page 1 and schedules further ranking pages; both it and
    ``next_parse`` fill ``code_item_dict`` (stock code -> row dict) from the
    paginated ranking table. ``stock_detail_parse`` fills
    ``code_stock_detail_dict`` (stock code -> company-detail dict) when detail
    crawling is enabled.
    """

    name = 'ths'
    # Let 401 responses reach the callback instead of being dropped by Scrapy.
    handle_httpstatus_list = [401]
    allowed_domains = ['q.10jqka.com.cn', 'stockpage.10jqka.com.cn']
    base_url = "https://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/"
    end_url = "/ajax/1/"
    url = base_url + "1" + end_url
    start_urls = [url]
    # stock code -> row dict scraped from the ranking table
    code_item_dict = dict()
    # stock code -> company-detail dict
    code_stock_detail_dict = dict()
    # 1-based ranking-table column position -> field name
    index_name_dict = {
        1: "id",
        2: "code",
        3: "name",
        4: "price",
        5: "quota_change",
        6: "up_and_down",
        7: "up_speed",
        8: "change_hand",
        9: "quantity_ratio",
        10: "amplitude",
        11: "turnover",
        12: "outstanding_shares",
        13: "circulating_market_value",
        14: "pe_ratio"
    }
    # 1-based <dd> position on the company-detail page -> field name
    stock_detail_dict = {
        1: 'area',
        2: 'involves_concepts',
        3: 'main_business',
        4: 'listing_date',
        5: 'net_assets_per_share',
        6: 'eps',
        7: 'net_profit',
        8: 'net_profit_rate',
        9: 'operating_income',
        10: 'cash_flow_per_share',
        11: 'provident_fund_per_share',
        12: 'undistributed_earnings_per_share',
        13: 'total_share_capital',
        14: 'outstanding_shares'
    }
    # NOTE(review): attribute name is misspelled ("cookied"); kept unchanged
    # because external code may reference it.
    cookied = {
        'spversion': '20130314',
        'searchGuide': 'sg',
        'historystock': '603098%7C*%7C002196%7C*%7C603825',
        'Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1': '1644137307,1644137325,1644499952,1645343560',
        'v': "A84mmjv8I_W-oJQJxzoL-0Q1H6-VT511pBpGAvgcO66NTGARYN_iWXSjlfXL",
    }
    # Browser-like headers; the stray ';' in 'Sec-Fetch-Site' is preserved as-is
    # (runtime value — changing it would alter what is sent to the server).
    headers = {
        'Host': 'q.10jqka.com.cn',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'sec-ch-ua-platform': "Windows",
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'none;',
        'Sec-Fetch-Mode': 'navigate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'spversion=20130314; searchGuide=sg; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1644137307,1644137325,1644499952,1645343560; historystock=603098%7C*%7C002196%7C*%7C603825; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1645538179; v=A-oCTkeIfKUaPvBdSIqnlwDpO1uJW261YN_iWXSjlj3Ip4TFXOu-xTBvMoRH',


    }

    def _collect_ranking_page(self, response):
        """Parse one ranking-table page into ``code_item_dict``.

        Shared by ``parse`` and ``next_parse`` (previously duplicated).
        Uses ``.get()`` instead of ``extract()[0]`` so a missing node yields
        None rather than raising IndexError; rows without a stock code are
        skipped instead of crashing the spider.
        """
        for row in response.xpath("//table[@class='m-table m-pager-table']/tbody/tr"):
            item = dict()
            for index, td in enumerate(row.xpath("./td")):
                # 0-based columns 1 and 2 hold <a> links: stock code and name;
                # the link target is the stock's detail page.
                if index in (1, 2):
                    item[self.index_name_dict[index + 1]] = td.xpath("./a/text()").get()
                    item["url"] = td.xpath("./a/@href").get()
                elif index < 14:
                    item[self.index_name_dict[index + 1]] = td.xpath("./text()").get()
            if item.get("code"):
                self.code_item_dict[item["code"]] = item

    def parse(self, response):
        """Handle ranking page 1 and schedule the remaining pages."""
        self._collect_ranking_page(response)
        # Only page 2 for now; widen the range to crawl more pages.
        for page in range(2, 3):
            next_url = self.base_url + str(page) + self.end_url
            yield scrapy.http.Request(url=next_url, cookies=self.cookied, callback=self.next_parse)
        # Detail crawling is currently disabled:
        # for code, item in self.code_item_dict.items():
        #     yield scrapy.http.Request(url=item["url"], cookies=self.cookied, meta={'meta_1': code},
        #                               callback=self.stock_detail_parse)

    def next_parse(self, response):
        """Handle ranking pages after the first (no further scheduling here)."""
        self._collect_ranking_page(response)

    def stock_detail_parse(self, response):
        """Parse one company-detail page into ``code_stock_detail_dict``.

        The stock code travels in ``response.meta['meta_1']``.
        """
        code = response.meta['meta_1']
        item = dict()
        for index, content in enumerate(response.xpath("//dl[@class='company_details']/dd")):
            if index == 0:
                item[self.stock_detail_dict[index + 1]] = content.xpath("./text()").get()
            elif index == 1:
                item[self.stock_detail_dict[index + 1]] = content.xpath("./@title").get()
            # index 2 is deliberately not mapped in the original logic —
            # TODO(review): confirm against the actual page layout.
            elif index == 3:
                item[self.stock_detail_dict[index]] = content.xpath("./@title").get()
            elif 3 < index <= 14:
                # Upper bound added: the original raised KeyError past entry 14.
                item[self.stock_detail_dict[index]] = content.xpath("./text()").get()
        self.code_stock_detail_dict[code] = item

二:items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class StockItem(scrapy.Item):
    """One row of the 10jqka stock ranking table.

    Field names mirror ``ThsSpider.index_name_dict``.
    """
    # stock code
    code = scrapy.Field()
    # stock name
    name = scrapy.Field()
    # share price
    price = scrapy.Field()
    # price change (%)
    quota_change = scrapy.Field()
    # absolute price change
    up_and_down = scrapy.Field()
    # rate of increase (%)
    up_speed = scrapy.Field()
    # turnover rate (%)
    change_hand = scrapy.Field()
    # volume ratio
    quantity_ratio = scrapy.Field()
    # amplitude (%)
    amplitude = scrapy.Field()
    # trading value
    turnover = scrapy.Field()
    # circulating shares
    outstanding_shares = scrapy.Field()
    # circulating market value
    circulating_market_value = scrapy.Field()
    # price/earnings ratio
    pe_ratio = scrapy.Field()
    # URL of the stock's detail page
    stock_url = scrapy.Field()


class StockDetailItem(scrapy.Item):
    """Company-detail data for one stock (10jqka detail page).

    Field names mirror ``ThsSpider.stock_detail_dict``; ``net_assets_per_share``
    and ``eps`` were missing even though the spider produces those keys
    (assigning them to this item would have raised KeyError).
    """
    # stock code
    code = scrapy.Field()
    # stock name
    name = scrapy.Field()
    # region the company belongs to
    area = scrapy.Field()
    # related market concepts
    involves_concepts = scrapy.Field()
    # main business
    main_business = scrapy.Field()
    # listing date
    listing_date = scrapy.Field()
    # net assets per share (key 5 in ThsSpider.stock_detail_dict)
    net_assets_per_share = scrapy.Field()
    # earnings per share (key 6 in ThsSpider.stock_detail_dict)
    eps = scrapy.Field()
    # net profit
    net_profit = scrapy.Field()
    # net profit growth rate
    net_profit_rate = scrapy.Field()
    # operating income
    operating_income = scrapy.Field()
    # cash flow per share
    cash_flow_per_share = scrapy.Field()
    # capital reserve per share
    provident_fund_per_share = scrapy.Field()
    # undistributed earnings per share
    undistributed_earnings_per_share = scrapy.Field()
    # total share capital
    total_share_capital = scrapy.Field()
    # circulating shares
    outstanding_shares = scrapy.Field()
    # large-order inflow
    large_single_inflow = scrapy.Field()
    # large-order outflow
    large_single_outflow = scrapy.Field()
    # medium-order inflow
    mid_single_inflow = scrapy.Field()
    # medium-order outflow
    mid_single_outflow = scrapy.Field()
    # small-order inflow
    small_single_inflow = scrapy.Field()
    # small-order outflow
    small_single_outflow = scrapy.Field()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值