Crawling Qidian novel categories with the Scrapy framework

Spider code

import scrapy


class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        # group the top-level categories
        li_list = response.xpath("//ul[@type='category']/li")[1:]
        for li in li_list:
            item = {}
            item["first_category_title"] = li.xpath("./a/text()").extract_first()
            item["first_category_url"] = "https:" + li.xpath("./a/@href").extract_first()
            # follow each top-level category to its sub-categories
            yield scrapy.Request(
                url=item["first_category_url"],
                callback=self.parse_first_category,
                meta={"item": item}
            )


    def parse_first_category(self, response):
        dd_list = response.xpath("//div[@class='sub-type']/dl[@class='']/dd")
        first_category_title = response.meta["item"]["first_category_title"]
        for dd in dd_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = dd.xpath("./a/text()").extract_first()
            item["second_category_url"] = "https:" + dd.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["second_category_url"],
                callback=self.parse_second_category,
                meta={"item": item}
            )


    def parse_second_category(self, response):
        first_category_title = response.meta["item"]["first_category_title"]
        second_category_title = response.meta["item"]["second_category_title"]
        li_list = response.xpath("//ul[@class='all-img-list cf']/li")

        for li in li_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = second_category_title
            item["book_name"] = li.xpath(".//h4/a/text()").extract_first()
            item["author_name"] = li.xpath(".//p[@class='author']/a[@class='name']/text()").extract_first()
            item["is_end"] = li.xpath(".//span/text()").extract_first()
            item["info"] = li.xpath(".//p[@class='intro']/text()").extract_first().strip()
            item["book_poster_src"] = "http:" + li.xpath(".//div[@class='book-img-box']/a/img/@src").extract_first()
            yield item

        next_url = "https:" + response.xpath("//a[contains(text(), '>')]/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(url=next_url, callback=self.parse_second_category)
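
The spider yields plain dicts. If you prefer typed items, a minimal scrapy.Item definition covering the same fields might look like the sketch below (the class name BookItem and its location are assumptions, not part of the original project):

import scrapy


class BookItem(scrapy.Item):
    # one Field per key the spider currently stores in its dict items
    first_category_title = scrapy.Field()
    second_category_title = scrapy.Field()
    book_name = scrapy.Field()
    author_name = scrapy.Field()
    is_end = scrapy.Field()
    info = scrapy.Field()
    book_poster_src = scrapy.Field()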

Pipelines code

import json
import os

import requests
from p4.settings import USER_AGENT

class P4Pipeline(object):

    def open_spider(self, spider):
        # make sure the output directories exist before writing to them
        os.makedirs("./data/img", exist_ok=True)
        self.fp = open("./data/book.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        json.dump(item, self.fp, ensure_ascii=False)
        self.fp.write("\n")
        self.save_poster(item["book_poster_src"], item["book_name"])
        print(item["book_name"] + " saved to local storage successfully")
        return item

    def close_spider(self, spider):
        self.fp.close()

    def save_poster(self, url, title):
        file_name = "./data/img/" + title + ".jpg"
        with open(file_name, "wb") as f:
            # note the header key is "User-Agent" (hyphen), not "User_Agent"
            f.write(requests.get(url, headers={"User-Agent": USER_AGENT}).content)
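
For the pipeline to run at all, it has to be enabled in the project settings, and the USER_AGENT import above has to resolve. A minimal excerpt, assuming the project is named p4 as the import path suggests (the User-Agent string is an example value, not from the original):

# p4/settings.py (relevant excerpt)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # example value, used by Scrapy and save_poster

ITEM_PIPELINES = {
    "p4.pipelines.P4Pipeline": 300,  # register the pipeline defined above
}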

Main method

from scrapy import cmdline

cmdline.execute("scrapy crawl qidian".split())
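
An equivalent way to launch the spider from a script, without going through cmdline, is Scrapy's CrawlerProcess (an alternative sketch, not the original entry point; it loads the same project settings and runs the spider by name):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up p4/settings.py, including the pipeline
process.crawl("qidian")
process.start()  # blocks until the crawl finishes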

[Screenshots of the crawl results omitted]
