Python: crawling a novel site with Scrapy and saving the results as JSON

The novel site crawled today: https://www.hongxiu.com/all?gender=2&catId=-1

Create the project from a terminal: scrapy startproject hongxiu

Then change into the project directory: cd hongxiu

Next, generate the spider, giving it a name and the target domain: scrapy genspider book hongxiu.com

Run the spider with: scrapy crawl book (the argument is the spider name defined in book.py, not the project name)

After creation, the project structure looks like this:
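Since scrapy genspider book hongxiu.com puts the spider in the spiders folder, the layout at this point should be roughly:

    hongxiu/
        scrapy.cfg                # project configuration file
        hongxiu/
            __init__.py
            items.py              # item field definitions (edited below)
            middlewares.py
            pipelines.py          # item pipelines (JSON export below)
            settings.py           # project settings (pipeline enabled here)
            spiders/
                __init__.py
                book.py           # the spider generated by genspider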

Then open book.py, change the start URL, and import the modules we will use:
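After those edits the top of book.py matches the full listing at the end of this post:

    # -*- coding: utf-8 -*-
    import scrapy
    import re                          # used below to pull catId out of each category link
    from ..items import HongxiuItem    # the item defined in items.py


    class BookSpider(scrapy.Spider):
        name = 'book'
        allowed_domains = ['hongxiu.com']
        start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']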

 

Next we extract the category links we want from the page:

    def parse(self, response):
        # grab the href of every category link
        type_list = response.xpath('//ul[@type="category"]//a/@href').extract()
        del type_list[0]
        for type in type_list:
            # join the relative link onto the site root
            url = 'https://www.hongxiu.com' + type
            # pull the catId out of the query string
            split = re.compile(r'.*?catId=(.*?)&.*?', re.S)
            catId = re.findall(split, url)

            yield scrapy.Request(url=url, meta={'type': catId[0]}, callback=self.get_content_with_type_url)
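Assuming a category link carries a catId=...& pair in its query string (the id 30020 below is the one commented in the full listing's sample URL), the regex pulls the id out like this:

    >>> import re
    >>> url = 'https://www.hongxiu.com/all?gender=2&catId=30020&isFinish=-1'
    >>> re.findall(re.compile(r'.*?catId=(.*?)&.*?', re.S), url)
    ['30020']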

There are a lot of novels, so we only take the links for the first ten pages of each category:

    def get_content_with_type_url(self, response):
        # the category id passed along from parse() via meta
        catId = response.meta['type']
        for page_num in range(1, 11):
            url = 'https://www.hongxiu.com/all?pageNum=' + str(page_num) + '&pageSize=10&gender=2&catId=' + catId + '&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            yield scrapy.Request(url=url, callback=self.get_book_with_url)
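For example, with catId 30020 the first request built by this loop is https://www.hongxiu.com/all?pageNum=1&pageSize=10&gender=2&catId=30020&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0, and only pageNum changes from 1 to 10.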

Get the detail-page link of each novel on a listing page:

    def get_book_with_url(self,response):
        detail_list = response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book_detail in detail_list:
            url = 'https://www.hongxiu.com' + book_detail
            yield scrapy.Request(url=url,callback=self.get_detail_with_url)

From the detail page we extract the novel's title, category, author, collection count, click count, word count, and synopsis:

    def get_detail_with_url(self, response):
        # category, taken from the breadcrumb navigation
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first()
        print(type)

        # title and author from the book-info header
        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first()
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first()
        print(author)

        # the <p class="total"> block holds three <span>/<em> pairs:
        # word count, collection count and click count
        total = response.xpath('//p[@class="total"]/span/text()').extract_first() +\
                response.xpath('//p[@class="total"]/em/text()').extract_first()
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first() +\
               response.xpath('//p[@class="total"]/em[2]/text()').extract_first()
        print(love)
        click = response.xpath('//p[@class="total"]/span[3]/text()').extract_first() +\
                response.xpath('//p[@class="total"]/em[3]/text()').extract_first()
        print(click)
        # synopsis comes back as a list of text nodes
        introduce = response.xpath('//p[@class="intro"]/text()').extract()
        print(introduce)
        # cover image: the src is protocol-relative and may contain a stray '\r'
        url = 'http:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first()
        url = url.replace('\r', '')
        print(url)
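If you prefer the synopsis as a single clean string rather than a list of text nodes, a small optional tweak (assuming the extra nodes are only layout whitespace):

    # optional: collapse the list of text nodes into one string
    introduce = ''.join(part.strip() for part in introduce)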

Then open items.py and define the fields:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class HongxiuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    type = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    total = scrapy.Field()
    love = scrapy.Field()
    click = scrapy.Field()
    introduce = scrapy.Field()
    url = scrapy.Field()
    pass

Back in book.py, at the end of get_detail_with_url we fill an item with these values and yield it:

        item = HongxiuItem()
        item['type'] = type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['click'] = click
        item['introduce'] = introduce
        item['url'] = [url]

        yield item
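The cover URL is stored as a one-element list; that is the shape Scrapy's built-in ImagesPipeline expects for image URLs, so it leaves the door open to downloading the covers later (an assumption about the intent, since this post does not set that pipeline up).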

Next, open pipelines.py and write the JSON export pipeline:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import os
import json


class HongxiuPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='book.json', mode='w', encoding='utf-8')
        # open the enclosing JSON object and the book_list array
        self.file.write('{"book_list":[')

    def process_item(self, item, spider):
        res = dict(item)
        # keep Chinese characters readable instead of \u escapes
        line = json.dumps(res, ensure_ascii=False)
        self.file.write(line)
        self.file.write(',\n')
        return item

    def close_spider(self, spider):
        # the last item left a trailing ',\n'; strip those two characters,
        # then close the array and the object so the file is valid JSON
        self.file.seek(-2, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
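With this pipeline, book.json ends up looking roughly like the sketch below; the values are placeholders, not real scraped data:

    {"book_list":[{"type": "...", "name": "...", "author": "...", "total": "...", "love": "...", "click": "...", "introduce": ["..."], "url": ["http://..."]},
    {"type": "...", "name": "...", "author": "...", "total": "...", "love": "...", "click": "...", "introduce": ["..."], "url": ["http://..."]}]}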

Finally, register the pipeline in settings.py:
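Concretely, that means adding HongxiuPipeline to ITEM_PIPELINES (300 is just the conventional default priority). If requests get filtered by robots.txt, ROBOTSTXT_OBEY can be switched off as well; that last line is an assumption on my part, not something shown in the original settings:

    # settings.py (only the relevant lines)
    ITEM_PIPELINES = {
        'hongxiu.pipelines.HongxiuPipeline': 300,
    }
    # assumption: disable robots.txt checking if pages are not being fetched
    # ROBOTSTXT_OBEY = False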

Below is the complete code for book.py:

# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import HongxiuItem
class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']

    def parse(self, response):
        type_list = response.xpath('//ul[@type="category"]//a/@href').extract()
        del type_list[0]
        for type in type_list:
            url = 'https://www.hongxiu.com' + type
            split = re.compile(r'.*?catId=(.*?)&.*?',re.S)
            catId = re.findall(split,url)

            yield scrapy.Request(url=url,meta={'type':catId[0]},callback=self.get_content_with_type_url)

    def get_content_with_type_url(self,response):
        #https://www.hongxiu.com
        # /all?pageNum=1&pageSize=10&gender=2&catId=30020&isFinish=-1&
        # isVip=-1&size=-1&updT=-1&orderBy=0
        catId = response.meta['type']
        for page_num in range(1,11):

            url = 'https://www.hongxiu.com/all?pageNum=' + str(page_num) + '&pageSize=10&gender=2&catId=' + catId + '&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            # print(url)
            yield scrapy.Request(url=url,callback=self.get_book_with_url)
    def get_book_with_url(self,response):
        detail_list = response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book_detail in detail_list:
            url = 'https://www.hongxiu.com' + book_detail
            yield scrapy.Request(url=url,callback=self.get_detail_with_url)

    def get_detail_with_url(self,response):
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first()
        print(type)

        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first()
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first()
        print(author)

        total = response.xpath('//p[@class="total"]/span/text()').extract_first() +\
                response.xpath('//p[@class="total"]/em/text()').extract_first()
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first() +\
               response.xpath('//p[@class="total"]/em[2]/text()').extract_first()
        print(love)
        click = response.xpath('//p[@class="total"]/span[3]/text()').extract_first() +\
                response.xpath('//p[@class="total"]/em[3]/text()').extract_first()
        print(click)
        introduce = response.xpath('//p[@class="intro"]/text()').extract()
        print(introduce)
        url = 'http:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first()
        url = url.replace('\r','')
        print(url)

        item = HongxiuItem()
        item['type'] = type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['click'] = click
        item['introduce'] = introduce
        item['url'] = [url]

        yield item

The results of the run look like this:

If anyone has a better approach, leave a comment below so we can discuss it...

 
