16. python爬虫——基于scrapy爬取多页面新闻信息，通过请求传参完成持久化存储

最新推荐文章于 2024-03-14 06:19:07 发布

辰阳星宇

最新推荐文章于 2024-03-14 06:19:07 发布

阅读量744

点赞数 1

分类专栏： # 网络爬虫文章标签： python 大数据

本文链接：https://blog.csdn.net/qq_41094332/article/details/108387340

版权

网络爬虫专栏收录该内容

20 篇文章 4 订阅

订阅专栏

python爬虫——基于scrapy通过请求传参完成持久化存储

1、介绍
【前置准备】
2、分析

1、介绍

请求传参

使用场景：要爬取解析的数据不在同一张页面中时（需要进行深度爬取），通过请求传参把上一级页面已解析的数据传递给下一级页面的回调函数。
需求：爬取海外网中的新闻标题和新闻详细内容
http://hk.haiwainet.cn/news/

【前置准备】

构建scrapy框架工程文件，设置完settings.py配置信息

2、分析

（1）获取单页面情况

打开网页，获取新闻标题和新闻详细内容
在这里插入图片描述
首先，获取各标签下的新闻标题，xpath：
/html/body/div[2]/div[3]/div[1]/ul/li/a/text()

然后，获取详情页的链接地址，xpath：
/html/body/div[2]/div[3]/div[1]/ul/li/a/@href

之后进入详情页，获取详情内容
在这里插入图片描述
获取页面内容：
//*[@id="cen"]//text()

编写代码

import scrapy


class BossSpider(scrapy.Spider):
    """Crawl the news list page and print the text of every linked article.

    ``parse`` handles the list page named in ``start_urls``; each article
    link found there is requested and handed to ``parse_detail``.
    """

    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    def parse_detail(self, response):
        """Print all text nodes under the element whose id is ``cen``.

        Args:
            response: The downloaded article (detail) page.
        """
        detail_page = ''.join(response.xpath('//*[@id="cen"]//text()').extract())
        print(detail_page)

    def parse(self, response):
        """Print every news title and request every detail page.

        Args:
            response: The downloaded list page.

        Yields:
            One ``scrapy.Request`` per ``<li>`` entry that carries a link,
            with ``parse_detail`` as its callback.
        """
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')

        for ul in ul_list:
            for li in ul.xpath('./li'):
                news_name = li.xpath('./a/text()').extract_first()
                print(news_name)
                detail_url = li.xpath('./a/@href').extract_first()
                if not detail_url:
                    # No link in this entry: nothing to follow.
                    continue
                # The request must be issued once per <li>; yielding after
                # the loop would only follow the last entry of each <ul>.
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the list page.
                yield scrapy.Request(
                    response.urljoin(detail_url),
                    callback=self.parse_detail,
                )

（2）请求传参

settings.py

# Enable the project's item pipeline. The integer is the pipeline's order
# value in Scrapy's ITEM_PIPELINES setting (lower values run first).
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,
}

items.py

import scrapy


class BossproItem(scrapy.Item):
    """Container for one scraped news entry: its title and its body text."""
    # Title text taken from the link on the news list page.
    news_name = scrapy.Field()
    # Joined text of the article (detail) page.
    detail_page = scrapy.Field()

boss.py

import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    """Crawl the news list page and emit one item per article.

    The title is read on the list page and the body on the detail page, so
    the partially filled item is passed to the detail callback through the
    request's ``meta`` dict.
    """

    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    def parse_detail(self, response):
        """Complete the item received via ``meta`` with the article text.

        Args:
            response: The downloaded detail page; ``response.meta['item']``
                holds the item created in ``parse``.

        Yields:
            The item, now with ``detail_page`` filled in.
        """
        item = response.meta['item']
        detail_page = ''.join(response.xpath('//*[@id="cen"]//text()').extract())
        item['detail_page'] = detail_page
        print(detail_page)

        yield item

    def parse(self, response):
        """Create an item per news entry and request its detail page.

        Args:
            response: The downloaded list page.

        Yields:
            One ``scrapy.Request`` per ``<li>`` entry that carries a link.
            Each request has ``parse_detail`` as its callback and carries
            its own item in ``meta``.
        """
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')

        for ul in ul_list:
            for li in ul.xpath('./li'):
                item = BossproItem()
                news_name = li.xpath('./a/text()').extract_first()
                item['news_name'] = news_name
                print(news_name)
                detail_url = li.xpath('./a/@href').extract_first()
                if not detail_url:
                    # No link in this entry: nothing to follow.
                    continue
                # Manually send a request for the detail page. The meta dict
                # is handed to the callback of this request. The yield must
                # sit inside the <li> loop, otherwise only the last entry of
                # each <ul> would ever be requested.
                yield scrapy.Request(
                    response.urljoin(detail_url),
                    callback=self.parse_detail,
                    meta={'item': item},
                )

pipelines.py

class BossproPipeline:
    """Item pipeline that echoes every item it receives to stdout."""

    def process_item(self, item, spider):
        """Print ``item`` and return it unchanged.

        Args:
            item: The scraped item passed in by the framework.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` object.
        """
        print(item)
        return item

（3）获取多页信息

import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    """Crawl several news list pages and emit one item per article.

    The title is read on a list page and the body on the detail page, so the
    partially filled item travels to the detail callback in the request's
    ``meta`` dict. After a list page is processed, the next list page is
    requested while ``page_num`` is at most 3.
    """

    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    # URL template for the list pages after the first one.
    url = 'http://hk.haiwainet.cn/news/%d.html'
    # Number of the next list page to request.
    page_num = 2

    def parse_detail(self, response):
        """Complete the item received via ``meta`` with the article text.

        Args:
            response: The downloaded detail page; ``response.meta['item']``
                holds the item created in ``parse``.

        Yields:
            The item, now with ``detail_page`` filled in.
        """
        item = response.meta['item']
        detail_page = ''.join(response.xpath('//*[@id="cen"]//text()').extract())
        item['detail_page'] = detail_page
        print(detail_page)

        yield item

    def parse(self, response):
        """Parse the titles on a list page and schedule follow-up requests.

        Args:
            response: A downloaded list page.

        Yields:
            One detail-page ``scrapy.Request`` per ``<li>`` entry that
            carries a link (item passed in ``meta``), followed by a request
            for the next list page while ``page_num`` is at most 3.
        """
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')

        for ul in ul_list:
            for li in ul.xpath('./li'):
                item = BossproItem()
                news_name = li.xpath('./a/text()').extract_first()
                item['news_name'] = news_name
                print(news_name)
                detail_url = li.xpath('./a/@href').extract_first()
                if not detail_url:
                    # No link in this entry: nothing to follow.
                    continue
                # Manually send a request for the detail page. The meta dict
                # is handed to the callback of this request. The yield must
                # sit inside the <li> loop, otherwise only the last entry of
                # each <ul> would ever be requested.
                yield scrapy.Request(
                    response.urljoin(detail_url),
                    callback=self.parse_detail,
                    meta={'item': item},
                )

        # Pagination: request the next list page and parse it with this
        # same method.
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1

            yield scrapy.Request(new_url, callback=self.parse)