# The code below contains detailed comments.
# -*- coding: utf-8 -*-
import scrapy
from movie.items import MovieItem
class AllMovieSpider(scrapy.Spider):
    """Crawl movie names from the first five list pages of 4567tv.tv and
    each movie's description from its detail page.

    Yields ``MovieItem`` objects with ``movie_name`` (from the list page)
    and ``detail`` (from the detail page).
    """
    name = 'all_movie'
    # allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/1/page/1.html']
    domain = "https://www.4567tv.tv"
    # Generic URL template, valid for page 2 and onward.
    url = "https://www.4567tv.tv/index.php/vod/show/id/1/page/%d.html"
    # Next list page to request (page 1 comes from start_urls).
    page = 2

    def parse(self, response):
        """Parse one list page: request every movie's detail page, then
        paginate through pages 2..5."""
        all_li = response.xpath("//div[@class='stui-pannel_bd']//ul//li")
        for li in all_li:
            movie_name = li.xpath(".//h4/a/text()").get()
            movie_url = self.domain + li.xpath(".//h4/a/@href").get()
            item = MovieItem()
            item['movie_name'] = movie_name
            # Request the detail page; pass the partially-filled item to the
            # callback through meta (the callback reads response.meta['item']).
            yield scrapy.Request(url=movie_url, callback=self.detail_parse,
                                 meta={'item': item})
        # BUG FIX: the original condition was inverted (`if page <= 5: return`
        # stopped pagination immediately) and the follow-up request used
        # `callback=self.page`, which is an int, not a callable. Paginate
        # while page <= 5 and call self.parse on the next list page.
        if self.page <= 5:
            new_url = self.url % self.page
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def detail_parse(self, response):
        """Parse a detail page: fill in the description and yield the item."""
        # Recover the item passed from parse() via the request's meta dict.
        item = response.meta['item']
        detail = response.xpath("//div[@class='stui-content__detail']//p[@class='desc detail hidden-xs']"
                                "//span[@class='detail-content']/text()").get()
        item['detail'] = detail
        yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class MoviePipeline(object):
def __init__(self):
self.conn = pymysql.Connect(host='127.0.0.1',
port=3306,
user='root',
password='root',
db='yju')
self.cursor = self.conn.cursor()
def open_spider(self, spider):
print("爬虫开始了、、、")
def process_item(self, item, spider):
try:
sql = "insert into movie(movie_name, detail) values ('%s', '%s')" % (item['movie_name'], item['detail'])
self.cursor.execute(sql)
self.conn.commit()
except Exception as e:
print("插入数据出现错误")
print("!" * 50)
print(e)
self.conn.rollback()
return item
def close_spider(self, spider):
print("爬虫结束了、、、")