scrapy中请求传参+数据持久存储

代码中有详细的注释

# -*- coding: utf-8 -*-
import scrapy

from movie.items import MovieItem


class AllMovieSpider(scrapy.Spider):
    """Crawl movie names from the list pages of 4567tv.tv and the movie
    description from each movie's detail page.

    Demonstrates passing a partially-filled item between callbacks via
    ``Request.meta`` ("request meta passing").
    """
    name = 'all_movie'
    # allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/1/page/1.html']
    domain = "https://www.4567tv.tv"

    # Generic URL template for page 2 and onward; only the first 5 list
    # pages are crawled in total (start page + pages 2..5).
    url = "https://www.4567tv.tv/index.php/vod/show/id/1/page/%d.html"
    page = 2

    def parse(self, response):
        """Parse one list page: yield a detail request per movie, then the next page.

        :param response: the list-page response.
        :yields: ``scrapy.Request`` objects for detail pages and the next list page.
        """
        all_li = response.xpath("//div[@class='stui-pannel_bd']//ul//li")
        for li in all_li:
            movie_name = li.xpath(".//h4/a/text()").get()
            movie_url = self.domain + li.xpath(".//h4/a/@href").get()
            item = MovieItem()
            item['movie_name'] = movie_name

            # Hand the partially-filled item to the detail callback: the dict
            # given as ``meta`` is available as ``response.meta`` there.
            yield scrapy.Request(url=movie_url, callback=self.detail_parse, meta={'item': item})

        # BUG FIX: the original returned when ``page <= 5`` (so pagination
        # never ran) and used ``callback=self.page`` — an int, not a callable.
        # Crawl pages 2..5, each parsed by this same method.
        if self.page <= 5:
            new_url = self.url % self.page
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def detail_parse(self, response):
        """Fill in the movie description and yield the completed item.

        :param response: the detail-page response carrying ``meta['item']``.
        :yields: the completed ``MovieItem``.
        """
        # Receive the item passed along from ``parse`` via Request.meta.
        item = response.meta['item']
        detail = response.xpath("//div[@class='stui-content__detail']//p[@class='desc detail hidden-xs']"
                                "//span[@class='detail-content']/text()").get()
        item['detail'] = detail
        yield item
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class MoviePipeline(object):
    """Persist each scraped item into the MySQL ``movie`` table via pymysql."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider moving them to
        # the project settings and using ``from_crawler``.
        self.conn = pymysql.Connect(host='127.0.0.1',
                                    port=3306,
                                    user='root',
                                    password='root',
                                    db='yju')
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        print("爬虫开始了、、、")

    def process_item(self, item, spider):
        """Insert one item; roll back (and keep crawling) on failure.

        :param item: item with ``movie_name`` and ``detail`` fields.
        :returns: the item, unchanged, for any following pipeline.
        """
        try:
            # SECURITY FIX: the original %-interpolated scraped text straight
            # into the SQL string — SQL injection, and it breaks on any quote
            # in a movie description. Let the driver escape via parameters.
            sql = "insert into movie(movie_name, detail) values (%s, %s)"
            self.cursor.execute(sql, (item['movie_name'], item['detail']))
            self.conn.commit()
        except Exception as e:
            print("插入数据出现错误")
            print("!" * 50)
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print("爬虫结束了、、、")
        # RESOURCE FIX: the original leaked the connection; release it here.
        self.cursor.close()
        self.conn.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值