Passing parameters between callbacks in a Python Scrapy spider - Scrapy (3): request parameter passing

Scrapy request parameter passing (meta)
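
When one item's fields are spread across a list page and its detail page, the half-filled item has to travel along with the follow-up request. Scrapy's Request takes a meta dict for exactly this, and the callback gets the same dict back as response.meta. A minimal sketch of the pattern, using placeholder URLs and field names rather than the project below:

import scrapy


class DemoSpider(scrapy.Spider):
    # hypothetical spider, used only to illustrate the meta-based hand-off
    name = 'demo'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        item = {'title': 'filled on the list page'}
        # attach the partially built item to the follow-up request
        yield scrapy.Request(
            url='http://example.com/detail/1',
            callback=self.parse_detail,
            meta={'item': item},
        )

    def parse_detail(self, response):
        # the same dict comes back on the response
        item = response.meta['item']
        item['detail'] = 'filled on the detail page'
        yield item

The real project below follows this exact shape: parse fills post/name/_type, meta carries the item, and parse_detail fills director/design/actor/info.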

1. Define the item data structure (items.py)

# -*- coding: utf-8 -*-
'''
file: items.py
'''
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MovieprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # fields scraped from the first-level (list) page
    post = scrapy.Field()    # movie poster
    name = scrapy.Field()    # movie title
    _type = scrapy.Field()   # movie genre

    # fields scraped from the second-level (detail) page
    director = scrapy.Field()
    design = scrapy.Field()  # screenwriter
    actor = scrapy.Field()
    info = scrapy.Field()

2. The spider file

# -*- coding: utf-8 -*-
import scrapy

from movieproject.items import MovieprojectItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie/']

    url = 'http://www.id97.com/movie/?page={}'
    page = 1

    '''
    (1) Only the page-number links need to be extracted, and only the first list page is in start_urls;
        the remaining pages are built from the url template above.
    (2) Two kinds of requests are yielded: one for each movie's detail page, one for the next list page.
    '''
    def parse(self, response):
        # find every movie div on the list page
        movie_div_list = response.xpath('//div[starts-with(@class,"col-xs-1-5")]')

        # walk through the divs and extract the list-page fields
        for odiv in movie_div_list:
            item = MovieprojectItem()
            # movie poster
            item['post'] = odiv.xpath(".//img/@data-original").extract_first()
            # movie title
            item['name'] = odiv.xpath("./div/div/h1/a/text()").extract_first()
            # movie genre
            item['_type'] = odiv.xpath("./div/div/div/a/text()").extract()

            # link to the detail page
            detail_href = odiv.xpath('./div/a/@href').extract_first()

            '''
            Request the detail page and pass the half-filled item to the
            second-level callback through the Request's meta argument;
            parse_detail receives it there and fills in the remaining fields.
            '''
            yield scrapy.Request(url=detail_href, callback=self.parse_detail, meta={'item': item})

        # crawl the other list pages
        if self.page <= 5:
            self.page += 1
            url = self.url.format(self.page)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse_detail(self, response):
        # first take back the item passed down from the list-page callback
        item = response.meta['item']
        # then extract the rest of the movie information from this page
        # director
        item['director'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr/td[2]/a/text()").extract()
        # screenwriter
        item['design'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr[2]/td[2]/a/text()").extract()
        # leading actors
        item['actor'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr[3]/td[2]/a/text()").extract()
        # synopsis
        item['info'] = response.xpath("//div[@class='col-xs-12 movie-introduce']/p/text()").extract_first()
        # send the finished item to the pipelines
        yield item
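
As an aside, on Scrapy 1.7 and later the same hand-off can also be written with the Request's cb_kwargs argument, which delivers the item as a named parameter of the callback instead of going through response.meta; the meta style used above works on all versions. A small self-contained sketch of that variant (placeholder spider and URLs, not part of this project):

import scrapy


class DemoCbKwargsSpider(scrapy.Spider):
    # hypothetical spider showing the cb_kwargs variant of the hand-off
    name = 'demo_cb_kwargs'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        item = {'name': 'filled on the list page'}
        # entries in cb_kwargs become keyword arguments of the callback
        yield scrapy.Request(
            url='http://example.com/detail/1',
            callback=self.parse_detail,
            cb_kwargs={'item': item},
        )

    def parse_detail(self, response, item):
        item['info'] = 'filled on the detail page'
        yield item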

3. The pipeline file

# -*- coding: utf-8 -*-
'''
file: pipelines.py
'''
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
from scrapy.utils.project import get_project_settings


class MovieprojectPipeline(object):
    def open_spider(self, spider):
        self.fp = open("movie.json", "w", encoding="utf8")

    def process_item(self, item, spider):
        obj = dict(item)
        string = json.dumps(obj, ensure_ascii=False)
        self.fp.write(string + '\n')
        # print("write succeeded")
        return item

    def close_spider(self, spider):
        self.fp.close()


class MovieMysqlPipeline(object):
    def open_spider(self, spider):
        # read the connection settings from settings.py
        settings = get_project_settings()
        host = settings['DB_HOST']
        port = settings['DB_PORT']
        user = settings['DB_USER']
        pwd = settings['DB_PWD']
        name = settings['DB_NAME']
        charset = settings['DB_CHARSET']
        # connect to the database
        self.conn = pymysql.connect(host=host, port=port, user=user, password=pwd, db=name, charset=charset)

    def process_item(self, item, spider):
        # build the insert statement
        sql = 'insert into movie(post, name, type, director, design, actor, info) values("%s","%s","%s","%s","%s","%s","%s")' % (
            item['post'], item['name'], item['_type'], item['director'], item['design'], item['actor'], item['info'])
        # get a cursor and execute the statement
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # close the database connection
        self.conn.close()
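
The MySQL pipeline above reads its connection parameters through get_project_settings(), and neither pipeline runs unless it is registered under ITEM_PIPELINES, as the boilerplate comment at the top of the file reminds us. A possible settings.py fragment for this project; the DB_* values and the pipeline priorities are assumed examples, not taken from the original post:

# file: settings.py (relevant fragment only; the values below are assumed examples)

BOT_NAME = 'movieproject'

# enable both pipelines; items flow through them in ascending priority order
ITEM_PIPELINES = {
    'movieproject.pipelines.MovieprojectPipeline': 300,
    'movieproject.pipelines.MovieMysqlPipeline': 400,
}

# custom keys read by MovieMysqlPipeline via get_project_settings()
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'root'
DB_PWD = '123456'
DB_NAME = 'movie'
DB_CHARSET = 'utf8'

With the settings in place, the crawl is started as usual with scrapy crawl movie; movie.json and the movie table then receive the merged items.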
