Cookie problem with CrawlSpider, asking the experts for help

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import KejianItem
import re
from lxml import etree
import requests
from scrapy import Request

class A53kejianSpider(CrawlSpider):
    name = ''
    allowed_domains = ['com']
    start_urls = ['http://www.com/']
    rules = (
        Rule(LinkExtractor(allow=r'/\w{2,9}/$'), follow=True),
        Rule(LinkExtractor(allow=r'/\w{2,9}/\d+\.html'), callback='parse_item', follow=False),
    )
    # Override start_requests so the first request carries the cookies
    def start_requests(self):
        cookies = 'ASPSESSIONIDAQSSQBCQ=ANELFNGBHNAFGJGBPCOOHACN; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587718373,1587728873,1587741214,1587773867; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587776002'
        # Split on the first '=' only, so cookie values containing '=' stay whole
        cookies = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookies.split('; ')}
        yield scrapy.Request(url=self.start_urls[0], cookies=cookies, callback=self.parse, dont_filter=True)
    def parse_item(self, response):
        item = KejianItem()
        # Title
        item['title'] = response.xpath('//div[@class="b downinfo"]/h1/text()').get()
        # Content
        content = response.xpath('//div[@class="p20"]/p//text()').getall()
        item['content'] = ''.join(content)
        # Verification code
        codes = response.xpath('//*[@id="container"]/div[1]/div[4]//text()').getall()
        item['code'] = codes[1] + codes[2] + codes[9]
        # Download address:
        origin = response.url
        number = re.findall(r'\d', origin)  # collect the digits (the article id) from the URL
        number = ''.join(number)
        url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
        ### Get the download link
        yield scrapy.Request(url, meta={'item': item}, callback=self.down_url, dont_filter=True)
    def down_url(self, response):
        item = response.meta['item']
        result = response.xpath('//text()').getall()
        result = "".join(result)
        downurl = result[1:]  # download address (drop the leading character)
        item['downurl'] = downurl
        yield item

I overrode start_requests so that the first parse request carries the cookie information, but the download link still cannot be retrieved.
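
A minimal sketch of what I mean, with the same cookies attached explicitly to every request, including the down.asp one, instead of only to the first request (the spider name, domain, and id below are placeholders, not the real values from my project):

# Minimal sketch, not the original spider: the browser cookies are passed on
# every yielded request. Spider name, domain and id are placeholders.
import scrapy

def parse_cookie_string(raw):
    # split on the first '=' only, so values containing '=' stay whole
    return dict(pair.split('=', 1) for pair in raw.split('; '))

class CookieSketchSpider(scrapy.Spider):
    name = 'cookie_sketch'                                          # placeholder
    start_urls = ['http://www.com/']                                # placeholder domain
    raw_cookies = 'ASPSESSIONIDAQSSQBCQ=ANELFNGBHNAFGJGBPCOOHACN'   # placeholder

    def start_requests(self):
        cookies = parse_cookie_string(self.raw_cookies)
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=cookies, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # attach the cookies again on the down.asp request instead of
        # relying on them being carried over from the first request
        down_url = 'http://www.com/plug/down.asp?id=0000&order=0'   # placeholder id
        yield scrapy.Request(down_url, cookies=parse_cookie_string(self.raw_cookies),
                             callback=self.parse_download, dont_filter=True)

    def parse_download(self, response):
        self.logger.info(''.join(response.xpath('//text()').getall()))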

Outside the Scrapy framework, the requests module can fetch the download link:

from lxml import etree
import requests
number = '...'  # the article id extracted from the detail-page URL
url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
cookies = 'ASPSESSIONIDAQSSQBCQ=OIEKFNGBBMBKAOACCCJJCJCK; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587685941,1587718373,1587728873,1587741214; 1Hl5Yp=content%5F9353=6123%3A381; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587742953'
# split on the first '=' only, so values containing '=' stay whole
cookies = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookies.split('; ')}
resp = requests.get(url, cookies=cookies)  # fetch the page source for url
data = etree.HTML(resp.text)
downl = data.xpath('//text()')
print(downl)

Experts, why is this? In Scrapy I am also carrying the cookies, so why can't I get the download address?
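
One thing that should help compare the two is turning on Scrapy's cookie debugging so it logs the Cookie headers it actually sends to down.asp; these are standard Scrapy settings (in the project's settings.py), not project-specific values:

# settings.py (debugging sketch): log the Set-Cookie headers received and
# the Cookie headers sent for every request, including the down.asp one
COOKIES_ENABLED = True
COOKIES_DEBUG = True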
