# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import KejianItem
import re
from lxml import etree
import requests
from scrapy import Request
class A53kejianSpider(CrawlSpider):
    """Crawl the courseware site: follow category pages, scrape each detail
    page into a KejianItem, then chain a request to the site's down.asp
    endpoint to resolve the real download link.
    """
    # TODO(review): spider name is empty — `scrapy crawl <name>` needs a
    # non-empty, unique name (e.g. '53kejian'); confirm intended value.
    name = ''
    allowed_domains = ['com']
    start_urls = ['http://www.com/']
    rules = (
        # Category index pages such as /abc/ — follow, no callback.
        Rule(LinkExtractor(allow=r'/\w{2,9}/$'), follow=True),
        # Detail pages such as /abc/123.html — dot escaped so the pattern
        # only matches a literal ".html" (the original '.' matched any char).
        Rule(LinkExtractor(allow=r'/\w{2,9}/\d+\.html'), callback='parse_item', follow=False),
    )

    def start_requests(self):
        """Override start_requests so the seed request carries session cookies.

        The raw Cookie header string is parsed into a dict; dont_filter=True
        keeps the dupe filter from dropping the seed URL. The callback is the
        inherited CrawlSpider.parse so the rules above still apply.
        """
        raw = 'ASPSESSIONIDAQSSQBCQ=ANELFNGBHNAFGJGBPCOOHACN; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587718373,1587728873,1587741214,1587773867; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587776002'
        # split('=', 1): cookie VALUES may legally contain '=', so only the
        # first '=' separates name from value.
        cookies = {pair.split('=', 1)[0]: pair.split('=', 1)[1] for pair in raw.split('; ')}
        yield scrapy.Request(url=self.start_urls[0], cookies=cookies,
                             callback=self.parse, dont_filter=True)

    def parse_item(self, response):
        """Parse one detail page into a KejianItem and chain to the
        download-link endpoint, passing the partial item via meta."""
        item = KejianItem()
        # Title
        item['title'] = response.xpath('//div[@class="b downinfo"]/h1/text()').get()
        # Body text
        content = response.xpath('//div[@class ="p20"]/p//text()').getall()
        item['content'] = ''.join(content)
        # Captcha fragments — positions 1, 2 and 9 are fixed by this site's
        # markup; will raise IndexError if the layout changes. TODO confirm.
        codes = response.xpath('//*[@id="container"]/div[1]/div[4]//text()').getall()
        item['code'] = codes[1] + codes[2] + codes[9]
        # The down.asp endpoint is keyed by the digits of the article URL.
        number = ''.join(re.findall(r'\d', response.url))
        # BUG FIX: the original used a full-width '。' ("www。com"), which is
        # an invalid hostname — the download request could never succeed.
        url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
        # Fetch the download link page; dont_filter because the URL pattern
        # would otherwise be deduplicated across items.
        yield scrapy.Request(url, meta={'item': item}, callback=self.down_url, dont_filter=True)

    def down_url(self, response):
        """Extract the download URL from the down.asp response and yield the
        completed item."""
        item = response.meta['item']
        result = ''.join(response.xpath('//text()').getall())
        # First character is a junk prefix on this endpoint — TODO confirm.
        item['downurl'] = result[1:]
        yield item
重写了start_requests，让parse请求中带了cookies信息，但是下载链接还是不能获取。
不在scrapy框架中、单独使用requests模块时，可以获取下载链接：
from lxml import etree
import requests
url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
cookies ='ASPSESSIONIDAQSSQBCQ=OIEKFNGBBMBKAOACCCJJCJCK; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587685941,1587718373,1587728873,1587741214; 1Hl5Yp=content%5F9353=6123%3A381; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587742953'
cookies = {i.split('=')[0]: i.split('=')[1] for i in cookies.split('; ')}
resp = requests.get(url,cookies=cookies) # 获取 url 网页源码
data = etree.HTML(resp.text)
downl =data.xpath('//text()')
print(downl)
大神,这是为什么?在scrapy中,我也携带了cookie,但是为什么不能获取下载地址