Scrapy 框架 手动发送请求 POST 请求的发送

 

手动发送请求

import scrapy

from choutiSpider.items import ChoutispiderItem


class ChoutiSpider(scrapy.Spider):
    """Crawl the 'scoff' hot list, yielding one item per entry, and paginate
    by manually issuing a Request for each subsequent page (up to page 120)."""

    name = 'chouti'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.****.com/r/scoff/hot/1']
    # URL template used to build the next page's address
    url = 'https://dig.****.com/r/scoff/hot/%s'
    page_num = 1

    def parse(self, response):
        rows = response.xpath('//div[@id="content-list"]/div')
        # print(rows)
        print(self.page_num)
        for row in rows:
            text = row.xpath('./div[@class="news-content"]/div[1]/a/text()').extract_first().strip()
            poster = row.xpath('./div[@class="news-content"]/div[2]/a[4]/b/text()').extract_first()
            # print(text, poster)
            item = ChoutispiderItem()
            item['author'] = poster
            item['content'] = text
            # Hand the populated item to the pipeline.
            yield item
        # Manual pagination: schedule the next page until the limit is hit.
        if self.page_num < 120:
            self.page_num += 1
            next_page = self.url % self.page_num
            yield scrapy.Request(url=next_page, callback=self.parse)

POST 请求发送

# 在scrapy框架中默认情况下cookie会被自动处理,无需手动!

class PostdemoSpider(scrapy.Spider):
    """Demonstrate sending a POST request by overriding start_requests and
    yielding a FormRequest instead of the default GET."""

    name = 'postdemo'
    allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.****.com/sug']

    def start_requests(self):
        # Override the parent class's request-issuing hook so each start URL
        # is fetched with form data (a POST) rather than a plain GET.
        for url in self.start_urls:
            payload = {
                'kw': 'cat'
            }
            yield scrapy.FormRequest(url=url, formdata=payload, callback=self.parse)

    def parse(self, response):
        print(response.text)
在scrapy框架中默认情况下cookie会被自动处理,无需手动!

settings 配置:

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

请求传参:

  • 二级详情页面 的 item 传递
import scrapy

from boosPro.items import BoosproItem


class BoosSpider(scrapy.Spider):
    """Scrape a job-listing site: collect title/salary/company from list pages,
    then follow each detail URL, passing the partially-filled item via meta."""

    name = 'boos'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1']
    url = 'https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=%s'
    page_num = 1

    def parse(self, response):
        for entry in response.xpath('//div[@class="job-list"]/ul/li'):
            item = BoosproItem()
            job_title = entry.xpath('.//div[@class="job-title"]/text()').extract_first()
            # Salary for the posting
            pay = entry.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            # Employer name
            employer = entry.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
            detail_url = 'https://www.zhipin.com' + entry.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
            item['title'] = job_title
            item['salary'] = pay
            item['company'] = employer
            # Manually request the detail page, carrying the item along in meta.
            yield scrapy.Request(url=detail_url, callback=self.parsrDetail, meta={'item': item})
        # Manual pagination over the first few list pages.
        if self.page_num <= 3:
            self.page_num += 1
            newUrl = self.url % self.page_num
            yield scrapy.Request(url=newUrl, callback=self.parse)

    # NOTE(review): name looks like a typo of "parseDetail"; kept as-is since
    # renaming would change the class's public interface.
    def parsrDetail(self, response):
        """Parse the detail page and complete the item received via meta."""
        item = response.meta['item']
        desc_parts = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        company_content = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[2]/div/text()').extract_first()
        item['job_desc'] = ' '.join(desc_parts)
        item['company_content'] = company_content
        # print(item['job_desc'], 1111111)
        yield item

转载于:https://www.cnblogs.com/ellisonzhang/p/11113273.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值