1、介绍
请求传参
- 使用场景:如果爬取解析的数据不在同一张页面中。(需要进行深度爬取)
- 需求:爬取海外网中的新闻标题和新闻详细内容
http://hk.haiwainet.cn/news/
【前置准备】
构建scrapy框架工程文件,设置完settings.py
配置信息
2、分析
(1)获取单页面情况
打开网页,获取新闻标题和新闻详细内容
首先,获取各标签下的新闻标题,xpath:
/html/body/div[2]/div[3]/div[1]/ul/a/text()
然后,获取详情页的链接地址,xpath:
/html/body/div[2]/div[3]/div[1]/ul/a/@href
之后进入详情页,获取详情内容
获取页面内容:
//*[@id="cen"]//text()
- 编写代码
import scrapy
class BossSpider(scrapy.Spider):
    """First version: crawl the news index, then follow every article
    link and print the full article text (no items/pipelines yet)."""
    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    def parse_detail(self, response):
        # Grab every text node inside the article container and flatten
        # the fragments into one string before printing.
        fragments = response.xpath('//*[@id="cen"]//text()').extract()
        print(''.join(fragments))

    def parse(self, response):
        # Each <ul> on the index page groups one news category.
        for category in response.xpath('/html/body/div[2]/div[3]/div[1]/ul'):
            for entry in category.xpath('./li'):
                title = entry.xpath('./a/text()').extract_first()
                print(title)
                link = entry.xpath('./a/@href').extract_first()
                # Manually issue a second request for the detail page.
                yield scrapy.Request(link, callback=self.parse_detail)
(2)请求传参
settings.py
# Enable the project's item pipeline so yielded items are delivered to
# BossproPipeline.process_item. The value (0-1000) is the pipeline's
# execution priority: lower numbers run first.
ITEM_PIPELINES = {
'bossPro.pipelines.BossproPipeline': 300,
}
items.py
import scrapy
class BossproItem(scrapy.Item):
    """Container for one scraped news article."""
    # Headline text taken from the index-page <a> tag.
    news_name = scrapy.Field()
    # Full article body collected from the detail page.
    detail_page = scrapy.Field()
boss.py
import scrapy
from bossPro.items import BossproItem
class BossSpider(scrapy.Spider):
    """Second version: same crawl, but the title and body are bundled
    into a BossproItem; the half-filled item travels from parse() to
    parse_detail() through Request.meta."""
    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    # Callback that receives the item started in parse().
    def parse_detail(self, response):
        item = response.meta['item']
        fragments = response.xpath('//*[@id="cen"]//text()').extract()
        body_text = ''.join(fragments)
        item['detail_page'] = body_text
        print(body_text)
        # Hand the completed item over to the pipeline.
        yield item

    def parse(self, response):
        for category in response.xpath('/html/body/div[2]/div[3]/div[1]/ul'):
            for entry in category.xpath('./li'):
                item = BossproItem()
                title = entry.xpath('./a/text()').extract_first()
                item['news_name'] = title
                print(title)
                link = entry.xpath('./a/@href').extract_first()
                # Manual request for the detail page; the meta dict is
                # forwarded to the callback as response.meta.
                yield scrapy.Request(link, callback=self.parse_detail,
                                     meta={'item': item})
pipelines.py
class BossproPipeline:
    """Terminal pipeline: dump each received item to stdout.

    Returning the item keeps it flowing to any lower-priority pipelines.
    """

    def process_item(self, item, spider):
        print(item)
        return item
(3)获取多页信息
import scrapy
from bossPro.items import BossproItem
class BossSpider(scrapy.Spider):
    """Third version: adds pagination on top of the meta-passing crawl.

    parse() scrapes one index page, follows every article link with
    parse_detail as callback, then requests the next index page with
    itself as the callback until ``max_page`` is reached.
    """
    name = 'boss'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']
    # %-template for the paginated index pages (page 2 onwards).
    url = 'http://hk.haiwainet.cn/news/%d.html'
    page_num = 2  # next index page to request
    # Last index page to crawl; was a hard-coded `<= 3` — now a class
    # attribute so the crawl depth can be tuned without editing parse().
    max_page = 3

    # Callback that receives the item started in parse().
    def parse_detail(self, response):
        """Fill in the article body on the item passed via Request.meta."""
        item = response.meta['item']
        detail_page = ''.join(
            response.xpath('//*[@id="cen"]//text()').extract())
        item['detail_page'] = detail_page
        print(detail_page)
        yield item

    # Parse the titles on an index page.
    def parse(self, response):
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')
        for ul in ul_list:
            for li in ul.xpath('./li'):
                item = BossproItem()
                news_name = li.xpath('./a/text()').extract_first()
                item['news_name'] = news_name
                print(news_name)
                detail_url = li.xpath('./a/@href').extract_first()
                # Manual request for the detail page; meta={} carries the
                # half-filled item into parse_detail's response.meta.
                yield scrapy.Request(detail_url, callback=self.parse_detail,
                                     meta={'item': item})
        # Pagination: request the next index page, re-entering parse().
        if self.page_num <= self.max_page:
            # Fix: dropped the redundant builtin format() wrapper — the
            # %-interpolation already yields the final URL string.
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)