import scrapy
from scrapy.http import Request
from urllib import parse
from amazonscrapy.items import AmazonscrapyItem
from scrapy.loader import ItemLoader
import datetime


class CheckRankingSpider(scrapy.Spider):
    name = 'check_ranking'
    check_website = "www.amazon.ca"
    key_word = "smart plug"  # search keyword
    allowed_domains = [check_website]
    # URL of the first search-results page for the keyword
    start_urls = ["https://%s/s/ref=nb_sb_noss_2?url=search-alias=aps&field-keywords=%s"
                  % (check_website, key_word.replace(" ", "+"))]
    check_pro_asin = "B074QRB1KB"  # ASIN of the product whose ranking we want to find
    headers = {
        "Host": check_website,
        "Referer": "https://%s" % check_website,
        # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
    }
    def start_requests(self):
        # start crawling from the first page of search results
        for url in self.start_urls:
            yield Request(url, headers=self.headers, callback=self.parse_pro_url, dont_filter=True)
    def parse_pro_url(self, response):
        # collect the product nodes on this search-results page
        pro_nodes = response.css(".s-result-item.celwidget")
        for i, pro_node in enumerate(pro_nodes):
            pro_asin = pro_node.css("::attr(data-asin)").extract_first()
            # check whether this node is the target product's ASIN
            if self.check_pro_asin == pro_asin:
                check_pro_url = pro_node.css("a.s-access-detail-page::attr(href)").extract_first()
                # position of the product within this page
                ranking_position = i + 1
                # number of the current results page
                ranking_page = response.css(".pagnCur::text").extract_first()
                print("Product found")
                yield Request(parse.urljoin(response.url, check_pro_url),
                              headers=self.headers, callback=self.parse_pro_detail,
                              meta={"ranking_position": ranking_position,
                                    "ranking_page": ranking_page,
                                    "search_url": response.url})
                return
        # not found on this page: follow the link to the next results page
        page_next_url = response.css("#pagnNextLink::attr(href)").extract_first("")
        if page_next_url:
            yield Request(url=parse.urljoin(response.url, page_next_url),
                          headers=self.headers, callback=self.parse_pro_url)
    # extract the target product's info (title, price, review count, question count, url)
    # and load it into the item
    def parse_pro_detail(self, response):
        pro_item_loader = ItemLoader(item=AmazonscrapyItem(), response=response)
        search_url = response.meta.get("search_url", "")
        pro_item_loader.add_value("search_url", search_url)
        pro_item_loader.add_css("title", "#productTitle::text")
        pro_item_loader.add_value("url", response.url)
        # review rating; default to 0 when missing
        if response.css("#acrPopover::attr(title)"):
            pro_item_loader.add_css("review_point", "#acrPopover::attr(title)")
        else:
            pro_item_loader.add_value("review_point", 0)
        # number of customer reviews; default to 0 when missing
        if response.css("#acrCustomerReviewText::text"):
            pro_item_loader.add_css("review_num", "#acrCustomerReviewText::text")
        else:
            pro_item_loader.add_value("review_num", 0)
        # number of answered questions; default to 0 when missing
        if response.css("#askATFLink span::text"):
            pro_item_loader.add_css("question_num", "#askATFLink span::text")
        else:
            pro_item_loader.add_value("question_num", 0)
        pro_item_loader.add_value("asin", self.check_pro_asin)
        # regular price or sale price, whichever is present
        pro_item_loader.add_xpath("price", "//span[@id='priceblock_ourprice']/text()|//span[@id='priceblock_saleprice']/text()")
        position = response.meta.get("ranking_position", "")
        pro_item_loader.add_value("position", position)
        page = response.meta.get("ranking_page", "")
        pro_item_loader.add_value("page", page)
        pro_item_loader.add_value("check_date", datetime.datetime.now().date().strftime("%Y-%m-%d"))
        pro_item = pro_item_loader.load_item()
        yield pro_item
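
The spider loads everything into AmazonscrapyItem, which it imports from amazonscrapy.items but which is not shown here. Below is a minimal sketch of what that item class could look like: the field names are taken directly from the loader calls above, while the TakeFirst/MapCompose processors and the strip_text helper are assumptions added so that each field ends up as a single cleaned value rather than a list.

# amazonscrapy/items.py -- a sketch, not the original file; only the field
# names come from the spider above, the processors are an assumption
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst


def strip_text(value):
    # trim surrounding whitespace from scraped strings
    return value.strip() if isinstance(value, str) else value


class AmazonscrapyItem(scrapy.Item):
    search_url = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(input_processor=MapCompose(strip_text),
                         output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    review_point = scrapy.Field(output_processor=TakeFirst())
    review_num = scrapy.Field(output_processor=TakeFirst())
    question_num = scrapy.Field(output_processor=TakeFirst())
    asin = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(strip_text),
                         output_processor=TakeFirst())
    position = scrapy.Field(output_processor=TakeFirst())
    page = scrapy.Field(output_processor=TakeFirst())
    check_date = scrapy.Field(output_processor=TakeFirst())

With the item in place, the crawl can be run from the project root with `scrapy crawl check_ranking -o ranking.csv`. Since the User-Agent header is commented out in the spider, it is worth setting USER_AGENT (and a conservative DOWNLOAD_DELAY) in settings.py, otherwise Amazon is likely to answer with captcha or robot-check pages.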