Scraping Amazon Keyword-Search Product Rankings

This Scrapy spider pages through Amazon.ca search results for a given keyword, locates a target product by its ASIN, and records its rank (results page and position) along with the listing's title, price, review count, and question count.

import scrapy
from scrapy.http import Request
from urllib import parse
from amazonscrapy.items import AmazonscrapyItem
from scrapy.loader import ItemLoader
import datetime

class CheckRankingSpider(scrapy.Spider):
    name = 'check_ranking'
    check_website = "www.amazon.ca"
    key_word = "samrt plug"#关键字
    allowed_domains = [check_website]
    start_urls = ["https://%s/s/ref=nb_sb_noss_2?url=search-alias=aps&field-keywords=%s"%(check_website,key_word.replace(" ","+"))]#搜索页url
    check_pro_asin = "B074QRB1KB"#查找产品asin码
    headers = {
        "Host": check_website,
        "Referer":  "https://%s"%check_website,
        # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" 
    }

    def start_requests(self):
        # Start crawling from the first page of search results.
        # (The original used `return [Request(...)]` inside the loop, which
        # only ever issued a request for the first URL; yield each instead.)
        for url in self.start_urls:
            yield Request(url, headers=self.headers, callback=self.parse_pro_url, dont_filter=True)

    def parse_pro_url(self, response):
        # Grab the product nodes on the current results page
        pro_nodes = response.css(".s-result-item.celwidget")
        for i, pro_node in enumerate(pro_nodes):
            pro_asin = pro_node.css("::attr(data-asin)").extract_first()
            # Is this the target product's ASIN?
            if self.check_pro_asin == pro_asin:
                check_pro_url = pro_node.css("a.s-access-detail-page::attr(href)").extract_first()
                # Position within the current page (1-based)
                ranking_position = i + 1
                # Page number of the current results page
                ranking_page = response.css(".pagnCur::text").extract_first()
                self.logger.info("Target product found")
                yield Request(check_pro_url, headers=self.headers, callback=self.parse_pro_detail,
                              meta={"ranking_position": ranking_position,
                                    "ranking_page": ranking_page,
                                    "search_url": response.url})
                return  # stop paginating once the product is found
        # Not found on this page: follow the link to the next results page
        page_next_url = response.css("#pagnNextLink::attr(href)").extract_first("")
        if page_next_url:
            yield Request(url=parse.urljoin(response.url, page_next_url),
                          headers=self.headers, callback=self.parse_pro_url)

    # Extract the target product's details (title, price, review count, question count, URL) into the item
    def parse_pro_detail(self, response):
        pro_item_loader = ItemLoader(item=AmazonscrapyItem(), response=response)
        search_url = response.meta.get("search_url", "")
        pro_item_loader.add_value("search_url", search_url)
        pro_item_loader.add_css("title", "#productTitle::text")
        pro_item_loader.add_value("url", response.url)
        # Rating, review count, and question count can be missing; fall back to 0
        if response.css("#acrPopover::attr(title)"):
            pro_item_loader.add_css("review_point", "#acrPopover::attr(title)")
        else:
            pro_item_loader.add_value("review_point", 0)
        if response.css("#acrCustomerReviewText::text"):
            pro_item_loader.add_css("review_num", "#acrCustomerReviewText::text")
        else:
            pro_item_loader.add_value("review_num", 0)
        if response.css("#askATFLink span::text"):
            pro_item_loader.add_css("question_num", "#askATFLink span::text")
        else:
            pro_item_loader.add_value("question_num", 0)
        pro_item_loader.add_value("asin", self.check_pro_asin)
        # Regular price or sale price, whichever the page shows
        pro_item_loader.add_xpath("price", "//span[@id='priceblock_ourprice']/text()|//span[@id='priceblock_saleprice']/text()")
        position = response.meta.get("ranking_position", "")
        pro_item_loader.add_value("position", position)
        page = response.meta.get("ranking_page", "")
        pro_item_loader.add_value("page", page)
        pro_item_loader.add_value("check_date", datetime.datetime.now().date().strftime("%Y-%m-%d"))
        pro_item = pro_item_loader.load_item()
        yield pro_item
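
The spider imports AmazonscrapyItem from amazonscrapy.items, which is not shown in the post. Below is a minimal sketch of what that items.py could look like: the field names are taken from the add_value/add_css calls above, while the TakeFirst output processors and the extract_number helper are illustrative assumptions for turning Amazon's display strings (e.g. "1,234 customer reviews", "4.5 out of 5 stars") into plain numbers, not the author's actual code.

# items.py (sketch; field set inferred from the spider, processors are assumptions)
import re

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst


def extract_number(value):
    # Pull the first number out of strings like "1,234 customer reviews"
    # or "4.5 out of 5 stars"; pass the numeric fallbacks (0) through unchanged.
    if isinstance(value, (int, float)):
        return value
    match = re.search(r"[\d,.]+", value)
    return float(match.group().replace(",", "")) if match else 0


class AmazonscrapyItem(scrapy.Item):
    search_url = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    review_point = scrapy.Field(input_processor=MapCompose(extract_number), output_processor=TakeFirst())
    review_num = scrapy.Field(input_processor=MapCompose(extract_number), output_processor=TakeFirst())
    question_num = scrapy.Field(input_processor=MapCompose(extract_number), output_processor=TakeFirst())
    asin = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    position = scrapy.Field(output_processor=TakeFirst())
    page = scrapy.Field(output_processor=TakeFirst())
    check_date = scrapy.Field(output_processor=TakeFirst())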
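One practical note: the User-Agent line in the spider's headers dict is commented out, but Amazon tends to block Scrapy's default user agent quickly, and its robots.txt typically disallows the search paths this spider crawls. A settings.py sketch along these lines (the values are illustrative assumptions, not the author's configuration) makes the crawl workable:

# settings.py (sketch; values are illustrative)
BOT_NAME = "amazonscrapy"

# Identify as a regular browser; Amazon tends to block Scrapy's default UA.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"

# Amazon's robots.txt typically disallows these search paths,
# so the spider will not run with robots.txt enforcement on.
ROBOTSTXT_OBEY = False

# Throttle requests to reduce the chance of captchas and bans.
DOWNLOAD_DELAY = 2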
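To run the spider, use the Scrapy CLI from the project root (scrapy crawl check_ranking, optionally with -o ranking.json to save the scraped items), or drive it from a script as in this sketch:

# run.py (sketch): run the spider programmatically from the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("check_ranking")  # the spider's `name` attribute
process.start()                 # blocks until the crawl finishes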