Scraping product information from an e-commerce site with the Scrapy framework

Project scenario:

Crawling the Suning.com (苏宁易购) computer product listings (desktops, laptops, and tablets)
as well as the review content for each product.


Framework used: Scrapy

Data is stored in MongoDB. The most important part of the crawl is analyzing the page request URLs; only once the URLs are understood can the desired information be extracted.

Page URL analysis

URL of the computer product list:
https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp=0&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&paging=0&sub=1&jzq=40634

Two parameters in this URL deserve attention (see the short URL-building sketch after this list):

  1. cp is the page number, i.e. cp=0,1,2,3,...

  2. paging is the scroll-down (lazy-load) request. Each page of the product list initially loads 30 products and loads more as you scroll; paging normally runs up to 3, which together makes up one full page (during analysis it turned out that paging can actually take values 0~198 with no duplicate products).
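
To make the relationship between cp and paging concrete, here is a minimal URL-building sketch (the helper name build_list_url is mine; the fixed query parameters are copied from the URL above and may well change over time):

from urllib.parse import urlencode

def build_list_url(cp: int, paging: int) -> str:
    """Build a Suning computer-list URL for page `cp` and scroll chunk `paging`."""
    base = "https://list.suning.com/emall/searchV1Product.do"
    params = {
        "ci": 258003,        # category id (computers)
        "pg": "03",
        "yjhx": "",
        "cp": cp,            # page number: 0, 1, 2, ...
        "il": 0, "st": 0, "iy": 0, "isNoResult": 0, "n": 1,
        "sesab": "ACBAABC",
        "id": "IDENTIFYING",
        "cc": 773,
        "paging": paging,    # scroll chunk within the page, normally 0..3
        "sub": 1,
        "jzq": 40634,
    }
    return base + "?" + urlencode(params)

# first page, second scroll chunk
print(build_list_url(cp=0, paging=1))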


settings.py file

DOWNLOAD_DELAY = 3  # download delay in seconds

DEFAULT_REQUEST_HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
}

SPIDER_MIDDLEWARES = {
    # 'datasetSpider.middlewares.DatasetspiderSpiderMiddleware': 543,

    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,  # disable the offsite filter, which otherwise logs: [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'product.suning.com'
}
ITEM_PIPELINES = {
    'datasetSpider.pipelines.SuComputerPipeline': 300,
}
# MongoDB
MONGODB_HOST = "localhost"
MONGODB_PORT = 27017
MONGODB_DBNAME = 'suning'
MONGODB_COMPUTER_LIST = 'su_computer_list1'
MONGODB_COMPUTER_REVIEW = 'su_computer_review1'
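
Before running the spider, it can be worth confirming that the MongoDB settings above actually connect (a standalone sketch, assuming a local mongod on the default port; the database and collection names are the ones configured above):

import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
db = client["suning"]
print(db.list_collection_names())  # should not raise once mongod is running
client.close()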


items.py file

# Suning computer product item
class SuNingComputer(scrapy.Item):
    ProductId = scrapy.Field()
    ProductName = scrapy.Field()
    ProductDescription = scrapy.Field()
    ProductUrl = scrapy.Field()
    ProductCategories = scrapy.Field()
    ProductPrice = scrapy.Field()
    StoreName = scrapy.Field()
    ProductParameter = scrapy.Field()

    ShopId = scrapy.Field()
# Suning product review item
class SuNingComuputerReview(scrapy.Item):
    ProductId = scrapy.Field()
    ReviewId = scrapy.Field()
    ReviewEr = scrapy.Field()
    ReviewContent = scrapy.Field()
    ReviewRating = scrapy.Field()  # star rating
    ReviewHelful = scrapy.Field()  # number of helpful votes
    ReviewTime = scrapy.Field()
The items file defines two classes: SuNingComputer is the item for entries in the computer product list, and SuNingComuputerReview is the item for the reviews of each computer product.
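
Scrapy items behave like dictionaries, so populating and reading them looks like this (a minimal illustration only, not part of the spider; the field values are made up):

from datasetSpider.items import SuNingComputer

item = SuNingComputer()
item["ProductId"] = "000000000"        # hypothetical id, for illustration
item["ProductName"] = "example laptop"
print(dict(item))                      # items convert cleanly to dicts for MongoDB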

pipelines.py file

import json
from datasetSpider.items import SuNingComputer, SuNingComuputerReview
import pymongo
from datasetSpider import settings

# pipeline for the Suning computer product-list items and review items
class SuComputerPipeline(object):
    def __init__(self):
        #self.filename = open("suComputer.json", "ab+")
        #self.filereview = open("suComputerreview.json", "ab+")
        pass

    def open_spider(self, spider):
        # read the connection parameters from settings
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DBNAME
        computer_list = settings.MONGODB_COMPUTER_LIST
        computer_review = settings.MONGODB_COMPUTER_REVIEW
        # connect to MongoDB
        self.mongodb_client = pymongo.MongoClient(host=host, port=port)
        self.db = self.mongodb_client[dbname]
        self.db_su_computer_list = self.db[computer_list]
        self.db_su_computer_review = self.db[computer_review]

    def process_item(self, item, spider):
        insert_mongo = dict(item)
        #jsonText = json.dumps(dict(item), ensure_ascii=False) + "\n"

        if isinstance(item, SuNingComputer):  # item yielded for a product-list entry
            #self.filename.write(jsonText.encode("utf-8"))
            # insert into MongoDB (insert_one is the supported pymongo API; insert() is deprecated)
            self.db_su_computer_list.insert_one(insert_mongo)

        elif isinstance(item, SuNingComuputerReview):  # item yielded for a review
            #self.filereview.write(jsonText.encode("utf-8"))
            # insert into MongoDB
            self.db_su_computer_review.insert_one(insert_mongo)

        return item

    def close_spider(self, spider):
        #self.filename.close()
        #self.filereview.close()
        self.mongodb_client.close()
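
The pipeline above reads its configuration by importing the settings module directly. Scrapy also supports passing settings in through from_crawler, which avoids the hard import; a sketch of that alternative (not the code used in this project) could look like:

import pymongo

class SuComputerMongoPipeline(object):
    def __init__(self, host, port, dbname):
        self.host = host
        self.port = port
        self.dbname = dbname

    @classmethod
    def from_crawler(cls, crawler):
        # read the same keys defined in settings.py
        return cls(
            host=crawler.settings.get("MONGODB_HOST"),
            port=crawler.settings.getint("MONGODB_PORT"),
            dbname=crawler.settings.get("MONGODB_DBNAME"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.db = self.client[self.dbname]

    def close_spider(self, spider):
        self.client.close()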

computer.py — the spider logic

import re
import scrapy
import datasetSpider
from datasetSpider.items import SuNingComputer,SuNingComuputerReview
import json
import requests


class ComputerSpider(scrapy.Spider):

    name = 'computer'
    allowed_domains = ['suning.com']  # bare domain names only; full URLs do not work here
    # start_urls = ['https://list.suning.com/0-258003-0.html?safp=d488778a.diannao.42212715284.1&safc=cate.0.0&safpn=10003.00005']
    start_urls = ['https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp=0&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&sub=0&jzq=40609']
    # URL fragments for the scroll-triggered (paging) requests
    page_num = 0
    same_page_url = "https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp="
    same_page_url_center = "&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&paging="
    same_page_url_tail = "&sub=1&jzq=40634"

    next_page_num = 15
    next_page_url = "https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp="
    next_page_url_tail = "&il=0&st=0&iy=0&adNumber=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&sub=1&jzq=40610"

    def parse(self, response):
        product_list=response.xpath("//div[@class='title-selling-point']")
        # print(review_list)
        for each in product_list:
            # product URL taken from the listing entry
            product_url = each.xpath("./a/@href").extract()[0]
            nextUrl = 'https:'+product_url
            product_data=eval(each.xpath("./a/@sa-data").extract()[0])
            # print(product_data)
            ProductId = product_data["prdid"]
            ShopId = product_data["shopid"]

            # request the product detail page
            yield scrapy.Request(url=nextUrl,meta={'ProductId':ProductId,'ShopId':ShopId,'ProductUrl':nextUrl},callback=self.productDetail)
        #     save only the product url via the pipeline:
        #     su_item = SuNingComputer()
        #     su_item['ProductUrl'] = nextUrl
        #     yield su_item

        # handle the scroll-loaded (paging) requests of the current page
        print("page_num:")
        print(self.page_num)
        if self.page_num < 4:
            yield scrapy.Request(self.same_page_url + str(self.next_page_num) + self.same_page_url_center + str(self.page_num) + self.same_page_url_tail, callback=self.parse)
            self.page_num += 1

        # move on to the next page
        if self.page_num >= 4:
            self.page_num = 0
            if self.next_page_num <= 50:
                yield scrapy.Request(self.next_page_url + str(self.next_page_num) + self.next_page_url_tail,callback=self.parse)
                self.next_page_num += 1

    # parse the product detail page
    def productDetail(self,response):
        get_meta = response.meta
        # product id
        product_id = get_meta['ProductId']
        # product url
        product_url = get_meta['ProductUrl']
        # shop id
        shop_id = get_meta['ShopId']
        # product description
        product_description = response.xpath("//meta[@name='description']/@content").extract()[0]
        # # store name (alternative xpath, not used)
        # product_store_name=response.xpath("//a[@id='chead_indexUrl']/@title").extract()[0]
        # # product category (alternative xpath, not used)
        # product_category=response.xpath("//table[@id='bzqd_tag']/tbody/tr[2]/td[2]/text()").extract()[0]
        # product parameter table
        product_parameter=response.xpath("//table[@id='itemParameter']/tbody/tr")
        parameter_dict={}
        for i in product_parameter:
            # print(i)
            text1=i.xpath("./td[1]/div/span/text()").extract()
            text2=i.xpath("./td[2]/text()").extract()
            text3=i.xpath("./td[2]/a/text()").extract()

            punctuation = '!,;:?".\''
            if len(text1) != 0 and len(text2) != 0:
                # print(text1)
                key1 = re.sub(r'[{}]+'.format(punctuation), "_", text1[0])  # keys must not contain '.', otherwise the document cannot be saved to MongoDB
                # print(key1)
                parameter_dict[key1] = text2[0]
            if len(text1) != 0 and len(text3) != 0:
                key2 = text1[0]
                parameter_dict[key2] = text3[0]
        # print(parameter_dict)

        # extract the review cluster_id
        script = response.xpath("//script[@type='text/javascript']/text()").extract()  # the inline script blocks carry a lot of useful product metadata
        cluster = re.findall(r'\"clusterId\":\".*?\"', script[0])
        cluster_id = json.loads("{"+cluster[0]+"}")['clusterId']
        # product name
        product_display = re.findall(r'\"itemDisplayName\":\".*?\"', script[0])
        product_name = json.loads("{" + product_display[0] + "}")['itemDisplayName']
        # store name
        flagshipName = re.findall(r'\"flagshipName\":\".*?\"', script[0])
        product_store_name = json.loads("{" + flagshipName[0] + "}")['flagshipName']
        # print(product_store_name)
        # product categories
        product_category = {}
        category = re.findall(r'\"categoryName\d\":\".*?\"', script[0])
        categoryName = {}
        for it in category:
            categoryName.update(json.loads("{"+it+"}"))
        product_category['0'] = categoryName['categoryName1']
        product_category['1'] = categoryName['categoryName2']
        product_category['2'] = categoryName['categoryName3']

        brandName = re.findall(r'\"brandName\":\".*?\"', script[0])
        itemDisplayName = re.findall(r'\"itemDisplayName\":\".*?\"', script[0])

        brandName = json.loads("{"+brandName[0]+"}")
        itemDisplayName = json.loads("{"+itemDisplayName[0]+"}")

        product_category['3'] = brandName['brandName']
        product_category['4'] = itemDisplayName['itemDisplayName']
        # print(product_category)

        # headers = datasetSpider.settings.DEFAULT_REQUEST_HEADERS
        price_url = "https://icps.suning.com/icps-web/getVarnishAllPriceNoCache/0000000" + str(product_id) + "_773_7730199_" + str(shop_id) + "_1_getClusterPrice.jsonp"
        # res = requests.get(price_url, headers=headers)
        meta = {"ProductId":product_id,"ShopId":shop_id,"ProductUrl":product_url,
                "ProductName":product_name,"ProductDescription":product_description,
                "ProductCategories":product_category,"StoreName":product_store_name,
                "ProductParameter":parameter_dict,"cluster_id":cluster_id}
        yield scrapy.Request(url=price_url,meta=meta,callback=self.Price)



    # parse the price response
    def Price(self,response):
        get_meta = response.meta
        price_dirt = re.findall(r'[(](.*)[)]', response.text)[0]
        price_dirt = json.loads(price_dirt)[0]
        # base product price
        product_price = price_dirt['price']
        # print(product_price)
        product_id = get_meta["ProductId"]
        shop_id = get_meta["ShopId"]
        product_url = get_meta["ProductUrl"]
        product_name = get_meta["ProductName"]
        product_description = get_meta["ProductDescription"]
        product_category = get_meta["ProductCategories"]
        product_store_name = get_meta["StoreName"]
        parameter_dict = get_meta["ProductParameter"]
        cluster_id = get_meta["cluster_id"]
        # hand the item off to the pipeline
        su_item = SuNingComputer()
        su_item["ProductId"] = product_id
        su_item["ShopId"] = shop_id
        su_item['ProductUrl'] = product_url
        su_item["ProductName"] = product_name
        su_item["ProductDescription"] = product_description
        su_item["ProductCategories"] = product_category
        su_item["ProductPrice"] = product_price
        su_item["StoreName"] = product_store_name
        su_item["ProductParameter"] = parameter_dict
        yield su_item
        # print(su_item)

        # reviews
        total_review_url = "https://review.suning.com/ajax/cluster_review_satisfy/cluster-" + str(cluster_id) + "-0000000" + str(product_id) + "-" + str(shop_id) + "-----satisfy.htm"
        yield scrapy.Request(url=total_review_url,meta={"product_id": product_id, "shop_id": shop_id, "cluster_id": cluster_id},callback=self.Review)
    # schedule requests for the review-list pages
    def Review(self,response):
        get_meta = response.meta
        cluster_id = get_meta["cluster_id"]
        product_id = get_meta["product_id"]
        shop_id = get_meta["shop_id"]
        # at most 50 pages of reviews are accessible
        for i in range(1,51):
        # review_url = "https://review.suning.com/ajax/cluster_review_lists/cluster-"+str(cluster_id)+"-0000000"+str(product_id)+"-"+str(shop_id)+"-total-1-default-10-----reviewList.htm"
            review_url = "https://review.suning.com/ajax/cluster_review_lists/cluster-"+str(cluster_id)+"-0000000"+str(product_id)+"-"+str(shop_id)+"-total-"+str(i)+"-default-10-----reviewList.htm"
            yield scrapy.Request(url=review_url,meta={"product_id":product_id},callback=self.ReviewList)

    # parse one page of reviews
    def ReviewList(self,response):
        # print(response.text)
        get_meta = response.meta
        product_id = get_meta["product_id"]
        # review_text = json.loads(response.text)
        review_list = re.findall(r'[(](.*)[)]', response.text)  # greedy match so everything between the outer parentheses is captured
        # review_text = response.text
        review_text = ''.join(review_list)  # join the matches into a single string
        # print(review_text)
        review_dirt = json.loads(review_text)
        # make sure the response actually contains commodityReviews
        if "commodityReviews" in review_dirt:
            commodityReviews = review_dirt['commodityReviews']
            for item in commodityReviews:
                # print(item)
                commodityReviewId = item['commodityReviewId']
                content = item['content']
                qualityStar = item['qualityStar']
                publishTime = item['publishTime']
                userInfo = item['userInfo']
                nickName = userInfo['nickName']
                # number of helpful votes on this review
                usefulCnt_url = "https://review.suning.com/ajax/useful_count/" + str(commodityReviewId) + "-usefulCnt.htm"
                meta = {"product_id": product_id,"commodityReviewId":commodityReviewId,"content":content,"qualityStar":qualityStar,"publishTime":publishTime,"nickName":nickName}
                yield scrapy.Request(url=usefulCnt_url,meta=meta,callback=self.ReviewUsefulCnt)

    # fetch the helpful-vote count and assemble the review item
    def ReviewUsefulCnt(self,response):
        get_meta = response.meta
        ProductId = get_meta["product_id"]
        ReviewId = get_meta["commodityReviewId"]
        ReviewEr = get_meta["nickName"]
        ReviewContent = re.sub('<br/>', "",get_meta["content"])
        ReviewRating = get_meta["qualityStar"]
        ReviewTime = get_meta["publishTime"]
        # number of times the review was marked as helpful
        reviewUsefuAndReplylList = re.findall(r'[(](.*?)[)]', response.text)[0]
        reviewUsefuAndReplylList = json.loads(reviewUsefuAndReplylList)
        ReviewHelful = reviewUsefuAndReplylList["reviewUsefuAndReplylList"][0]["usefulCount"]

        reviewItem = SuNingComuputerReview()
        reviewItem["ProductId"] = ProductId
        reviewItem["ReviewId"] = ReviewId
        reviewItem["ReviewEr"] = ReviewEr
        reviewItem["ReviewContent"] = ReviewContent
        reviewItem["ReviewRating"] = ReviewRating
        reviewItem["ReviewTime"] = ReviewTime
        reviewItem["ReviewHelful"] = ReviewHelful
        yield reviewItem
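
After the crawl is started with "scrapy crawl computer" from the project root, a quick pymongo check confirms that data landed in the configured collections (a sketch, assuming the local MongoDB from settings.py):

import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client["suning"]
print("products:", db["su_computer_list1"].count_documents({}))
print("reviews:", db["su_computer_review1"].count_documents({}))
client.close()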

