scrapy抓取当当网部分商品信息

spider编码

class DangdangSpider(scrapy.Spider):
    """Crawl a dangdang.com category listing, yielding one item per product."""

    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4004279.html']

    def parse(self, response):
        """Extract product fields from each listing <li>, then follow pagination.

        Yields one P9DangdangItem per product and, when a "next" link is
        present, a Request back into this method for the following page.
        """
        li_list = response.xpath("//li[contains(@class, 'line')]")
        for index, li in enumerate(li_list):
            item = P9DangdangItem()
            item["name"] = li.xpath(".//p[@class='name']/a/@title").extract_first()
            item["price"] = li.xpath(".//p[@class='price']/span/text()").extract_first()
            item["review_num"] = li.xpath(".//p[@class='star']/a/text()").extract_first()
            item["shop_name"] = li.xpath(".//p[@class='link']/a/text()").extract_first()
            # The listing lazy-loads images: only the first 8 <img> tags carry
            # a real @src; the rest keep the URL in @data-original.
            img_attr = "@src" if index < 8 else "@data-original"
            item["img_src"] = li.xpath(".//a[@class='pic']/img/" + img_attr).extract_first()
            yield item

        # Evaluate the "next page" XPath once (the original ran it twice in
        # the same conditional expression).
        next_href = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_href:
            yield scrapy.Request(
                url="http://category.dangdang.com/" + next_href,
                callback=self.parse,
            )

pipelines编码

class P9DangdangPipeline(object):
    """Persist scraped items into the local MongoDB collection dangdang.commodity."""

    def __init__(self):
        # One client per pipeline instance; released in close_spider().
        self.mongo_client = pymongo.MongoClient(host="localhost", port=27017)
        self.to_data_path = self.mongo_client["dangdang"]["commodity"]

    def process_item(self, item, spider):
        """Insert the item as a plain dict, then pass it on to later pipelines."""
        self.to_data_path.insert_one(dict(item))
        # Format string instead of `item["name"] + "..."`: the spider can yield
        # a None name (extract_first misses), which would raise TypeError here.
        print("{} save to mongo successfully".format(item.get("name")))
        return item

    def close_spider(self, spider):
        # Close the MongoDB connection when the spider finishes.
        self.mongo_client.close()


class P9DangdangPipeline_save_img(object):
    """Download each item's cover image to ./data/imgs/<name>.jpg."""

    # Hoisted out of process_item: the headers are identical for every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }

    def process_item(self, item, spider):
        """Fetch item["img_src"] and write it to disk; returns the item."""
        import os  # local import so this block stays self-contained

        src = item["img_src"]
        # Strip '/' so the product name is safe to use as a file name.
        item["name"] = re.sub(r"/", "", item["name"])
        # Original crashed with FileNotFoundError when the directory was missing.
        os.makedirs("./data/imgs", exist_ok=True)
        file_name = "./data/imgs/" + item["name"] + ".jpg"
        img_bytes = requests.get(src, headers=self.HEADERS).content
        with open(file_name, "wb") as fp:
            fp.write(img_bytes)
        print(file_name + " save to local path successfully")  # was missing the space
        # Original returned None, which hands None to any later pipeline.
        return item

items编码

class P9DangdangItem(scrapy.Item):
    """Container for one dangdang.com product scraped from a category page."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()        # product title (from the listing <a> @title)
    price = scrapy.Field()       # displayed price text
    review_num = scrapy.Field()  # review-count link text
    shop_name = scrapy.Field()   # seller/shop link text
    img_src = scrapy.Field()     # cover image URL (@src or lazy-load @data-original)

settings.py设置

# Headers merged into every request. NOTE: the original snippet had a trailing
# comma after each closing brace, which turned both settings into one-element
# tuples instead of dicts — Scrapy would reject/ignore them.
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  # Original value was corrupted with a pasted "User-Agent," prefix before
  # the actual UA string; the header value must start with the product token.
  'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'
}
# Enable both pipelines; lower number runs first (Mongo save, then image download).
ITEM_PIPELINES = {
   'p9_dangdang.pipelines.P9DangdangPipeline': 300,
   'p9_dangdang.pipelines.P9DangdangPipeline_save_img': 301
}
# Throttle to roughly one request per second to be polite to the site.
DOWNLOAD_DELAY = 1

爬取结果如下:
在这里插入图片描述
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值