Scrapy + bs4: Scraping JD Product Listings

Scrape product information from JD.com search results with Scrapy and BeautifulSoup, and store the data in Redis via scrapy-redis.

The spider file under the spiders/ directory:

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from jd.items import JdItem
from scrapy_redis.spiders import RedisSpider


class GoodSpider(RedisSpider):
    name = 'good'
    # allowed_domains = ['search.jd.com']
    # start_urls = ['https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&ev=exbrand_Apple%5E&page=1&s=1&click=0']
    redis_key = "good:start_urls"
    # Seed the start URL from the Redis CLI:
    # lpush good:start_urls https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&ev=exbrand_Apple%5E&page=1&s=1&click=0
    # Then run this in the spiders directory:
    # scrapy runspider good.py

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        one_page_products = soup.find_all(class_='gl-item')
        for one_product in one_page_products:
            productId = one_product.get('data-sku')
            # JD lazy-loads images; the real URL lives in the
            # source-data-lazy-img attribute, not src
            productImg = one_product.find(class_="p-img").find('img').get('source-data-lazy-img')
            # Drop the leading ¥ from the price text
            productPrice = one_product.find(class_="p-price").get_text().strip()[1:]
            productTitle = one_product.find(class_="p-name").find('em').get_text().strip()
            productShop = one_product.find(class_="p-shop").get_text().strip()
            # The comment count sits in the last <a> inside p-commit,
            # whether there are one or two of them
            status_links = one_product.find(class_="p-commit").find_all('a')
            productStatus = status_links[-1].get_text().strip() if status_links else ''
            item = JdItem()
            item['productId'] = productId
            item['productImg'] = productImg
            item['productTitle'] = productTitle
            item['productShop'] = productShop
            item['productStatus'] = productStatus
            item['productPrice'] = productPrice
            yield item
        # Crawl 10 pages of results; JD's page parameter counts half-pages,
        # so logical page n maps to page=2*n-1
        for i in range(2, 11):
            yield scrapy.Request(f'https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq=ipad&ev=exbrand_Apple%5E&page={2*i-1}', callback=self.parse)
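
With scrapy_redis's RedisPipeline enabled (see settings.py below), each scraped item is serialized to JSON and pushed onto a Redis list, by default keyed "<spider name>:items", so good:items here. A minimal consumer sketch for reading the items back out, assuming the default serialization and the redis-py client:

import json
import redis

r = redis.Redis(host='localhost', port=6379)
while True:
    # blpop blocks until an item arrives, then returns a (key, value) pair;
    # scrapy_redis pushes items onto the "<spider name>:items" list by default
    _, data = r.blpop('good:items')
    item = json.loads(data)
    print(item['productId'], item['productTitle'], item['productPrice'])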

settings.py
from fake_useragent import UserAgent
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Note: this picks one random UA once at startup, not a new one per request
USER_AGENT = UserAgent().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 1

# Use scrapy_redis's scheduler and dupefilter so requests are queued and
# deduplicated in Redis, which is what makes the crawl distributable
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

ITEM_PIPELINES = {
   # 'commit.pipelines.CommitPipeline': 300,
   # Push scraped items onto the Redis list "<spider name>:items"
   'scrapy_redis.pipelines.RedisPipeline': 300,
}
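
One optional setting worth knowing about: by default scrapy_redis flushes its request queue and dupefilter when the spider closes. Keeping them around lets an interrupted crawl resume where it left off:

# Optional: persist the Redis request queue and dupefilter between runs,
# so an interrupted crawl resumes instead of starting over
SCHEDULER_PERSIST = True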

items.py
import scrapy


class JdItem(scrapy.Item):
    productImg = scrapy.Field()
    productPrice = scrapy.Field()
    productTitle = scrapy.Field()
    productShop = scrapy.Field()
    productStatus = scrapy.Field()
    productId = scrapy.Field()
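
As a quick sanity check, a scrapy Item can be constructed and read like a dict (the values below are made up for illustration):

item = JdItem(productId='100012345678', productTitle='iPad', productPrice='2499.00')
print(dict(item))  # {'productId': '100012345678', 'productTitle': 'iPad', 'productPrice': '2499.00'}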

