使用 Scrapy + BeautifulSoup (bs4) 爬取京东商品信息，并通过 scrapy-redis 把抓取到的数据存储到 Redis 里面。
下面依次是 spiders 目录下的爬虫文件、settings.py 和 items.py 的相关代码。
spiders 目录下的爬虫文件：
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from jd.items import JdItem
from scrapy_redis.spiders import RedisSpider
class GoodSpider(RedisSpider):
    """Scrape JD (京东) search-result pages for Apple iPad listings.

    This is a scrapy-redis spider: instead of ``start_urls`` it blocks on
    the Redis list named by ``redis_key``.  Seed it from redis-cli with::

        lpush good:start_urls https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&ev=exbrand_Apple%5E&page=1&s=1&click=0

    then run it from the spiders directory with ``scrapy runspider good.py``.
    """

    name = 'good'
    # allowed_domains = ['search.jd.com']
    redis_key = "good:start_urls"

    @staticmethod
    def _first_text(card, class_name):
        """Return the stripped text of the first descendant of *card* with
        CSS class *class_name*, or None when it is absent (result grids can
        contain ad/placeholder cards missing normal sub-elements)."""
        node = card.find(class_=class_name)
        return node.get_text().strip() if node else None

    def parse(self, response):
        """Yield one JdItem per product card, then schedule pages 2-10.

        NOTE(review): this callback runs for every page, so the follow-up
        requests are re-yielded on each page; the scrapy-redis
        RFPDupeFilter is what prevents them from being fetched repeatedly.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        for card in soup.find_all(class_='gl-item'):
            item = JdItem()
            item['productId'] = card.get('data-sku')

            # The real image URL is lazy-loaded and lives in a custom
            # attribute rather than ``src``.
            img_box = card.find(class_='p-img')
            img_tag = img_box.find('img') if img_box else None
            item['productImg'] = img_tag.get('source-data-lazy-img') if img_tag else None

            # [1:] drops the leading currency symbol (¥).
            price_text = self._first_text(card, 'p-price')
            item['productPrice'] = price_text[1:] if price_text else None

            name_box = card.find(class_='p-name')
            em_tag = name_box.find('em') if name_box else None
            item['productTitle'] = em_tag.get_text().strip() if em_tag else None

            item['productShop'] = self._first_text(card, 'p-shop')

            # The commit block holds either one <a> or two; the wanted text
            # is always in the last one.  Fall back to None so the item
            # never carries a raw bs4 ResultSet (the original left the
            # unprocessed list in place for 0 or >2 links).
            commit_box = card.find(class_='p-commit')
            links = commit_box.find_all('a') if commit_box else []
            item['productStatus'] = links[-1].get_text().strip() if links else None

            yield item

        # Crawl 10 pages of product data.  JD numbers result pages
        # 1, 3, 5, ... in the URL, so UI page n maps to ``page=2n-1``.
        for page in range(2, 11):
            yield scrapy.Request(
                'https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1'
                '&rt=1&stop=1&vt=2&bs=1&wq=ipad&ev=exbrand_Apple%5E'
                f'&page={2 * page - 1}',
                callback=self.parse,
            )
settings.py
from fake_useragent import UserAgent

# NOTE(review): ``UserAgent().random`` is evaluated once, when settings.py
# is imported — every request in the crawl therefore shares the same
# randomly chosen UA string.  For per-request rotation a downloader
# middleware would be needed; confirm whether that was the intent.
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Throttle: wait 1 second between requests to the same site.
DOWNLOAD_DELAY = 1
# scrapy-redis: share the request queue and the duplicate filter through
# Redis so several spider processes can cooperate on one crawl.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis connection used by the scheduler, dupefilter, and pipeline below.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
ITEM_PIPELINES = {
# 'commit.pipelines.CommitPipeline': 300,
# Store every scraped item in Redis (list keyed by "<spider>:items").
'scrapy_redis.pipelines.RedisPipeline': 300,
}
items.py
import scrapy
class CommitItem(scrapy.Item):
    """Empty leftover item class.

    NOTE(review): as written in the original, this class had no body at
    all before ``JdItem`` began — a SyntaxError (or an accidental nesting
    of JdItem inside it).  It appears to belong to the commented-out
    ``commit.pipelines.CommitPipeline`` in settings.py; kept with an
    explicit ``pass`` for backward compatibility.
    """
    pass


class JdItem(scrapy.Item):
    """One JD search-result product card scraped by GoodSpider."""
    productImg = scrapy.Field()     # lazy-load image URL (source-data-lazy-img attr)
    productPrice = scrapy.Field()   # price text with the leading ¥ stripped
    productTitle = scrapy.Field()   # text of the <em> inside the p-name block
    productShop = scrapy.Field()    # shop name text
    productStatus = scrapy.Field()  # text of the last <a> in the p-commit block
    productId = scrapy.Field()      # JD SKU (data-sku attribute)