京东全网爬虫—— 旧版
根据B站2019年的视频学习而写出的代码,现在无法爬取京东数据。本次使用的内容数据来源于 http://dc.3.cn/category/get
一、商品分类爬虫
1. 创建分类爬虫并爬取数据
# 进入项目目录
cd mall_spider
# 创建爬虫(名称须与下文一致,这里使用 jd_category)
scrapy genspider jd_category jd.com
① 在Terminal中创建爬虫 jd_category
scrapy genspider jd_category jd.com
② 编写 jd_category.py文件
import scrapy
class JdCategorySpider(scrapy.Spider):
    """First cut of the category spider: fetch the category feed and dump it.

    Used only to inspect the raw payload before writing the real parser.
    """
    name = 'jd_category'
    allowed_domains = ['3.cn']
    start_urls = ['http://dc.3.cn/category/get']

    def parse(self, response):
        # The dc.3.cn endpoint responds GBK-encoded; decode before printing.
        print(response.body.decode('GBK'))
③ 在Terminal中执行jd_category —— 查看拿到的数据
scrapy crawl jd_category
# 输出结果
# {"data":[{"id":"a","t":["jiadian.jd.com/|家电馆||0","pro.jd.com/mall/active/6fboBT3QVrMUPHRh9kxKXPsz7bc/index.html|家电专卖店||0","pro.jd.com/mall/active/3
Fy6fJbih9wXySuLJjZAKBiVNMje/index.html|家电服务||0","jdqc.jd.com/|企业采购||0","list.jd.com/list.html?cat=737,14857|商用电器||0","pro.jd.com/mall/active/vg
djc9m4qW1t5qrx4aQceh5sbbb/index.html|高价回收||0"],"b":["//mall.jd.com/index-1000282702.html|美的|vclist/jfs/t1/157139/14/7668/97909/602e0c61Ee3d70412/3748
69e30eeffddf.jpg|","pro.jd.com/mall/active/3BPQ3jZhu2VtDQA83UHDSVpbkQbK/index.html|西门子|vclist/jfs/t1/90641/24/3606/1171/5de0e340Eaad1e3e0/5b9cad64436c05
ff.jpg|","mall.jd.com/index-1000000950.html|东联|vclist/jfs/t1/162273/9/5081/4182/60175691Eb53ef9fa/edc904211f2fed1f.jpg|","//pro.jd.com/mall/active/x4jSkh... 很多
④ 完整版jd_category.py文件
import scrapy
import json
from mall_spider.items import Category
class JdCategorySpider(scrapy.Spider):
    """Crawl JD's category tree from dc.3.cn and yield Category items.

    Walks big -> middle -> small categories; one item is yielded per
    small category, carrying the names/URLs of all three levels.
    """
    name = 'jd_category'
    allowed_domains = ['3.cn']
    start_urls = ['http://dc.3.cn/category/get']

    def parse(self, response):
        """Parse the GBK-encoded JSON category feed."""
        result = json.loads(response.body.decode('GBK'))
        for data in result['data']:
            item = Category()
            # Big category: first entry of the 's' list.
            big = data['s'][0]
            item['b_category_name'], item['b_category_url'] = \
                self.get_category_name_url(big['n'])
            # Middle categories nest under the big one ...
            for middle in big['s']:
                item['m_category_name'], item['m_category_url'] = \
                    self.get_category_name_url(middle['n'])
                # ... and small categories under each middle one.
                for small in middle['s']:
                    item['s_category_name'], item['s_category_url'] = \
                        self.get_category_name_url(small['n'])
                    # Hand the fully populated item to the engine.
                    yield item

    def get_category_name_url(self, category_info):
        """Split a 'url|name|…' info string and return (name, full_url).

        Three URL shapes occur in the feed; each is normalized to a full
        https URL.
        """
        parts = category_info.split('|')
        url, name = parts[0], parts[1]
        if url.count('jd.com') == 1:
            # e.g. mvd.jd.com/music.html -> just prepend the scheme
            url = 'https://' + url
        elif url.count('-') == 1:
            # e.g. 1713-6929 -> channel page
            url = 'https://channel.jd.com/{}.html'.format(url)
        elif url.count('-') == 2:
            # e.g. 9855-17084-17089 -> list page; '-' separators become ','
            url = 'https://list.jd.com/list.html?cat={}'.format(url.replace('-', ','))
        return name, url
输出结果只选取了三个
{'b_category_name': '工业品',
'b_category_url': 'https://mro.jd.com/',
'm_category_name': '实验用品',
'm_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137',
's_category_name': '实验室试剂',
's_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137,14138'}
{'b_category_name': '工业品',
'b_category_url': 'https://mro.jd.com/',
'm_category_name': '实验用品',
'm_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137',
's_category_name': '实验室耗材',
's_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137,14139'}
{'b_category_name': '工业品',
'b_category_url': 'https://mro.jd.com/',
'm_category_name': '实验用品',
'm_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137',
's_category_name': '实验室设备',
's_category_url': 'https://i-list.jd.com/list.html?cat=14065,14137,14140'}
2. 保存分类信息
(1)实现保存分类的Pipeline类
from itemadapter import ItemAdapter
from mall_spider.spiders.jd_category import JdCategorySpider
from pymongo import MongoClient
from mall_spider.settings import MONGODB_URL
"""
实现保存分类的Pipeline类
步骤:open_spider方法中,链接MongoDB数据库,获取要操作的集合
process_item方法中,向MongoDB中插入类别数据
close_spider方法中,关闭MongoDB的链接
"""
class CategoryPipeline(object):
    """Persist Category items into MongoDB (database 'jd', collection 'category').

    open_spider  -> connect to MongoDB and grab the target collection
    process_item -> insert one document per category item
    close_spider -> close the MongoDB connection
    """

    def open_spider(self, spider):
        # Pipelines are shared by all spiders; only act for the category one.
        if isinstance(spider, JdCategorySpider):
            self.client = MongoClient(MONGODB_URL)
            self.collection = self.client['jd']['category']

    def process_item(self, item, spider):
        if isinstance(spider, JdCategorySpider):
            self.collection.insert_one(dict(item))
        # Always return the item so later pipelines still receive it.
        return item

    def close_spider(self, spider):
        if isinstance(spider, JdCategorySpider):
            self.client.close()
(2)在settings.py中开启类别的Pipeline
# settings.py — register the category pipeline (the number is its priority;
# lower values run first).
ITEM_PIPELINES = {
    'mall_spider.pipelines.CategoryPipeline': 300,
}
3. 实现商品爬虫
(1)首先创建一个爬虫
scrapy genspider jd_product jd.com
(2)使用Charles实现手机抓包
(3)商品爬虫并实现分布式
import scrapy
import json
from jsonpath import jsonpath
from mall_spider.items import Product
from scrapy_redis.spiders import RedisSpider
import pickle
# 重写start_requests方法,根据分类信息构建列表页的请求
# 解析列表页,提取商品的skuid,构建商品基本的信息请求,实现翻页
# 解析商品基本信息,构建商品促销信息的请求
# 解析促销信息,构建商品评价信息的请求
# 解析商品评价信息,构建价格信息的请求
# 解析价格信息
"""scrapy_redis:分布式爬虫
修改爬虫类
修改继承关系:继承RedisSpider
指定redis_key
把重写start_requests改为 重写 make_request_from_data
"""
class JdProductSpider(RedisSpider):
    """Distributed (scrapy_redis) product spider.

    Callback chain:
      make_request_from_data -> parse (list page, paginated)
      -> parse_product_base -> parse_product_ad
      -> parse_product_comments -> parse_product_price -> yield item
    """
    name = 'jd_product'
    allowed_domains = ['jd.com', 'p.3.cn']
    # Redis list key holding the serialized start categories
    # (populated by add_category_to_redis.py).
    redis_key = 'jd_product:category'

    def make_request_from_data(self, data):
        """Build the list-page request from one pickled category dict read
        from Redis (replaces start_requests for a RedisSpider)."""
        # NOTE(review): unpickling data from Redis is only safe if the queue
        # cannot be written by untrusted parties.
        category = pickle.loads(data)
        # Fixed: scrapy.Request — the original called the non-existent
        # scrapy.Requests (here and in every other callback below).
        return scrapy.Request(category['s_category_url'], callback=self.parse,
                              meta={'category': category})

    def parse(self, response):
        """Parse a list page: extract sku ids, request product details,
        and follow pagination."""
        category = response.meta['category']
        # Each product tile exposes its sku id via @data-sku.
        sku_ids = response.xpath('//div[contains(@class,"j-sku-item")]/@data-sku').extract()
        for sku_id in sku_ids:
            item = Product()
            item['product_category'] = category
            item['product_sku_id'] = sku_id
            # Mobile-API endpoint serving the basic product info.
            product_base_url = 'https://cdnware.m.jd.com/c1/skuDetail/apple/7.3.0/{}.json'.format(sku_id)
            yield scrapy.Request(product_base_url, callback=self.parse_product_base,
                                 meta={'item': item})
        # Follow the "next page" link if present.
        next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse,
                                 meta={'category': category})

    def parse_product_base(self, response):
        """Parse basic product info, then request promotion info."""
        item = response.meta['item']
        result = json.loads(response.text)
        # 1. product name
        item['product_name'] = result['wareInfo']['basicInfo']['name']
        # 2. product image URL
        item['product_img_url'] = result['wareInfo']['basicInfo']['wareImage'][0]['small']
        # 3. book info (author, publisher) — None for non-book products
        item['product_book_info'] = result['wareInfo']['basicInfo']['bookInfo']
        # 4. product options (colour / size ...)
        color_size = jsonpath(result, '$..colorSize')
        if color_size:
            # colorSize itself is a list and jsonpath wraps results in a
            # list, so unwrap one level.
            color_size = color_size[0]
            product_option = {}
            for option in color_size:
                title = option['title']
                value = jsonpath(option, '$..text')
                # Fixed: key by the option title — the original wrote
                # product_option['title'] = value, so every option
                # overwrote the literal key 'title'.
                product_option[title] = value
            # Fixed: the original item key had a trailing space
            # ('product_option ').
            item['product_option'] = product_option
        # 5. shop info; an empty shop entry means the product is JD-operated
        shop = jsonpath(result, '$..shop')
        if shop:
            shop = shop[0]
            if shop:
                item['product_shop'] = {
                    'shop_id': shop['shopId'],
                    'shop_name': shop['name'],
                    'shop_score': shop['score'],
                }
            else:
                item['product_shop'] = {'shop_name': '京东自营'}
        # 6. category id: "9987;653;665" -> the promotion API wants commas
        item['product_category_id'] = result['wareInfo']['basicInfo']['category']
        item['product_category_id'] = item['product_category_id'].replace(';', ',')
        ad_url = 'https://cd.jd.com/promotion/v2?skuId={}&area=1_72_4137_0&cat={}'.\
            format(item['product_sku_id'], item['product_category_id'])
        yield scrapy.Request(ad_url, callback=self.parse_product_ad,
                             meta={'item': item})

    def parse_product_ad(self, response):
        """Parse promotion info, then request comment statistics."""
        item = response.meta['item']
        # The promotion endpoint responds GBK-encoded.
        result = json.loads(response.body.decode('GBK'))
        # Call jsonpath once instead of twice (the original evaluated the
        # same query in both the condition and the value).
        ads = jsonpath(result, '$..ad')
        item['product_ad'] = ads[0] if ads else ''
        # Fixed URL typos: 'productCommentSummaries' / 'referenceIds'
        # (the original had 'productCommentSummarles' and 'referencelds').
        comments_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.\
            format(item['product_sku_id'])
        yield scrapy.Request(comments_url, callback=self.parse_product_comments,
                             meta={'item': item})

    def parse_product_comments(self, response):
        """Parse comment statistics, then request the price."""
        item = response.meta['item']
        result = json.loads(response.text)
        # total count, good/poor counts and good rate
        item['product_comments'] = {
            'CommentCount': jsonpath(result, '$..CommentCount')[0],
            'GoodCount': jsonpath(result, '$..GoodCount')[0],
            'PoorCount': jsonpath(result, '$..PoorCount')[0],
            'GoodRate': jsonpath(result, '$..GoodRate')[0],
        }
        price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(item['product_sku_id'])
        yield scrapy.Request(price_url, callback=self.parse_product_price,
                             meta={'item': item})

    def parse_product_price(self, response):
        """Parse the price and hand the finished item to the engine."""
        item = response.meta['item']
        result = json.loads(response.text)
        item['product_price'] = result[0]['p']
        yield item
① 在settings.py文件中设置属性
# settings.py — scrapy_redis configuration
# Redis connection URL.
REDIS_URL = 'redis://127.0.0.1:6379/0'
# Dedupe filter class. Fixed: the scrapy_redis class is RFPDupeFilter
# (request-fingerprint dupe filter); the original 'REPDupeFilter' does
# not exist and would crash at startup.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler that keeps the request queue in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to persist scheduling state:
#   True  -> keep crawled fingerprints and pending requests in Redis on exit
#   False -> clear them when the program ends
SCHEDULER_PERSIST = True
② 在项目文件夹下创建add_category_to_redis.py
from pymongo import MongoClient
from redis import StrictRedis
import pickle
from mall_spider.settings import MONGODB_URL, REDIS_URL
from mall_spider.spiders.jd_product import JdProductSpider
def add_category_to_redis():
    """Copy every category document from MongoDB into the Redis list that
    JdProductSpider reads its start requests from.

    Each document is pickled; the spider's make_request_from_data
    unpickles the raw bytes on the other side.
    """
    # 1. connect to MongoDB and Redis
    mongo = MongoClient(MONGODB_URL)
    redis = StrictRedis.from_url(REDIS_URL)
    try:
        # 2. read all category documents
        collection = mongo['jd']['category']
        for category in collection.find():
            # 3. serialize and push onto the spider's redis_key list
            redis.lpush(JdProductSpider.redis_key, pickle.dumps(category))
    finally:
        # Fixed: close the MongoDB connection even if Redis/Mongo raises
        # (the original only closed it on the success path).
        mongo.close()


if __name__ == '__main__':
    add_category_to_redis()
二、保存商品信息
1. 实现存储商品Pipeline类
from mall_spider.spiders.jd_product import JdProductSpider
class ProductPipeline(object):
    """Persist Product items into MongoDB (database 'jd', collection 'product')."""

    def open_spider(self, spider):
        # Connect once when the product spider starts.
        # NOTE(review): this snippet relies on MongoClient and MONGODB_URL
        # already being imported in pipelines.py (they are imported for
        # CategoryPipeline) — confirm when assembling the file.
        if isinstance(spider, JdProductSpider):
            self.client = MongoClient(MONGODB_URL)
            self.collection = self.client['jd']['product']

    def process_item(self, item, spider):
        # Fixed: the original tested isinstance(spider, JdCategorySpider)
        # (a class not even imported in this snippet), so product items
        # were never written by this pipeline.
        if isinstance(spider, JdProductSpider):
            self.collection.insert_one(dict(item))
        # Always return the item so later pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Close the MongoDB connection when the product spider stops.
        if isinstance(spider, JdProductSpider):
            self.client.close()
2. 在settings.py开启这个管道
# settings.py — both pipelines enabled; the lower priority number runs first.
ITEM_PIPELINES = {
    'mall_spider.pipelines.CategoryPipeline': 300,
    # product pipeline runs after the category pipeline
    'mall_spider.pipelines.ProductPipeline': 301,
}
三、下载器中间件
1. 实现随机User-Agent的中间件
import requests
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
"""
实现下载器中间件
"""
# 1.实现随机User-Agent的中间件
# 1.1 准备User-Agent列表
# Pool of desktop-browser User-Agent strings; RandomUserAgent picks one at
# random per request. NOTE(review): these UA strings date from ~2012 and may
# be rejected by modern anti-bot checks — consider refreshing the list.
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# 1.2在middlewares.py中,实现RandomUserAgent类
class RandomUserAgent(object):
    """Downloader middleware that sets the User-Agent header on every request.

    Requests to the JD mobile API (cdnware.m.jd.com) get the fixed iPhone
    client UA that endpoint expects; all other requests get a random
    desktop UA from USER_AGENTS.
    """

    def process_request(self, request, spider):
        if request.url.startswith('https://cdnware.m.jd.com'):
            ua = 'JD4iPhone/164880(iPhone; iOS 12.1.2;Scale/2,00)'
        else:
            ua = random.choice(USER_AGENTS)
        request.headers['user-agent'] = ua
import re

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.core.downloader.handlers.http11 import TunnelError
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone, ConnectError, ConnectionLost, TCPTimedOutError
# Fixed typo: the class is ResponseFailed — the original imported the
# non-existent 'ResponseFailede' while ProxyMiddleware below references
# ResponseFailed, which would raise ImportError/NameError at startup.
from twisted.web.client import ResponseFailed
# 2.实现代理IP中间件
# 2.1 在middlewares.py中,实现ProxyMiddleware类
class ProxyMiddleware(object):
    """Downloader middleware that routes each request through a proxy from a
    local proxy-pool service, and reports dead proxies back to the pool."""

    # Network-level exceptions that indicate the proxy IP is unusable.
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def process_request(self, request, spider):
        # Ask the pool for one random https-capable proxy valid for jd.com.
        response = requests.get('http://localhost:6868/random?protocol=https&domain=jd.com')
        request.meta['proxy'] = response.content.decode()
        # Returning None lets Scrapy continue processing the request.
        return None

    def process_exception(self, request, exception, spider):
        # On a proxy-related failure, tell the pool to disable this IP for
        # the jd.com domain.
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            # Fixed typo: 'localhsot' -> 'localhost' (the disable call
            # could never reach the pool).
            url = 'http://localhost:6868/disable_domain'
            proxy = request.meta['proxy']
            # Raw string so \d is a regex digit class, not a string escape.
            ip = re.findall(r'https?://(.+?):\d+', proxy)[0]
            params = {
                'ip': ip,
                'domain': 'jd.com',
            }
            requests.get(url, params=params)
2. 在settings.py文件开启,下载器中间件
# settings.py — enable both downloader middlewares (number = priority;
# lower runs first on the request path).
DOWNLOADER_MIDDLEWARES = {
    'mall_spider.middlewares.ProxyMiddleware': 300,
    'mall_spider.middlewares.RandomUserAgent': 301,
}