Building a JD.com Mall Scraper in Python
1 Building a JD.com Mall Scraper in Python
1.1 Creating the JD Scraper Project
# 1. Activate the virtual environment
conda activate env_spider
# 2. Create the Scrapy project
scrapy startproject spider_jd
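For reference, the startproject command generates the standard Scrapy scaffolding; all files edited in the rest of this section (items.py, settings.py, pipelines.py, spiders/) live inside it:

spider_jd/
├── scrapy.cfg
└── spider_jd/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py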
1.2 Scraping Product Category Data (Scrapy)
1.2.1 Creating the Category Data Classes
"items.py"
import scrapy

class Category(scrapy.Item):
    # Big (top-level) category name
    b_category_name = scrapy.Field()
    # Big category URL
    b_category_url = scrapy.Field()
    # Middle category name
    m_category_name = scrapy.Field()
    # Middle category URL
    m_category_url = scrapy.Field()
    # Small category name
    s_category_name = scrapy.Field()
    # Small category URL
    s_category_url = scrapy.Field()

class Product(scrapy.Item):
    # Product category
    product_category = scrapy.Field()
    # Product SKU ID
    product_sku_id = scrapy.Field()
    # Product name
    product_name = scrapy.Field()
    # Product image URL
    product_img_url = scrapy.Field()
    # Shop selling the product
    product_shop = scrapy.Field()
    # Book info (author / publisher)
    product_book_info = scrapy.Field()
    # Product options
    product_option = scrapy.Field()
    # Number of product reviews
    product_comments = scrapy.Field()
    # Product promotions
    product_ad = scrapy.Field()
    # Product price
    product_price = scrapy.Field()
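Scrapy Item objects support dict-style access, which is what allows the pipeline later in this section to call dict(item) before inserting into MongoDB. A quick illustration (the values are made up):

item = Category()
item['b_category_name'] = 'Books'              # field assignment works like a dict
item['b_category_url'] = 'https://book.jd.com'
print(dict(item))  # {'b_category_name': 'Books', 'b_category_url': 'https://book.jd.com'}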
1.2.2 Creating the Category Spider
# Enter the project directory
cd spider_jd
# Generate the spider
scrapy genspider category_spider jd.com
"category_spider.py"
import scrapy
import json

from spider_jd.items import Category

class CategorySpider(scrapy.Spider):
    name = "category_spider"
    allowed_domains = ["dc.3.cn"]
    start_urls = ["https://dc.3.cn/category/get"]

    def parse(self, response):
        # The endpoint returns GBK-encoded JSON.
        categories = json.loads(response.body.decode('GBK'))
        for category in categories['data']:
            item = Category()
            b_category = category['s'][0]
            b_category_info = b_category['n']
            print('=' * 50)
            item['b_category_name'], item['b_category_url'] = self.get_category_name_url(b_category_info)
            print(item['b_category_name'], item['b_category_url'])
            for m_category in b_category['s']:
                m_category_info = m_category['n']
                item['m_category_name'], item['m_category_url'] = self.get_category_name_url(m_category_info)
                # print('\t', item['m_category_name'], item['m_category_url'])
                for s_category in m_category['s']:
                    s_category_info = s_category['n']
                    item['s_category_name'], item['s_category_url'] = self.get_category_name_url(s_category_info)
                    # print('\t\t', item['s_category_name'], item['s_category_url'])
                    yield item

    def get_category_name_url(self, category_info):
        # Each 'n' field looks like "url-fragment|name|...", separated by '|'.
        data_list = category_info.split('|')
        category_name = data_list[1]
        category_url = data_list[0]
        if 'jd.com' in category_url:
            # Already a full hostname, e.g. "book.jd.com".
            category_url = 'https://' + category_url
        elif category_url.count('-') == 1:
            # Two-part IDs map to channel pages.
            category_url = 'https://channel.jd.com/' + category_url + '.html'
        else:
            # Three-part IDs map to list pages, with '-' replaced by ','.
            category_url = 'https://list.jd.com/list.html?cat=' + category_url.replace('-', ',')
        return category_name, category_url
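For orientation, the parsing code above implies roughly the following response shape for https://dc.3.cn/category/get. This is inferred from the code, with illustrative placeholder values, not a verbatim capture:

# {
#   "data": [
#     {
#       "s": [
#         {
#           "n": "1713-0-0|Books|0",       # big category: "url-fragment|name|..."
#           "s": [                          # middle categories
#             {
#               "n": "channel-url|Novels|0",
#               "s": [                      # small categories
#                 {"n": "1713-3258-3297|Fiction|0"}
#               ]
#             }
#           ]
#         }
#       ]
#     }
#   ]
# }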
1.2.3 Saving the Category Data
- Configure the MongoDB connection in settings (a quick connectivity check follows below)
"settings.py"
# MongoDB connection string
MONGODB_URL = 'mongodb://127.0.0.1:27017'
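Before launching the crawl, it may be worth confirming that a mongod instance is actually listening at that address. A minimal check with pymongo (not part of the project code):

from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017', serverSelectionTimeoutMS=3000)
client.admin.command('ping')  # raises ServerSelectionTimeoutError if MongoDB is unreachable
print('MongoDB is reachable')
client.close()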
- Implement the CategoryPipeline class in pipelines
"pipelines.py"
from spider_jd.spiders.category_spider import CategorySpider
from spider_jd.settings import MONGODB_URL
from pymongo import MongoClient

class CategoryPipeline:
    def open_spider(self, spider):
        # Only open a connection when the category spider runs.
        if isinstance(spider, CategorySpider):
            self.client = MongoClient(MONGODB_URL)
            self.collection = self.client['jd']['category']

    def process_item(self, item, spider):
        if isinstance(spider, CategorySpider):
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        if isinstance(spider, CategorySpider):
            self.client.close()
- Enable CategoryPipeline in settings
ITEM_PIPELINES = {
    "spider_jd.pipelines.CategoryPipeline": 300,
}
1.2.4 Running the Category Spider
- Run from the command line
scrapy crawl category_spider
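Once the crawl finishes, the saved documents can be sanity-checked directly with pymongo (again, not part of the project code):

from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')
collection = client['jd']['category']
print(collection.count_documents({}))       # number of category documents saved
print(collection.find_one({}, {'_id': 0}))  # inspect one document
client.close()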
1.3 Scraping JD Product Data (Selenium)
Because JD loads product listings dynamically, Selenium is used to drive the page (scrolling and navigating) and scrape the data.
The code for scraping phone listings is shown below:
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

def get_phone(driver, category, products):
    url = category['s_category_url']
    # Brands already scraped can be skipped on re-runs.
    brands = products.distinct("phone_brand")
    driver.get(url)
    time.sleep(5)
    count = 0
    # Collect the brand filter links from the sidebar.
    ul_brand = driver.find_element(By.CSS_SELECTOR, '.v-fixed')
    li_brand_list = ul_brand.find_elements(By.TAG_NAME, 'li')
    url_brand_list = []
    phone_brand_list = []
    for li_brand in li_brand_list:
        brand = li_brand.find_element(By.TAG_NAME, 'a')
        url_brand = brand.get_attribute('href')
        phone_brand = brand.get_attribute('title')
        url_brand_list.append(url_brand)
        phone_brand_list.append(phone_brand)
    for url_brand, phone_brand in zip(url_brand_list, phone_brand_list):
        if phone_brand in brands:
            continue
        driver.get(url_brand)
        time.sleep(3)
        count += 1
        # Press PAGE_DOWN several times to trigger lazy loading of the listing.
        for i in range(6):
            ActionChains(driver) \
                .key_down(Keys.PAGE_DOWN) \
                .key_up(Keys.PAGE_DOWN) \
                .perform()
            time.sleep(2)
        # Grab the product list items on the page.
        ul = driver.find_element(By.CSS_SELECTOR, '.gl-warp')
        li_list = ul.find_elements(By.CSS_SELECTOR, '.gl-item')
        for li in li_list:
            if count % 5 == 0:
                # On every 5th brand page, scroll down 250px per item
                # to keep lazy-loaded content coming in.
                height = 250
                script_text = 'window.scrollBy(0,' + str(height) + ');'
                print(script_text)
                driver.execute_script(script_text)
                # time.sleep(1)
            # Build one product record.
            product = {
                'category_id': category['_id'],
                'b_category_name': category['b_category_name'],
                'm_category_name': category['m_category_name'],
                's_category_name': category['s_category_name'],
                'phone_brand': phone_brand
            }
            # Product SKU ID
            skuid = li.get_attribute("data-sku")
            product['skuid'] = skuid
            product['product_url'] = f"https://item.jd.com/{skuid}.html"
            print('=' * 30, skuid)
            # Product image
            CSS_img = '.gl-i-wrap > .p-img > a:nth-child(1) > img:nth-child(1)'
            if is_element_exist(driver, CSS_img, li):
                img = li.find_element(By.CSS_SELECTOR, CSS_img)
                img_url = img.get_attribute("src")
                product['img_url'] = img_url
            else:
                product['img_url'] = ''
                print('! image not found')
            # Product variants (models)
            CSS_phone_type_ul = '.gl-i-wrap > .p-scroll > .ps-wrap > .ps-main'
            if is_element_exist(driver, CSS_phone_type_ul, li):
                phone_type_ul = li.find_element(By.CSS_SELECTOR, CSS_phone_type_ul)
                phone_type_li_list = phone_type_ul.find_elements(By.TAG_NAME, 'li')
                phone_type_list = []
                phone_type_img_list = []
                for phone_type_li in phone_type_li_list:
                    phone_type_a = phone_type_li.find_element(By.CSS_SELECTOR, 'a')
                    phone_type = phone_type_a.get_attribute("title")
                    phone_type_list.append(phone_type)
                    phone_type_img = phone_type_a.find_element(By.CSS_SELECTOR, 'img')
                    phone_type_img_url = phone_type_img.get_attribute("src")
                    phone_type_img_list.append(phone_type_img_url)
                product['phone_type'] = phone_type_list
                product['phone_type_img'] = phone_type_img_list
            else:
                product['phone_type'] = []
                product['phone_type_img'] = []
                print('! variants not found')
            # Product price
            CSS_phone_price = '.gl-i-wrap > .p-price > strong > i'
            if is_element_exist(driver, CSS_phone_price, li):
                phone_price_i = li.find_element(By.CSS_SELECTOR, CSS_phone_price)
                phone_price = phone_price_i.text
                product['phone_price'] = phone_price
            else:
                product['phone_price'] = ''
                print('! price not found')
            # Product name
            CSS_phone_name = '.gl-i-wrap > div:nth-child(4) > a > em'
            if is_element_exist(driver, CSS_phone_name, li):
                phone_name = li.find_element(By.CSS_SELECTOR, CSS_phone_name).text
                product['phone_name'] = phone_name
            else:
                product['phone_name'] = ''
                print('! name not found')
            # Number of reviews
            CSS_phone_commit = '#J_comment_' + str(skuid)
            if is_element_exist(driver, CSS_phone_commit, li):
                phone_commit = li.find_element(By.CSS_SELECTOR, CSS_phone_commit)
                phone_commit_num = phone_commit.text
                product['phone_commit_num'] = phone_commit_num
            else:
                product['phone_commit_num'] = ''
                print('! review count not found')
            # Shop name and URL
            CSS_phone_shop = '.gl-i-wrap > .p-shop > .J_im_icon > a'
            if is_element_exist(driver, CSS_phone_shop, li):
                phone_shop = li.find_element(By.CSS_SELECTOR, CSS_phone_shop).text
                phone_shop_url = li.find_element(By.CSS_SELECTOR, CSS_phone_shop).get_attribute("href")
                product['phone_shop'] = phone_shop
                product['phone_shop_url'] = phone_shop_url
            else:
                product['phone_shop'] = ''
                product['phone_shop_url'] = ''
                print('! shop not found')
            # Product badges (e.g. JD self-operated, flash sale)
            CSS_phone_shop_icon = '.gl-i-wrap > .p-icons'
            if is_element_exist(driver, CSS_phone_shop_icon, li):
                phone_shop_icon_div = li.find_element(By.CSS_SELECTOR, CSS_phone_shop_icon)
                phone_shop_icon_i_list = phone_shop_icon_div.find_elements(By.TAG_NAME, 'i')
                phone_icon_list = []
                for i in phone_shop_icon_i_list:
                    phone_icon_list.append(i.text)
                product['phone_shop_icon'] = phone_icon_list
            else:
                product['phone_shop_icon'] = []
                print('! badges not found')
            print(product)
            products.insert_one(product)
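The function above calls an is_element_exist helper that is not shown in this section, and it expects an already-created WebDriver plus the category and product collections. A minimal sketch of the missing pieces; the helper's behavior and the category filter are assumptions:

from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def is_element_exist(driver, css_selector, parent=None):
    # True if the CSS selector matches under `parent` (a WebElement),
    # or anywhere on the page when no parent is given.
    root = parent if parent is not None else driver
    try:
        root.find_element(By.CSS_SELECTOR, css_selector)
        return True
    except NoSuchElementException:
        return False

# Illustrative invocation: run get_phone over the phone categories saved in 1.2.
client = MongoClient('mongodb://127.0.0.1:27017')
categories = client['jd']['category']
products = client['jd']['product']
driver = webdriver.Chrome()
for category in categories.find({'s_category_name': '手机'}):  # filter is an assumption
    get_phone(driver, category, products)
driver.quit()
client.close()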
2 Building a Visual Storefront with FastAPI
The result: (screenshot of the storefront UI)
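The storefront code itself is not reproduced in this section. As a minimal sketch of how the scraped collection could be served through FastAPI (the route, parameters, and response shape are assumptions, not the original implementation):

from typing import Optional

from fastapi import FastAPI
from pymongo import MongoClient

app = FastAPI()
client = MongoClient('mongodb://127.0.0.1:27017')
products = client['jd']['product']

@app.get('/products')
def list_products(brand: Optional[str] = None, limit: int = 20):
    # Optional brand filter; exclude Mongo's _id so the response is JSON-serializable.
    query = {'phone_brand': brand} if brand else {}
    return list(products.find(query, {'_id': 0}).limit(limit))

# Run with: uvicorn main:app --reload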
3 Building a Q&A Chatbot with Gradio
A product-recommendation assistant built on a large language model with RAG (retrieval-augmented generation), which recommends products to users through natural-language conversation.
The result: (screenshot of the chatbot UI)
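The chatbot code is likewise not shown. A minimal sketch of the Gradio side, with a naive keyword lookup standing in for a real vector-store retrieval step and a placeholder call_llm function (both hypothetical):

import gradio as gr
from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')
products = client['jd']['product']

def call_llm(prompt):
    # Placeholder: replace with a real LLM call (OpenAI API, local model, etc.).
    return 'Answer generated from: ' + prompt[:200]

def respond(message, history):
    # Naive retrieval: substring match on product names. A real RAG pipeline
    # would embed the products and run a vector similarity search instead.
    hits = products.find({'phone_name': {'$regex': message}}, {'_id': 0}).limit(3)
    context = '\n'.join(str(h) for h in hits)
    return call_llm(f'User question: {message}\nCandidate products:\n{context}')

gr.ChatInterface(fn=respond).launch()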