毕设 基于Python实现京东商城爬虫

1 基于Python实现京东商城爬虫

1.1 创建京东爬虫项目

# 1. Activate the virtual environment
conda activate env_spider

# 2. Create the Scrapy project
scrapy startproject spider_jd

1.2 抓取商品分类数据(Scrapy)

1.2.1 创建商品分类数据类

"items.py"
class Category(scrapy.Item):
    """Scrapy item for one small category plus its parent medium and big categories."""

    # Big (top-level) category name
    b_category_name = scrapy.Field()
    # Big (top-level) category URL
    b_category_url = scrapy.Field()
    # Medium category name
    m_category_name = scrapy.Field()
    # Medium category URL
    m_category_url = scrapy.Field()
    # Small category name
    s_category_name = scrapy.Field()
    # Small category URL
    s_category_url = scrapy.Field()


class Product(scrapy.Item):
    """Scrapy item holding the attributes of one product."""

    # Product category
    product_category = scrapy.Field()
    # Product SKU ID
    product_sku_id = scrapy.Field()
    # Product name
    product_name = scrapy.Field()
    # Product image URL
    product_img_url = scrapy.Field()
    # Shop selling the product
    product_shop = scrapy.Field()
    # Book information / author / publisher
    product_book_info = scrapy.Field()
    # Product options
    product_option = scrapy.Field()
    # Number of product comments
    product_comments = scrapy.Field()
    # Product promotion text
    product_ad = scrapy.Field()
    # Product price
    product_price = scrapy.Field()

1.2.2 创建商品分类爬虫

# Enter the spider project directory
cd spider_jd
# Generate the spider
scrapy genspider category_spider jd.com
"category_spider.py"
import scrapy
import json
from spider_jd.items import Category


class CategorySpider(scrapy.Spider):
    """Spider that reads the category JSON and yields one ``Category`` per small category."""

    name = "category_spider"
    allowed_domains = ["dc.3.cn"]
    start_urls = ["https://dc.3.cn/category/get"]

    def parse(self, response):
        """Walk the big -> medium -> small category tree and yield items.

        The response body is decoded as GBK and parsed as JSON. Each entry of
        ``data`` contributes the first element of its ``s`` list as the big
        category; that node's ``s`` list holds the medium categories, each of
        which holds its small categories in its own ``s`` list. Every node
        carries its ``url|name|...`` descriptor under the key ``n``.

        Yields:
            Category: a *new* item for every small category, filled with the
            names and URLs of all three levels.
        """
        categories = json.loads(response.body.decode('GBK'))
        for category in categories['data']:
            b_category = category['s'][0]
            print('=' * 50)
            b_name, b_url = self.get_category_name_url(b_category['n'])
            print(b_name, b_url)

            for m_category in b_category['s']:
                m_name, m_url = self.get_category_name_url(m_category['n'])

                for s_category in m_category['s']:
                    s_name, s_url = self.get_category_name_url(s_category['n'])
                    # Build a fresh item per yield. Re-yielding one shared,
                    # mutated item makes every emitted item alias the same
                    # object, so anything that keeps a reference (an
                    # asynchronous pipeline, an exporter, a test) would see
                    # its fields overwritten by later iterations.
                    yield Category(
                        b_category_name=b_name,
                        b_category_url=b_url,
                        m_category_name=m_name,
                        m_category_url=m_url,
                        s_category_name=s_name,
                        s_category_url=s_url,
                    )

    def get_category_name_url(self, category_info):
        """Split a ``url|name|...`` descriptor into ``(name, absolute_url)``.

        Args:
            category_info: ``|``-separated string whose first field is the URL
                part and whose second field is the category name.

        Returns:
            tuple: ``(category_name, category_url)`` where the URL is expanded
            according to the shape of the first field.

        Raises:
            IndexError: if ``category_info`` contains no ``|`` separator.
        """
        data_list = category_info.split('|')
        category_name = data_list[1]
        category_url = data_list[0]

        if 'jd.com' in category_url:
            # Already a host/path on jd.com: only the scheme is missing.
            category_url = 'https://' + category_url
        elif category_url.count('-') == 1:
            # Exactly one dash ("a-b"): mapped to a channel page.
            category_url = 'https://channel.jd.com/' + category_url + '.html'
        else:
            # Anything else ("a-b-c"): mapped to a list page, with the ids
            # passed comma-separated in the ``cat`` query parameter.
            category_url = 'https://list.jd.com/list.html?cat=' + category_url.replace('-', ',')
        return category_name, category_url

1.2.3 保存商品分类数据

  • settings中配置MongoDB数据库连接
"settings.py"
# MongoDB connection URL (imported by the category pipeline)
MONGODB_URL = 'mongodb://127.0.0.1:27017'
  • pipelines.py中实现CategoryPipeline类
from spider_jd.spiders.category_spider import CategorySpider
from spider_jd.settings import MONGODB_URL
from pymongo import MongoClient


class CategoryPipeline:
    """Item pipeline that writes items produced by ``CategorySpider`` to MongoDB.

    For any other spider every hook is a no-op and items pass through
    unchanged.
    """

    def open_spider(self, spider):
        """Open the MongoDB client and select ``jd.category`` for CategorySpider runs."""
        if not isinstance(spider, CategorySpider):
            return
        self.client = MongoClient(MONGODB_URL)
        database = self.client['jd']
        self.collection = database['category']

    def process_item(self, item, spider):
        """Insert a dict copy of ``item`` for CategorySpider; always return ``item``."""
        if isinstance(spider, CategorySpider):
            document = dict(item)
            self.collection.insert_one(document)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client that ``open_spider`` created for CategorySpider."""
        if not isinstance(spider, CategorySpider):
            return
        self.client.close()
  • settings中开启CategoryPipeline
# Enable CategoryPipeline; 300 is the integer order value assigned to it
# (NOTE(review): Scrapy runs lower values first - confirm against the Scrapy docs).
ITEM_PIPELINES = {
   "spider_jd.pipelines.CategoryPipeline": 300,
}

1.2.4 运行商品分类爬虫

  • 命令行运行
scrapy crawl category_spider

1.3 爬取京东商品数据(Selenium)

由于京东商品数据是动态加载的,因此使用Selenium模拟浏览器操作(按翻页键、滚动页面)来抓取数据。
爬取手机商品数据代码如下:

def _find_optional(driver, parent, css_selector):
    """Return the element matching ``css_selector`` under ``parent``, or None.

    Existence is decided by ``is_element_exist`` (defined outside this block;
    presumably truthy when the selector matches under ``parent`` - verify
    against its definition), exactly as the inline checks did before.
    """
    if is_element_exist(driver, css_selector, parent):
        return parent.find_element(By.CSS_SELECTOR, css_selector)
    return None


def _collect_brand_links(driver):
    """Return ``(href, title)`` for every brand link inside the ``.v-fixed`` list."""
    ul_brand = driver.find_element(By.CSS_SELECTOR, '.v-fixed')
    brand_links = []
    for li_brand in ul_brand.find_elements(By.TAG_NAME, 'li'):
        brand = li_brand.find_element(By.TAG_NAME, 'a')
        brand_links.append((brand.get_attribute('href'), brand.get_attribute('title')))
    return brand_links


def _build_product(driver, li, category, phone_brand):
    """Build the product document for one ``.gl-item`` element.

    Every optional field falls back to ``''`` (or ``[]`` for list fields) and
    prints a short notice when its element is missing, so all documents share
    the same set of keys.
    """
    product = {
        'category_id': category['_id'],
        'b_category_name': category['b_category_name'],
        'm_category_name': category['m_category_name'],
        's_category_name': category['s_category_name'],
        'phone_brand': phone_brand
    }

    # SKU id and the product page URL derived from it
    skuid = li.get_attribute("data-sku")
    product['skuid'] = skuid
    product['product_url'] = f"https://item.jd.com/{skuid}.html"
    print('='*30, skuid)

    # Product image
    img = _find_optional(driver, li, '.gl-i-wrap > .p-img > a:nth-child(1) > img:nth-child(1)')
    if img is not None:
        product['img_url'] = img.get_attribute("src")
    else:
        product['img_url'] = ''
        print('!图片不存在')

    # Product variants (title and thumbnail of each)
    phone_type_ul = _find_optional(driver, li, '.gl-i-wrap > .p-scroll > .ps-wrap > .ps-main')
    if phone_type_ul is not None:
        phone_type_list = []
        phone_type_img_list = []
        for phone_type_li in phone_type_ul.find_elements(By.TAG_NAME, 'li'):
            phone_type_a = phone_type_li.find_element(By.CSS_SELECTOR, 'a')
            phone_type_list.append(phone_type_a.get_attribute("title"))
            phone_type_img = phone_type_a.find_element(By.CSS_SELECTOR, 'img')
            phone_type_img_list.append(phone_type_img.get_attribute("src"))
        # Assigned after the loop so both keys exist even when the variant
        # list is present but empty.
        product['phone_type'] = phone_type_list
        product['phone_type_img'] = phone_type_img_list
    else:
        product['phone_type'] = []
        product['phone_type_img'] = []
        print('!型号不存在')

    # Price
    phone_price_i = _find_optional(driver, li, '.gl-i-wrap > .p-price > strong > i')
    if phone_price_i is not None:
        product['phone_price'] = phone_price_i.text
    else:
        product['phone_price'] = ''
        print('!价格不存在')

    # Product name
    phone_name_em = _find_optional(driver, li, '.gl-i-wrap > div:nth-child(4) > a > em')
    if phone_name_em is not None:
        product['phone_name'] = phone_name_em.text
    else:
        product['phone_name'] = ''
        print('!名称不存在')

    # Comment count (the element id embeds the SKU id)
    phone_commit = _find_optional(driver, li, '#J_comment_' + str(skuid))
    if phone_commit is not None:
        product['phone_commit_num'] = phone_commit.text
    else:
        product['phone_commit_num'] = ''
        print('!评论数不存在')

    # Shop name and URL, read from a single lookup of the shop link
    phone_shop_a = _find_optional(driver, li, '.gl-i-wrap > .p-shop > .J_im_icon > a')
    if phone_shop_a is not None:
        product['phone_shop'] = phone_shop_a.text
        product['phone_shop_url'] = phone_shop_a.get_attribute("href")
    else:
        product['phone_shop'] = ''
        product['phone_shop_url'] = ''
        print('!店铺不存在')

    # Product tags: text of every <i> inside the icons container
    phone_shop_icon_div = _find_optional(driver, li, '.gl-i-wrap > .p-icons')
    if phone_shop_icon_div is not None:
        product['phone_shop_icon'] = [
            icon.text for icon in phone_shop_icon_div.find_elements(By.TAG_NAME, 'i')
        ]
    else:
        product['phone_shop_icon'] = []
        print('!店铺标签不存在')

    return product


def get_phone(driver, category, products):
    """Scrape the phone listings of one category, brand by brand, into ``products``.

    Args:
        driver: Selenium WebDriver used to load and read the pages.
        category: mapping providing ``_id``, ``b_category_name``,
            ``m_category_name``, ``s_category_name`` and ``s_category_url``.
        products: object exposing ``distinct(field)`` and ``insert_one(doc)``
            (presumably a pymongo collection - confirm with the caller).

    Brands whose name already appears in ``products`` are skipped. For every
    other brand the brand page is opened, paged down, and one document per
    ``.gl-item`` element is printed and inserted.
    """
    url = category['s_category_url']
    # A set makes the per-brand "already stored?" check O(1).
    brands = set(products.distinct("phone_brand"))
    driver.get(url)
    time.sleep(5)

    count = 0
    # Read all brand links before navigating: the loop below loads other
    # pages, after which elements of this page can no longer be used.
    brand_links = _collect_brand_links(driver)

    for url_brand, phone_brand in brand_links:
        if phone_brand in brands:
            continue

        driver.get(url_brand)
        time.sleep(3)

        count += 1
        # Press PAGE_DOWN six times, pausing 2 s after each press, so the
        # dynamically loaded part of the listing gets a chance to load.
        for _ in range(6):
            ActionChains(driver) \
                .key_down(Keys.PAGE_DOWN) \
                .key_up(Keys.PAGE_DOWN) \
                .perform()
            time.sleep(2)

        # Read the products listed on the page
        ul = driver.find_element(By.CSS_SELECTOR, '.gl-warp')
        li_list = ul.find_elements(By.CSS_SELECTOR, '.gl-item')
        for li in li_list:
            if count % 5 == 0:
                # On every fifth processed brand, scroll a further 250 px
                # before reading each product.
                height = 250
                script_text = 'window.scrollBy(0,' + str(height) + ');'
                print(script_text)
                driver.execute_script(script_text)

            product = _build_product(driver, li, category, phone_brand)

            print(product)
            products.insert_one(product)

2 基于FastAPI搭建可视化商城

效果如下:
在这里插入图片描述

3 基于Gradio搭建问答机器人

基于大语言模型和RAG技术实现商品推销小助手,通过自然语言为用户推荐商品。
效果如下:
在这里插入图片描述

4 欢迎加QQ技术交流,还可获取毕设源码与文章~

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值