Python—Scrapy爬取京东商城
1.创建项目
scrapy startproject jd
效果:
2.生成一个爬虫
scrapy genspider jd_category jd.com
效果:
3.在items.py文件中定义要提取的字段
import scrapy
class JdItem(scrapy.Item):
    """Product information scraped from a JD.com listing page."""
    title = scrapy.Field()   # product title
    price = scrapy.Field()   # price
    sku_id = scrapy.Field()  # product (SKU) id
    url = scrapy.Field()     # product detail-page link
    info = scrapy.Field()    # comments: list of CommentItem, filled in comment_parse
class CommentItem(scrapy.Item):
    """One user comment on a product.

    Captures: comment time, score, reply count, vote count,
    image count and the comment text itself.
    """
    content = scrapy.Field()       # comment text
    comment_time = scrapy.Field()  # when the comment was posted
    reply_count = scrapy.Field()   # number of replies
    score = scrapy.Field()         # star rating
    vote_count = scrapy.Field()    # number of "useful" votes
    image_count = scrapy.Field()   # number of attached images
4.jd_category.py中的内容:对商品列表页进行了爬取,并跟进详情页对评论信息进行了爬取
import html
import json
import re
import scrapy
from ..items import JdItem, CommentItem
class JdSpider(scrapy.Spider):
    """Crawl JD.com category listing pages, follow each product to its
    detail page, and fetch the product's comments from the comment API.
    """
    # Must match the name used on the command line in step 6
    # (`scrapy crawl jd_category`); the original 'jd_goods' would make
    # that command fail with "Spider not found".
    name = 'jd_category'
    # NOTE: restricting to 'www.jd.com' would block list.jd.com /
    # club.jd.com; the bare registered domain covers all subdomains.
    allowed_domains = ['jd.com']

    # Listing URL: https://list.jd.com/list.html?cat=9987,653,655
    # JD listing pages use odd page numbers (page advances by 2) and an
    # `s` result-offset that advances by 60 per listing page.
    page = 1
    s = 1
    url = 'https://list.jd.com/list.html?cat=9987%2C653%2C655&page=1&s=1&click=0'
    next_url = 'https://list.jd.com/list.html?cat=9987%2C653%2C655&page={}&s={}&click=0'

    # JSONP comment endpoint; `page=0` is the first comment page — bump the
    # page parameter (or loop) to crawl more comment pages per product.
    comment_url = ('https://club.jd.com/comment/productPageComments.action'
                   '?callback=fetchJSON_comment98&productId={}'
                   '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1')

    def start_requests(self):
        """Seed the crawl with the first listing page."""
        yield scrapy.Request(self.url)

    def parse(self, response):
        """Parse the first 30 products of a listing page (these are rendered
        server-side in the HTML) and schedule the next listing page.

        :param response: listing-page response
        :return: yields detail-page Requests and the next listing Request
        """
        for li in response.xpath('//*[@id="J_goodsList"]/ul/li'):
            item = JdItem()
            item['title'] = li.xpath('div/div/a/em/text()').extract_first("")
            item['price'] = li.xpath('div/div/strong/i/text()').extract_first("")
            item['sku_id'] = li.xpath('./@data-sku').extract_first("")
            # Initialize unconditionally so every item has the key even
            # before comment_parse fills it (the original only set it on
            # the protocol-relative branch below).
            item['info'] = None
            # Detail-page link to follow; listing pages emit
            # protocol-relative URLs ("//item.jd.com/...").
            detail_url = li.xpath('./div/div[@class="p-img"]/a/@href').extract_first("")
            if not detail_url.startswith("https:"):
                detail_url = "https:" + detail_url
            item['url'] = detail_url
            yield scrapy.Request(detail_url, callback=self.info_parse,
                                 meta={"item": item})
        if self.page <= 10:
            self.page += 2
            self.s += 60
            yield scrapy.Request(url=self.next_url.format(self.page, self.s),
                                 callback=self.parse)

    def info_parse(self, response):
        """On a product detail page, request the product's comment feed.

        :param response: detail-page response carrying the item in meta
        :return: yields the comment-API Request
        """
        item = response.meta['item']
        yield scrapy.Request(self.comment_url.format(item.get('sku_id')),
                             callback=self.comment_parse, meta={"item": item})

    def comment_parse(self, response):
        """Extract comments and yield the finished product item.

        The endpoint returns JSONP — `fetchJSON_comment98({...});` — so we
        strip the callback wrapper and parse the body with `json.loads`
        instead of scraping structured JSON with a fragile regex.

        :param response: comment-API response carrying the item in meta
        :return: yields the JdItem with `info` set to a list of CommentItem
        """
        match = re.search(r'fetchJSON_comment98\((.*)\)\s*;?\s*$',
                          response.text, re.S)
        info = []
        if match:
            try:
                data = json.loads(match.group(1))
            except json.JSONDecodeError:
                data = {}
            for comment in data.get('comments', []):
                comment_item = CommentItem()
                comment_item['content'] = comment.get('content', '')
                comment_item['comment_time'] = comment.get('creationTime', '')
                comment_item['reply_count'] = comment.get('replyCount', 0)
                comment_item['score'] = comment.get('score', 0)
                comment_item['vote_count'] = comment.get('usefulVoteCount', 0)
                comment_item['image_count'] = comment.get('imageCount', 0)
                info.append(comment_item)
        item = response.meta['item']
        item['info'] = info
        yield item
5.只在pipelines.py中进行了简单的打印
6.执行: python -m scrapy crawl jd_category
效果: