A JD.com crawler originally written by another blogger, which I adapted to crawl JD book listings.
It contains two spiders: 1. a spider for JD book information, and 2. a spider for JD review information.
Shared files: JingdongspiderItem, MySQLPipeline, settings.
1. Spider for JD book information
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import JingdongspiderItem
import scrapy
import re
import json
from scrapy import Request
import urllib.request

# Spider for JD book information
class JingdongSpider(scrapy.Spider):
    name = 'jingdong'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """JD.com"""
        url = "https://list.jd.com/list.html?cat=1713,3259,3336&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main"
        # Kick off the actual crawl from the book category listing.
        # yield hands the Request to the Scrapy engine; the engine schedules and
        # downloads it, then calls parseMainPage with the response, while this
        # generator simply continues (it does not block waiting for the result).
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        # The category listing contains many books. parse hands the listing URL to
        # parseMainPage, which extracts every product link on the page and follows
        # each one with parseDetails.
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            item = JingdongspiderItem()
            url = url.xpath('@href').extract()
            all_url = response.urljoin(url[0])  # build an absolute URL with urljoin()
            item['link'] = all_url  # product link
            for link in url:
                # Build the absolute URL of the detail page and construct a new Request for it.
                '''
                urljoin(url) builds an absolute URL: when the argument is a relative
                address, it is resolved against response.url.
                Example:
                if response.url = 'https://mp.csdn.net' and url = 'mdeditor/85640067', then
                response.urljoin(url) returns 'https://mp.csdn.net/mdeditor/85640067'.
                The resulting URL can then be wrapped in a new Request to crawl the next page.
                '''
                url = response.urljoin(link)
                # Construct the Request and pass the item along via meta.
                # url: the request link.
                # callback: called with the response once the request completes;
                #           here the detail page is parsed by parseDetails.
                yield Request(url, meta={'meta': item}, callback=self.parseDetails)

        """
        Follow the next page recursively.
        XPath of the "next page" link:
        """
        next_page = response.xpath('//a[@class="pn-next"]')
        for page in next_page:
            pages = page.xpath('@href').extract()[0]
            page = response.urljoin(pages)
            print(">>>>>>>>>>>>>", page)
            # Build a new request from the next-page URL and keep parseMainPage()
            # as the callback. When it completes, the second page is parsed the
            # same way, which in turn yields the request for the third page, and
            # so on until the last page is reached.
            yield Request(page, callback=self.parseMainPage, dont_filter=True)

    def parseDetails(self, response):
        item = response.meta['meta']
        id = response.xpath('//a/@data-sku').extract()[0]  # product id (SKU)
        item['project_id'] = id
        print(">>>>>>", id)
        shop_name = response.xpath('//div[@class="name"]/a/text()').extract()[0]  # shop name
        print(">>>>>>", shop_name)
        item['shop_name'] = shop_name
        item['name'] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].strip()  # product name
        """
        Price API: the price is not in the page HTML but behind a JSONP endpoint
        keyed by the SKU id. (A more robust way to parse this JSONP response is
        sketched after this listing.)
        """
        price_url = "https://p.3.cn/prices/mgets?callback=jQuery9274777&skuIds=" + str(id)
        price = requests.get(price_url).text
        money = re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
        item['price'] = money[0]
        print(money)
        """
        Comment-summary API: number of reviews for this product.
        """
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        print("comment summary url:", comment_num)
        yield scrapy.Request(comment_num, meta={'item': item}, callback=self.parse_getCommentnum)
        # The lines below fetch the same summary URL again with requests and urllib
        # just to print it; this is redundant debug output, since the Scrapy request
        # above already retrieves this JSON.
        comment_nums = requests.get(comment_num).text
        nums = re.findall(r'\"ShowCountStr\"\:\"(.*?)\"', comment_nums)
        print(">>>>>>>", nums)
        page = urllib.request.urlopen(comment_num)
        data = page.read()
        print(data)

    def parse_getCommentnum(self, response):
        item = response.meta['item']  # the partially filled item passed along in meta
        # response.text is JSON
        data = json.loads(response.text)
        print(data)
        item['comment_num'] = data['CommentsCount'][0]['CommentCountStr']  # total number of reviews
        item['AfterCount'] = data['CommentsCount'][0]['AfterCount']        # follow-up reviews
        item['GoodCountStr'] = data['CommentsCount'][0]['GoodCountStr']    # positive reviews
        item['PoorCount'] = data['CommentsCount'][0]['PoorCount']          # negative reviews
        # Hand the finished item to the pipelines.
        yield item
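The regex used for the price above is brittle: it depends on the exact shape of the JSONP wrapper. A minimal sketch of a sturdier approach, assuming the p.3.cn endpoint still returns a response of the form jQuery9274777([{"id":"J_...","p":"59.00",...}]) (the SKU id in the usage line is made up):

import json
import requests

def fetch_price(sku_id, callback="jQuery9274777"):
    """Sketch: strip the JSONP callback wrapper and parse the body as JSON."""
    url = "https://p.3.cn/prices/mgets?callback={}&skuIds={}".format(callback, sku_id)
    text = requests.get(url, timeout=10).text.strip()
    # keep only what is between the first "(" and the last ")"
    body = text[text.index("(") + 1 : text.rindex(")")]
    data = json.loads(body)      # a list with one dict per SKU
    return data[0].get("p")      # "p" holds the displayed price string

# usage (hypothetical SKU id):
# print(fetch_price("11346954"))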
2. Spider for JD review information
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import commentItem
import json
import xlrd
import scrapy
from scrapy import Request

# Spider for JD book review information
class JingdongCommentSpider(scrapy.Spider):
    name = 'comment'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """JD.com"""
        url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            url = url.xpath('@href').extract()
            for link in url:
                url = response.urljoin(link)
                yield Request(url, callback=self.parseDetails)

    def parseDetails(self, response):
        id = response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0]  # product id (SKU)
        """
        Review APIs for a product.
        """
        # url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) + '&score=0&sortType=5&page=0&pageSize=10'
        # yield scrapy.Request(url, callback=self.parse_getCommentnum)
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        com = requests.get(comment_num).text
        data = json.loads(com)
        comment_nums = data['CommentsCount'][0]['ShowCount']
        print(comment_nums)
        comment_total = int(comment_nums)
        # Work out the number of review pages: 10 reviews per page.
        if comment_total % 10 == 0:
            page = comment_total // 10
        else:
            page = comment_total // 10 + 1
        for k in range(page):
            '''
            Paged review endpoint (one request per page of 10 reviews).
            '''
            com_url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) + '&score=0&sortType=5&page=' + str(k) + '&pageSize=10'
            # print(">>>>>>>>>>", com_url)
            yield scrapy.Request(com_url, callback=self.parse_getCommentnum)
            # yield scrapy.Request(com_url, callback=self.parseDetails)

    def parse_getCommentnum(self, response):
        js = json.loads(response.text)
        # print(js)
        comments = js['comments']  # all reviews on this page
        items = []
        for comment in comments:
            item1 = commentItem()
            item1['user_name'] = comment['nickname']             # user name
            item1['user_id'] = comment['id']                     # user id
            item1['userProvince'] = comment['userProvince']      # province the reviewer is from
            item1['content'] = comment['content']                # review text
            item1['good_id'] = comment['referenceId']            # id of the reviewed product
            item1['good_name'] = comment['referenceName']        # name of the reviewed product
            item1['date'] = comment['referenceTime']             # review time
            item1['replyCount'] = comment['replyCount']          # number of replies
            item1['score'] = comment['score']                    # rating
            item1['status'] = comment['status']                  # status
            item1['userLevelId'] = comment['userLevelId']        # user level id
            # productColor/productSize can be absent for books; a defensive
            # variant using .get() is sketched after this listing.
            item1['productColor'] = comment['productColor']      # product colour
            item1['productSize'] = comment['productSize']        # product size
            item1['userLevelName'] = comment['userLevelName']    # membership level, e.g. silver or diamond member
            item1['isMobile'] = comment['isMobile']              # whether posted from a mobile device
            item1['userClientShow'] = comment['userClientShow']  # client the review was posted from, e.g. the JD app
            item1['days'] = comment['days']                      # number of days
            items.append(item1)
        return items
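For book reviews, fields such as productColor, productSize, or userClientShow are frequently missing from the comment dicts, and an access like comment['productColor'] then raises a KeyError that drops the whole page. A minimal defensive sketch of the extraction loop using dict.get() with defaults, assuming the same commentItem fields as above:

from jingdongspider.items import commentItem

def build_comment_item(comment):
    """Sketch: fill a commentItem with .get() so missing keys fall back to a default."""
    item1 = commentItem()
    item1['user_name'] = comment.get('nickname', '')
    item1['user_id'] = comment.get('id', '')
    item1['userProvince'] = comment.get('userProvince', '')
    item1['content'] = comment.get('content', '')
    item1['good_id'] = comment.get('referenceId', '')
    item1['good_name'] = comment.get('referenceName', '')
    item1['date'] = comment.get('referenceTime', '')
    item1['replyCount'] = comment.get('replyCount', 0)
    item1['score'] = comment.get('score', 0)
    item1['status'] = comment.get('status', 0)
    item1['userLevelId'] = comment.get('userLevelId', '')
    item1['productColor'] = comment.get('productColor', '')   # often absent for books
    item1['productSize'] = comment.get('productSize', '')     # often absent for books
    item1['userLevelName'] = comment.get('userLevelName', '')
    item1['isMobile'] = comment.get('isMobile', False)
    item1['userClientShow'] = comment.get('userClientShow', '')
    item1['days'] = comment.get('days', 0)
    return item1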
--------------------------------------------------------------------------------------------------------------------------------------------------
JingdongspiderItem
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# items.py is the data container of the crawler: it defines the fields an item can hold.
class JingdongspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()           # product link
    project_id = scrapy.Field()     # product id
    name = scrapy.Field()           # product name
    comment_num = scrapy.Field()    # total number of reviews
    shop_name = scrapy.Field()      # shop name
    price = scrapy.Field()          # price
    GoodCountStr = scrapy.Field()   # positive reviews
    AfterCount = scrapy.Field()     # follow-up reviews
    PoorCount = scrapy.Field()      # negative reviews


class commentItem(scrapy.Item):
    user_name = scrapy.Field()      # reviewer's name
    user_id = scrapy.Field()        # reviewer's id
    userProvince = scrapy.Field()   # province the reviewer is from
    content = scrapy.Field()        # review text
    good_id = scrapy.Field()        # id of the reviewed product
    good_name = scrapy.Field()      # name of the reviewed product
    date = scrapy.Field()           # review time
    replyCount = scrapy.Field()     # number of replies
    score = scrapy.Field()          # rating
    status = scrapy.Field()         # status
    userLevelId = scrapy.Field()    # user level id
    productColor = scrapy.Field()   # product colour
    productSize = scrapy.Field()    # product size
    userLevelName = scrapy.Field()  # membership level, e.g. silver or diamond member
    userClientShow = scrapy.Field() # client the review was posted from, e.g. the JD client
    isMobile = scrapy.Field()       # whether posted from a mobile device
    days = scrapy.Field()           # number of days
    # commentTags = scrapy.Field()  # tags
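Scrapy Items behave like dicts with a fixed set of allowed keys, which is exactly how the spiders above fill them and how the pipelines read them. A quick sketch (the values are made up for illustration):

from jingdongspider.items import JingdongspiderItem

item = JingdongspiderItem()
item['name'] = 'Example book title'   # hypothetical values
item['price'] = '59.00'
item['comment_num'] = '2000+'

print(item['price'])   # field access works like a dict
print(dict(item))      # pipelines and exporters can treat it as a plain dict
# item['foo'] = 1      # would raise KeyError: 'foo' is not a declared Field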
MySQLPipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
pymysql.install_as_MySQLdb()  # let pymysql act as a drop-in replacement for MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()

# Pipeline file: persists the items returned by the spiders.
class MySQLPipeline(object):
    # from_settings is a factory classmethod (a form of dependency injection):
    # Scrapy calls it with the project settings so the pipeline can read the
    # database configuration from settings.py.
    @classmethod
    def from_settings(cls, settings):
        '''1. @classmethod declares a class method (as opposed to the usual instance method).
        2. Its first parameter is cls (the class itself), whereas an instance method's first parameter is self, an instance of the class.
        3. It can be called on the class itself, like C.f(), similar to a static method in Java.'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read the configuration from settings
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments, i.e. host=xxx, db=yyy, ...
        return cls(dbpool)  # pass the connection pool to the constructor so it is available on self

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # called by Scrapy for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert
        query.addErrback(self._handle_error, item, spider)  # handle any database error
        return item

    # write the item to the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jingdong(project_id,name,comment_num,shop_name,link,GoodCountStr,AfterCount,PoorCount,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (
            item["project_id"], item["name"], item["comment_num"], item["shop_name"], item["link"], item["GoodCountStr"],
            item["AfterCount"], item["PoorCount"], item["price"])
        tx.execute(sql, params)

    # error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)


# -----------------------------------------------------------------------------------------------------------
class CommentPipeline(object):
    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read the configuration from settings
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # called by Scrapy for every item; note that items from both spiders pass
    # through every enabled pipeline (see the routing sketch after this listing)
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert
        query.addErrback(self._handle_error, item, spider)  # handle any database error
        return item

    # write the item to the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jd_comment(user_name,user_id,userProvince,content,good_id,good_name,date,replyCount,score,status,userLevelId,productColor,productSize,userLevelName,userClientShow,isMobile,days) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (item["user_name"], item["user_id"], item["userProvince"], item["content"], item["good_id"],
                  item["good_name"], item["date"], item["replyCount"], item["score"], item["status"],
                  item["userLevelId"], item["productColor"], item["productSize"], item["userLevelName"],
                  item["userClientShow"],
                  item["isMobile"], item["days"])
        tx.execute(sql, params)

    # error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)
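Because ITEM_PIPELINES (in settings below) enables both pipelines project-wide, every item from either spider runs through both process_item methods, and the mismatched pipeline fails with a KeyError that only the errback catches. A minimal sketch of guarding each pipeline by item type, assuming the two item classes defined above (only process_item is shown; the rest of each class stays as in the listing):

from jingdongspider.items import JingdongspiderItem, commentItem

class MySQLPipeline(object):
    # ... same from_settings / __init__ as above ...
    def process_item(self, item, spider):
        if not isinstance(item, JingdongspiderItem):
            return item  # not ours: pass it through untouched
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

class CommentPipeline(object):
    # ... same from_settings / __init__ as above ...
    def process_item(self, item, spider):
        if not isinstance(item, commentItem):
            return item  # only handle review items
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item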
settings
# -*- coding: utf-8 -*-
# Scrapy settings for jingdongspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jingdongspider'
SPIDER_MODULES = ['jingdongspider.spiders']
NEWSPIDER_MODULE = 'jingdongspider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jingdongspider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jingdongspider.middlewares.JingdongspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'jingdongspider.middlewares.JingdongspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'jingdongspider.pipelines.CommentPipeline': 300,
'jingdongspider.pipelines.MySQLPipeline': 350,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# LOG_LEVEL = 'INFO'
# ======================================================================
# MySQL database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'jingdong'      # database name, change to your own
MYSQL_USER = 'root'            # database user, change to your own
MYSQL_PASSWD = 'shujuelin321'  # database password, change to your own
MYSQL_PORT = 3306              # database port, used in dbhelper
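The INSERT statements in the pipelines assume that a jingdong table and a jd_comment table already exist in the database configured above; the post does not give their schema. A minimal sketch that creates both tables, mirroring the MYSQL_* settings, with every column as TEXT (the column types are an assumption, tighten them as needed):

import pymysql

# connection parameters mirror the MYSQL_* settings above
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='shujuelin321', database='jingdong', charset='utf8')

JINGDONG_COLS = ['project_id', 'name', 'comment_num', 'shop_name', 'link',
                 'GoodCountStr', 'AfterCount', 'PoorCount', 'price']
JD_COMMENT_COLS = ['user_name', 'user_id', 'userProvince', 'content', 'good_id',
                   'good_name', 'date', 'replyCount', 'score', 'status',
                   'userLevelId', 'productColor', 'productSize', 'userLevelName',
                   'userClientShow', 'isMobile', 'days']

with conn.cursor() as cur:
    # every column is created as TEXT for simplicity (assumed types, adjust as needed)
    cur.execute("CREATE TABLE IF NOT EXISTS jingdong ({})".format(
        ", ".join("`{}` TEXT".format(c) for c in JINGDONG_COLS)))
    cur.execute("CREATE TABLE IF NOT EXISTS jd_comment ({})".format(
        ", ".join("`{}` TEXT".format(c) for c in JD_COMMENT_COLS)))
conn.commit()
conn.close()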