JD.com Crawler Notes

Based on another blogger's JD.com crawler, adapted here to scrape JD book listings.

Two spiders: 1. scrape JD book information; 2. scrape JD review information.

Shared files: JingdongspiderItem, MySQLPipeline, settings

1. Scraping JD book information

# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import JingdongspiderItem
import scrapy
import re
import json
from scrapy import Request
import urllib.request

# Spider for JD book information
class JingdongSpider(scrapy.Spider):
    name = 'jingdong'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """JD entry point."""
        url = "https://list.jd.com/list.html?cat=1713,3259,3336&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main"
        # Kick off the crawl of the book list page.
        # Note on yield: the Request is handed to the Scrapy engine, which schedules it
        # asynchronously; when the response comes back, the engine calls parseMainPage with it.
        yield Request(url, callback=self.parseMainPage)


    def parseMainPage(self, response):
        # There are many book categories; the default parse() method yields the list-page URL
        # and calls back into parseMainPage, which crawls the individual products on that page.
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            item = JingdongspiderItem()
            url = url.xpath('@href').extract()
            all_url = response.urljoin(url[0])  # build the full absolute URL with urljoin()
            item['link'] = all_url  # product link
            for link in url:
                # Turn each (possibly relative) href into an absolute URL and issue a new Request.
                '''
                response.urljoin(url) builds an absolute URL: when the url argument is relative,
                it is resolved against response.url.
                Example:
                response.url = 'https://mp.csdn.net', url = 'mdeditor/85640067', then
                response.urljoin(url) == 'https://mp.csdn.net/mdeditor/85640067'
                The new URL can then be wrapped in a Request to crawl the next page.
                '''
                url = response.urljoin(link)
                # Build a new Request for the detail page:
                # url: the request link.
                # callback: called by the engine with the response once the request completes;
                #           here parseDetails parses the detail page and yields further requests.
                # meta: carries the partially filled item over to the callback.
                yield Request(url, meta={'meta': item}, callback=self.parseDetails)

        """
        通过递归原理解析下一页
        下一页网页xpath解析地址
        
        """
        next_page = response.xpath('//a[@class="pn-next"]')
        for page in next_page:
            pages = page.xpath('@href').extract()[0]
            page = response.urljoin(pages)
            print(">>>>>>>>>>>>>", page)
            #通过url
            #和callback变革-构造了一个新的请求,回调函数callback依然使用parseMainPage()
            #方法。这个请求完成后,响应会重新经过parseMainPage方法处理,得到第二页的解析结果,然后生成第二页
            #的下一页,也就是第三页的请求。 这样爬虫就进入了一个循环,直到最后一页。
            yield Request(page, callback=self.parseMainPage, dont_filter=True)

    def parseDetails(self, response):
        item = response.meta['meta']
        id = response.xpath('//a/@data-sku').extract()[0]  # product id (SKU)
        item['project_id'] = id
        print(">>>>>>", id)

        shop_name = response.xpath('//div[@class="name"]/a/text()').extract()[0]  # shop name
        print(">>>>>>", shop_name)
        item['shop_name'] = shop_name
        item['name'] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].strip()  # product name
        """
        Price API: fetch the product price by SKU id.
        """
        price_url = "https://p.3.cn/prices/mgets?callback=jQuery9274777&skuIds=" + str(id)
        price = requests.get(price_url).text
        money = re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
        item['price'] = money[0]
        print(money)

        """
        Review-summary API: fetch the review counts for this product.
        """
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        print("review summary url", comment_num)
        yield scrapy.Request(comment_num, meta={'item': item}, callback=self.parse_getCommentnum)

        """
        Pull the displayed review count with a regex (synchronous requests/urllib calls,
        kept only for debugging; the result is not stored in the item).
        """
        comment_nums = requests.get(comment_num).text
        nums = re.findall(r'\"ShowCountStr\"\:\"(.*?)\"', comment_nums)
        print(">>>>>>>", nums)
        page = urllib.request.urlopen(comment_num)
        data = page.read()
        print(data)

    def parse_getCommentnum(self, response):
        item = response.meta['item']  # the item passed along via meta
        # response.text is a JSON document
        data = json.loads(response.text)
        print(data)
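        # Rough shape of the summary response, inferred from the fields read below
        # (an assumption; the real payload carries more keys than shown here):
        # {"CommentsCount": [{"CommentCountStr": "...", "GoodCountStr": "...",
        #                     "AfterCount": 0, "PoorCount": 0, "ShowCount": 0, ...}]}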
        item['comment_num'] = data['CommentsCount'][0]['CommentCountStr']   # total review count
        item['AfterCount'] = data['CommentsCount'][0]['AfterCount']         # follow-up reviews
        item['GoodCountStr'] = data['CommentsCount'][0]['GoodCountStr']     # positive reviews
        item['PoorCount'] = data['CommentsCount'][0]['PoorCount']           # negative reviews


        # hand each populated item to the item pipelines
        yield item
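
The price interface above returns JSONP: the JSON array is wrapped in the jQuery callback named in the URL. As an alternative to the regex, the wrapper can be stripped and the body parsed with json. This is a minimal sketch, not part of the original spider; fetch_price is a hypothetical helper, and only the "p" field is confirmed by the regex used in parseDetails.

import json
import requests

def fetch_price(sku_id):
    """Hypothetical helper: return the price string for one SKU from p.3.cn."""
    url = "https://p.3.cn/prices/mgets?callback=jQuery9274777&skuIds=" + str(sku_id)
    text = requests.get(url).text                       # e.g. 'jQuery9274777([{"p":"55.50", ...}])'
    body = text[text.index('(') + 1:text.rindex(')')]   # strip the JSONP callback wrapper
    data = json.loads(body)                             # a list with one dict per requested SKU
    return data[0]['p']                                 # "p" holds the price the spider's regex extracts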



 2. Scraping JD review information

# -*- coding: utf-8 -*-
import requests

from jingdongspider.items import commentItem
import json
import xlrd
import scrapy
from scrapy import Request

# Spider for JD book review information
class JingdongCommentSpider(scrapy.Spider):
    name = 'comment'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']


    def parse(self, response):
        """京东"""
        url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
        yield Request(url, callback=self.parseMainPage)


    def parseMainPage(self, response):
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            url = url.xpath('@href').extract()
            for link in url:
                url = response.urljoin(link)
                yield Request(url, callback=self.parseDetails)


    def parseDetails(self, response):
        id = response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0]  # product id (SKU)

        """
        Review-summary API: find out how many reviews the product has.
        """
        # url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) +'&score=0&sortType=5&page=0&pageSize=10'
        # yield scrapy.Request(url, callback=self.parse_getCommentnum)
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        com = requests.get(comment_num).text
        data = json.loads(com)
        comment_nums = data['CommentsCount'][0]['ShowCount']
        print(comment_nums)
        comment_total = int(comment_nums)
        if comment_total % 10 == 0:  # work out the number of review pages, 10 reviews per page
            page = comment_total // 10
        else:
            page = comment_total // 10 + 1

        for k in range(page):
            '''
            Paged review API: page k of this product's reviews.
            '''
            com_url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) + '&score=0&sortType=5&page=' + str(k) + '&pageSize=10'
            # print(">>>>>>>>>>", com_url)
            yield scrapy.Request(com_url, callback=self.parse_getCommentnum)
            # yield scrapy.Request(com_url, callback=self.parseDetails)


    def parse_getCommentnum(self, response):
        js = json.loads(response.text)
        # print(js)
        comments = js['comments']  # all reviews on this page

        items = []
        for comment in comments:
            item1 = commentItem()
            item1['user_name'] = comment['nickname']          # reviewer's nickname
            item1['user_id'] = comment['id']                  # user id
            item1['userProvince'] = comment['userProvince']   # reviewer's province
            item1['content'] = comment['content']             # review text
            item1['good_id'] = comment['referenceId']         # id of the reviewed product
            item1['good_name'] = comment['referenceName']     # name of the reviewed product
            item1['date'] = comment['referenceTime']          # review time
            item1['replyCount'] = comment['replyCount']       # number of replies
            item1['score'] = comment['score']                 # star rating
            item1['status'] = comment['status']               # status
            item1['userLevelId'] = comment['userLevelId']     # user level id
            item1['productColor'] = comment['productColor']   # product colour
            item1['productSize'] = comment['productSize']     # product size
            item1['userLevelName'] = comment['userLevelName'] # membership level, e.g. silver or diamond
            item1['isMobile'] = comment['isMobile']           # whether posted from a mobile device
            item1['userClientShow'] = comment['userClientShow']  # client the review came from, e.g. the JD app
            item1['days'] = comment['days']                   # days
            items.append(item1)
        return items
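
The page count computed in parseDetails is just a ceiling division of the total review count by the page size of 10. A minimal equivalent sketch (the helper name is illustrative, not from the original code):

import math

def review_pages(total_reviews, page_size=10):
    """Number of review pages to request, e.g. 57 reviews -> 6 pages, 60 reviews -> 6 pages."""
    return math.ceil(total_reviews / page_size)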

 --------------------------------------------------------------------------------------------------------------------------------------------------

                                                                        JingdongspiderItem

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# Item container file: items.py defines the fields the spiders store
class JingdongspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()  # product link
    project_id = scrapy.Field()  # product id
    name = scrapy.Field()  # product name
    comment_num = scrapy.Field()  # total review count
    shop_name = scrapy.Field()  # shop name
    price = scrapy.Field()  # price
    GoodCountStr = scrapy.Field()  # positive reviews
    AfterCount = scrapy.Field()  # follow-up reviews
    PoorCount = scrapy.Field()  # negative reviews


class commentItem(scrapy.Item):
    user_name = scrapy.Field()   # reviewer's nickname
    user_id = scrapy.Field()  # reviewer's id
    userProvince = scrapy.Field()  # reviewer's province
    content = scrapy.Field()  # review text
    good_id = scrapy.Field()  # id of the reviewed product
    good_name = scrapy.Field()  # name of the reviewed product
    date = scrapy.Field()   # review time
    replyCount = scrapy.Field()   # number of replies
    score = scrapy.Field()  # star rating
    status = scrapy.Field()  # status
    userLevelId = scrapy.Field()  # user level id
    productColor = scrapy.Field()  # product colour
    productSize = scrapy.Field()  # product size
    userLevelName = scrapy.Field()   # membership level, e.g. silver or diamond
    userClientShow = scrapy.Field()   # client the review came from, e.g. the JD app
    isMobile = scrapy.Field()  # whether posted from a mobile device
    days = scrapy.Field()  # days
    # commentTags = scrapy.Field()   # tags
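
Scrapy items behave like dicts with a fixed set of allowed keys, which is how the spiders above fill them field by field before yielding them to the pipelines. A minimal usage sketch (the values are made up):

from jingdongspider.items import JingdongspiderItem

item = JingdongspiderItem()
item['project_id'] = '12345678'   # assigning to a field that was not declared raises KeyError
item['name'] = 'Some book title'
print(dict(item))                 # an item converts to a plain dict for logging or export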

                                                                                        MySQLPipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
pymysql.install_as_MySQLdb()  # let PyMySQL stand in for MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()

# Pipeline file (pipelines.py): stores the items returned by the spiders
class MySQLPipeline(object):

    # A class method, marked with @classmethod; a form of dependency injection.
    # Scrapy calls it with the project settings, so the pipeline can read the
    # MYSQL_* configuration values from settings.py.
    @classmethod
    def from_settings(cls, settings):
        '''1. @classmethod declares a class method (the usual kind are instance methods).
           2. Its first parameter is cls (the class itself), whereas an instance method's first parameter is self (an instance of the class).
           3. It can be called on the class itself, like C.f(); comparable to a static method in Java.'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read from settings.py
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments, i.e. host=..., db=..., ...
        return cls(dbpool)  # pass the pool to the class; it ends up on self.dbpool

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Called by Scrapy for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert in a pool thread
        query.addErrback(self._handle_error, item, spider)  # attach the error handler
        return item

    # Insert the item into the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jingdong(project_id,name,comment_num,shop_name,link,GoodCountStr,AfterCount,PoorCount,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"

        params = (
        item["project_id"], item["name"], item["comment_num"], item["shop_name"], item["link"], item["GoodCountStr"],
        item["AfterCount"], item["PoorCount"], item["price"])
        tx.execute(sql, params)

    # Error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)

#-----------------------------------------------------------------------------------------------------------
class CommentPipeline(object):

    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read from settings.py
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments
        return cls(dbpool)  # pass the pool to the class; it ends up on self.dbpool

    def __init__(self, dbpool):
        self.dbpool = dbpool


    # Called by Scrapy for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert in a pool thread
        query.addErrback(self._handle_error, item, spider)  # attach the error handler
        return item


    # Insert the item into the database
    def _conditional_insert(self, tx, item):

        sql = "insert into jd_comment(user_name,user_id,userProvince,content,good_id,good_name,date,replyCount,score,status,userLevelId,productColor,productSize,userLevelName,userClientShow,isMobile,days) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

        params = (item["user_name"], item["user_id"], item["userProvince"], item["content"], item["good_id"],
                  item["good_name"], item["date"], item["replyCount"], item["score"], item["status"],
                  item["userLevelId"], item["productColor"], item["productSize"], item["userLevelName"],
                  item["userClientShow"],
                  item["isMobile"], item["days"])
        tx.execute(sql, params)

    # Error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)
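
Note that settings.py enables both pipelines, so every item from either spider passes through both process_item methods; a commentItem reaching MySQLPipeline only fails inside _conditional_insert and is swallowed by the errback. A minimal sketch, not part of the original code, of an isinstance guard that lets each pipeline skip items it does not own (CommentPipeline would check commentItem symmetrically):

from jingdongspider.items import JingdongspiderItem, commentItem

class MySQLPipeline(object):
    # from_settings / __init__ unchanged from the version above

    def process_item(self, item, spider):
        if not isinstance(item, JingdongspiderItem):
            return item  # not this pipeline's item type: pass it on untouched
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item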

                                                                                                settings

# -*- coding: utf-8 -*-

# Scrapy settings for jingdongspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jingdongspider'

SPIDER_MODULES = ['jingdongspider.spiders']
NEWSPIDER_MODULE = 'jingdongspider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jingdongspider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jingdongspider.middlewares.JingdongspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jingdongspider.middlewares.JingdongspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jingdongspider.pipelines.CommentPipeline': 300,
    'jingdongspider.pipelines.MySQLPipeline': 350,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# LOG_LEVEL = 'INFO'

# ======================================================================
# MySQL database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'jingdong'       # database name, change as needed
MYSQL_USER = 'root'             # database user, change as needed
MYSQL_PASSWD = 'shujuelin321'   # database password, change as needed

MYSQL_PORT = 3306               # database port, used in dbhelper
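
The pipelines above assume two MySQL tables, jingdong and jd_comment, whose columns match the INSERT statements. A minimal sketch that creates the jingdong table with PyMySQL; the column types are assumptions (plain VARCHARs), not taken from the original project, and jd_comment can be created the same way from its column list:

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS jingdong (
    project_id VARCHAR(32), name VARCHAR(255), comment_num VARCHAR(32),
    shop_name VARCHAR(255), link VARCHAR(512), GoodCountStr VARCHAR(32),
    AfterCount VARCHAR(32), PoorCount VARCHAR(32), price VARCHAR(32)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='shujuelin321', db='jingdong', charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)       # create the table the MySQLPipeline inserts into
conn.commit()
conn.close()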

 
