A JD.com crawler originally written by another blogger, which I adapted to crawl JD book listings.
It contains two spiders: 1. a spider for JD book information, and 2. a spider for JD review information.
Shared files: JingdongspiderItem, MySQLPipeline, settings.
1. Spider for JD book information
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import JingdongspiderItem
import scrapy
import re
import json
from scrapy import Request
import urllib.request

# Spider for JD book information
class JingdongSpider(scrapy.Spider):
    name = 'jingdong'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """JD.com"""
        url = "https://list.jd.com/list.html?cat=1713,3259,3336&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main"
        # Kick off the actual crawl from the book category listing.
        # yield hands the Request to the Scrapy engine; the engine schedules and
        # downloads it, then calls parseMainPage with the response, while this
        # generator simply continues (it does not block waiting for the result).
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        # The category listing contains many books. parse hands the listing URL to
        # parseMainPage, which extracts every product link on the page and follows
        # each one with parseDetails.
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            item = JingdongspiderItem()
            url = url.xpath('@href').extract()
            all_url = response.urljoin(url[0])  # build an absolute URL with urljoin()
            item['link'] = all_url  # product link
            for link in url:
                # Build the absolute URL of the detail page and construct a new Request for it.
                '''
                urljoin(url) builds an absolute URL: when the argument is a relative
                address, it is resolved against response.url.
                Example:
                if response.url = 'https://mp.csdn.net' and url = 'mdeditor/85640067', then
                response.urljoin(url) returns 'https://mp.csdn.net/mdeditor/85640067'.
                The resulting URL can then be wrapped in a new Request to crawl the next page.
                '''
                url = response.urljoin(link)
                # Construct the Request and pass the item along via meta.
                # url: the request link.
                # callback: called with the response once the request completes;
                #           here the detail page is parsed by parseDetails.
                yield Request(url, meta={'meta': item}, callback=self.parseDetails)

        """
        Follow the next page recursively.
        XPath of the "next page" link:
        """
        next_page = response.xpath('//a[@class="pn-next"]')
        for page in next_page:
            pages = page.xpath('@href').extract()[0]
            page = response.urljoin(pages)
            print(">>>>>>>>>>>>>", page)
            # Build a new request from the next-page URL and keep parseMainPage()
            # as the callback. When it completes, the second page is parsed the
            # same way, which in turn yields the request for the third page, and
            # so on until the last page is reached.
            yield Request(page, callback=self.parseMainPage, dont_filter=True)

    def parseDetails(self, response):
        item = response.meta['meta']
        id = response.xpath('//a/@data-sku').extract()[0]  # product id (SKU)
        item['project_id'] = id
        print(">>>>>>", id)
        shop_name = response.xpath('//div[@class="name"]/a/text()').extract()[0]  # shop name
        print(">>>>>>", shop_name)
        item['shop_name'] = shop_name
        item['name'] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].strip()  # product name
        """
        Price API: the price is not in the page HTML but behind a JSONP endpoint
        keyed by the SKU id. (A more robust way to parse this JSONP response is
        sketched after this listing.)
        """
        price_url = "https://p.3.cn/prices/mgets?callback=jQuery9274777&skuIds=" + str(id)
        price = requests.get(price_url).text
        money = re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
        item['price'] = money[0]
        print(money)
        """
        Comment-summary API: number of reviews for this product.
        """
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        print("comment summary url:", comment_num)
        yield scrapy.Request(comment_num, meta={'item': item}, callback=self.parse_getCommentnum)
        # The lines below fetch the same summary URL again with requests and urllib
        # just to print it; this is redundant debug output, since the Scrapy request
        # above already retrieves this JSON.
        comment_nums = requests.get(comment_num).text
        nums = re.findall(r'\"ShowCountStr\"\:\"(.*?)\"', comment_nums)
        print(">>>>>>>", nums)
        page = urllib.request.urlopen(comment_num)
        data = page.read()
        print(data)

    def parse_getCommentnum(self, response):
        item = response.meta['item']  # the partially filled item passed along in meta
        # response.text is JSON
        data = json.loads(response.text)
        print(data)
        item['comment_num'] = data['CommentsCount'][0]['CommentCountStr']  # total number of reviews
        item['AfterCount'] = data['CommentsCount'][0]['AfterCount']        # follow-up reviews
        item['GoodCountStr'] = data['CommentsCount'][0]['GoodCountStr']    # positive reviews
        item['PoorCount'] = data['CommentsCount'][0]['PoorCount']          # negative reviews
        # Hand the finished item to the pipelines.
        yield item
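The regex used for the price above is brittle: it depends on the exact shape of the JSONP wrapper. A minimal sketch of a sturdier approach, assuming the p.3.cn endpoint still returns a response of the form jQuery9274777([{"id":"J_...","p":"59.00",...}]) (the SKU id in the usage line is made up):

import json
import requests

def fetch_price(sku_id, callback="jQuery9274777"):
    """Sketch: strip the JSONP callback wrapper and parse the body as JSON."""
    url = "https://p.3.cn/prices/mgets?callback={}&skuIds={}".format(callback, sku_id)
    text = requests.get(url, timeout=10).text.strip()
    # keep only what is between the first "(" and the last ")"
    body = text[text.index("(") + 1 : text.rindex(")")]
    data = json.loads(body)      # a list with one dict per SKU
    return data[0].get("p")      # "p" holds the displayed price string

# usage (hypothetical SKU id):
# print(fetch_price("11346954"))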
2. Spider for JD review information
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import commentItem
import json
import xlrd
import scrapy
from scrapy import Request

# Spider for JD book review information
class JingdongCommentSpider(scrapy.Spider):
    name = 'comment'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """JD.com"""
        url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for url in urls:
            url = url.xpath('@href').extract()
            for link in url:
                url = response.urljoin(link)
                yield Request(url, callback=self.parseDetails)

    def parseDetails(self, response):
        id = response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0]  # product id (SKU)
        """
        Review APIs for a product.
        """
        # url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) + '&score=0&sortType=5&page=0&pageSize=10'
        # yield scrapy.Request(url, callback=self.parse_getCommentnum)
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
        com = requests.get(comment_num).text
        data = json.loads(com)
        comment_nums = data['CommentsCount'][0]['ShowCount']
        print(comment_nums)
        comment_total = int(comment_nums)
        # Work out the number of review pages: 10 reviews per page.
        if comment_total % 10 == 0:
            page = comment_total // 10
        else:
            page = comment_total // 10 + 1
        for k in range(page):
            '''
            Paged review endpoint (one request per page of 10 reviews).
            '''
            com_url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) + '&score=0&sortType=5&page=' + str(k) + '&pageSize=10'
            # print(">>>>>>>>>>", com_url)
            yield scrapy.Request(com_url, callback=self.parse_getCommentnum)
            # yield scrapy.Request(com_url, callback=self.parseDetails)

    def parse_getCommentnum(self, response):
        js = json.loads(response.text)
        # print(js)
        comments = js['comments']  # all reviews on this page
        items = []
        for comment in comments:
            item1 = commentItem()
            item1['user_name'] = comment['nickname']             # user name
            item1['user_id'] = comment['id']                     # user id
            item1['userProvince'] = comment['userProvince']      # province the reviewer is from
            item1['content'] = comment['content']                # review text
            item1['good_id'] = comment['referenceId']            # id of the reviewed product
            item1['good_name'] = comment['referenceName']        # name of the reviewed product
            item1['date'] = comment['referenceTime']             # review time
            item1['replyCount'] = comment['replyCount']          # number of replies
            item1['score'] = comment['score']                    # rating
            item1['status'] = comment['status']                  # status
            item1['userLevelId'] = comment['userLevelId']        # user level id
            # productColor/productSize can be absent for books; a defensive
            # variant using .get() is sketched after this listing.
            item1['productColor'] = comment['productColor']      # product colour
            item1['productSize'] = comment['productSize']        # product size
            item1['userLevelName'] = comment['userLevelName']    # membership level, e.g. silver or diamond member
            item1['isMobile'] = comment['isMobile']              # whether posted from a mobile device
            item1['userClientShow'] = comment['userClientShow']  # client the review was posted from, e.g. the JD app
            item1['days'] = comment['days']                      # number of days
            items.append(item1)
        return items
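For book reviews, fields such as productColor, productSize, or userClientShow are frequently missing from the comment dicts, and an access like comment['productColor'] then raises a KeyError that drops the whole page. A minimal defensive sketch of the extraction loop using dict.get() with defaults, assuming the same commentItem fields as above:

from jingdongspider.items import commentItem

def build_comment_item(comment):
    """Sketch: fill a commentItem with .get() so missing keys fall back to a default."""
    item1 = commentItem()
    item1['user_name'] = comment.get('nickname', '')
    item1['user_id'] = comment.get('id', '')
    item1['userProvince'] = comment.get('userProvince', '')
    item1['content'] = comment.get('content', '')
    item1['good_id'] = comment.get('referenceId', '')
    item1['good_name'] = comment.get('referenceName', '')
    item1['date'] = comment.get('referenceTime', '')
    item1['replyCount'] = comment.get('replyCount', 0)
    item1['score'] = comment.get('score', 0)
    item1['status'] = comment.get('status', 0)
    item1['userLevelId'] = comment.get('userLevelId', '')
    item1['productColor'] = comment.get('productColor', '')   # often absent for books
    item1['productSize'] = comment.get('productSize', '')     # often absent for books
    item1['userLevelName'] = comment.get('userLevelName', '')
    item1['isMobile'] = comment.get('isMobile', False)
    item1['userClientShow'] = comment.get('userClientShow', '')
    item1['days'] = comment.get('days', 0)
    return item1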
--------------------------------------------------------------------------------------------------------------------------------------------------
JingdongspiderItem
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# items.py is the data container of the crawler: it defines the fields an item can hold.
class JingdongspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()           # product link
    project_id = scrapy.Field()     # product id
    name = scrapy.Field()           # product name
    comment_num = scrapy.Field()    # total number of reviews
    shop_name = scrapy.Field()      # shop name
    price = scrapy.Field()          # price
    GoodCountStr = scrapy.Field()   # positive reviews
    AfterCount = scrapy.Field()     # follow-up reviews
    PoorCount = scrapy.Field()      # negative reviews


class commentItem(scrapy.Item):
    user_name = scrapy.Field()      # reviewer's name
    user_id = scrapy.Field()        # reviewer's id
    userProvince = scrapy.Field()   # province the reviewer is from
    content = scrapy.Field()        # review text
    good_id = scrapy.Field()        # id of the reviewed product
    good_name = scrapy.Field()      # name of the reviewed product
    date = scrapy.Field()           # review time
    replyCount = scrapy.Field()     # number of replies
    score = scrapy.Field()          # rating
    status = scrapy.Field()         # status
    userLevelId = scrapy.Field()    # user level id
    productColor = scrapy.Field()   # product colour
    productSize = scrapy.Field()    # product size
    userLevelName = scrapy.Field()  # membership level, e.g. silver or diamond member
    userClientShow = scrapy.Field() # client the review was posted from, e.g. the JD client
    isMobile = scrapy.Field()       # whether posted from a mobile device
    days = scrapy.Field()           # number of days
    # commentTags = scrapy.Field()  # tags
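Scrapy Items behave like dicts with a fixed set of allowed keys, which is exactly how the spiders above fill them and how the pipelines read them. A quick sketch (the values are made up for illustration):

from jingdongspider.items import JingdongspiderItem

item = JingdongspiderItem()
item['name'] = 'Example book title'   # hypothetical values
item['price'] = '59.00'
item['comment_num'] = '2000+'

print(item['price'])   # field access works like a dict
print(dict(item))      # pipelines and exporters can treat it as a plain dict
# item['foo'] = 1      # would raise KeyError: 'foo' is not a declared Field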
MySQLPipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
pymysql.install_as_MySQLdb()  # let pymysql act as a drop-in replacement for MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()

# Pipeline file: persists the items returned by the spiders.
class MySQLPipeline(object):
    # from_settings is a factory classmethod (a form of dependency injection):
    # Scrapy calls it with the project settings so the pipeline can read the
    # database configuration from settings.py.
    @classmethod
    def from_settings(cls, settings):
        '''1. @classmethod declares a class method (as opposed to the usual instance method).
        2. Its first parameter is cls (the class itself), whereas an instance method's first parameter is self, an instance of the class.
        3. It can be called on the class itself, like C.f(), similar to a static method in Java.'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read the configuration from settings
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments, i.e. host=xxx, db=yyy, ...
        return cls(dbpool)  # pass the connection pool to the constructor so it is available on self

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # called by Scrapy for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert
        query.addErrback(self._handle_error, item, spider)  # handle any database error
        return item

    # write the item to the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jingdong(project_id,name,comment_num,shop_name,link,GoodCountStr,AfterCount,PoorCount,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (
            item["project_id"], item["name"], item["comment_num"], item["shop_name"], item["link"], item["GoodCountStr"],
            item["AfterCount"], item["PoorCount"], item["price"])
        tx.execute(sql, params)

    # error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)


# -----------------------------------------------------------------------------------------------------------
class CommentPipeline(object):
    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read the configuration from settings
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # called by Scrapy for every item; note that items from both spiders pass
    # through every enabled pipeline (see the routing sketch after this listing)
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert
        query.addErrback(self._handle_error, item, spider)  # handle any database error
        return item

    # write the item to the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jd_comment(user_name,user_id,userProvince,content,good_id,good_name,date,replyCount,score,status,userLevelId,productColor,productSize,userLevelName,userClientShow,isMobile,days) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (item["user_name"], item["user_id"], item["userProvince"], item["content"], item["good_id"],
                  item["good_name"], item["date"], item["replyCount"], item["score"], item["status"],
                  item["userLevelId"], item["productColor"], item["productSize"], item["userLevelName"],
                  item["userClientShow"],
                  item["isMobile"], item["days"])
        tx.execute(sql, params)

    # error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)
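Because ITEM_PIPELINES (in settings below) enables both pipelines project-wide, every item from either spider runs through both process_item methods, and the mismatched pipeline fails with a KeyError that only the errback catches. A minimal sketch of guarding each pipeline by item type, assuming the two item classes defined above (only process_item is shown; the rest of each class stays as in the listing):

from jingdongspider.items import JingdongspiderItem, commentItem

class MySQLPipeline(object):
    # ... same from_settings / __init__ as above ...
    def process_item(self, item, spider):
        if not isinstance(item, JingdongspiderItem):
            return item  # not ours: pass it through untouched
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

class CommentPipeline(object):
    # ... same from_settings / __init__ as above ...
    def process_item(self, item, spider):
        if not isinstance(item, commentItem):
            return item  # only handle review items
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item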
settings
# -*- coding: utf-8 -*-
# Scrapy settings for jingdongspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jingdongspider'
SPIDER_MODULES = ['jingdongspider.spiders']
NEWSPIDER_MODULE = 'jingdongspider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jingdongspider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jingdongspider.middlewares.JingdongspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'jingdongspider.middlewares.JingdongspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'jingdongspider.pipelines.CommentPipeline': 300,
'jingdongspider.pipelines.MySQLPipeline': 350,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# LOG_LEVEL = 'INFO'
# ======================================================================
# MySQL database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'jingdong'      # database name, change to your own
MYSQL_USER = 'root'            # database user, change to your own
MYSQL_PASSWD = 'shujuelin321'  # database password, change to your own
MYSQL_PORT = 3306              # database port, used in dbhelper
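The INSERT statements in the pipelines assume that a jingdong table and a jd_comment table already exist in the database configured above; the post does not give their schema. A minimal sketch that creates both tables, mirroring the MYSQL_* settings, with every column as TEXT (the column types are an assumption, tighten them as needed):

import pymysql

# connection parameters mirror the MYSQL_* settings above
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='shujuelin321', database='jingdong', charset='utf8')

JINGDONG_COLS = ['project_id', 'name', 'comment_num', 'shop_name', 'link',
                 'GoodCountStr', 'AfterCount', 'PoorCount', 'price']
JD_COMMENT_COLS = ['user_name', 'user_id', 'userProvince', 'content', 'good_id',
                   'good_name', 'date', 'replyCount', 'score', 'status',
                   'userLevelId', 'productColor', 'productSize', 'userLevelName',
                   'userClientShow', 'isMobile', 'days']

with conn.cursor() as cur:
    # every column is created as TEXT for simplicity (assumed types, adjust as needed)
    cur.execute("CREATE TABLE IF NOT EXISTS jingdong ({})".format(
        ", ".join("`{}` TEXT".format(c) for c in JINGDONG_COLS)))
    cur.execute("CREATE TABLE IF NOT EXISTS jd_comment ({})".format(
        ", ".join("`{}` TEXT".format(c) for c in JD_COMMENT_COLS)))
conn.commit()
conn.close()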