Python—Scrapy爬取京东商城
1.创建项目
scrapy startproject jd
效果:
2.生成一个爬虫
scrapy genspider jd_category jd.com
效果:
3.在items.py文件中定义要提取的字段
import scrapy
class JdItem(scrapy.Item):
    """Product information scraped from a JD.com listing page."""
    title = scrapy.Field()   # product title
    price = scrapy.Field()   # price
    sku_id = scrapy.Field()  # product (SKU) id
    url = scrapy.Field()     # product detail-page link
    info = scrapy.Field()    # comments: list of CommentItem, filled in comment_parse
class CommentItem(scrapy.Item):
    """One user comment on a product.

    Captures: comment time, score, reply count, vote count,
    image count and the comment text itself.
    """
    content = scrapy.Field()       # comment text
    comment_time = scrapy.Field()  # when the comment was posted
    reply_count = scrapy.Field()   # number of replies
    score = scrapy.Field()         # star rating
    vote_count = scrapy.Field()    # number of "useful" votes
    image_count = scrapy.Field()   # number of attached images
4.jd_category.py中的内容:对商品列表页进行了爬取,并跟进详情页对评论信息进行了爬取
import html
import json
import re
import scrapy
from ..items import JdItem, CommentItem
class JdSpider(scrapy.Spider):
    """Crawl JD.com category listing pages, follow each product to its
    detail page, and fetch the product's comments from the comment API.
    """
    # Must match the name used on the command line in step 6
    # (`scrapy crawl jd_category`); the original 'jd_goods' would make
    # that command fail with "Spider not found".
    name = 'jd_category'
    # NOTE: restricting to 'www.jd.com' would block list.jd.com /
    # club.jd.com; the bare registered domain covers all subdomains.
    allowed_domains = ['jd.com']

    # Listing URL: https://list.jd.com/list.html?cat=9987,653,655
    # JD listing pages use odd page numbers (page advances by 2) and an
    # `s` result-offset that advances by 60 per listing page.
    page = 1
    s = 1
    url = 'https://list.jd.com/list.html?cat=9987%2C653%2C655&page=1&s=1&click=0'
    next_url = 'https://list.jd.com/list.html?cat=9987%2C653%2C655&page={}&s={}&click=0'

    # JSONP comment endpoint; `page=0` is the first comment page — bump the
    # page parameter (or loop) to crawl more comment pages per product.
    comment_url = ('https://club.jd.com/comment/productPageComments.action'
                   '?callback=fetchJSON_comment98&productId={}'
                   '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1')

    def start_requests(self):
        """Seed the crawl with the first listing page."""
        yield scrapy.Request(self.url)

    def parse(self, response):
        """Parse the first 30 products of a listing page (these are rendered
        server-side in the HTML) and schedule the next listing page.

        :param response: listing-page response
        :return: yields detail-page Requests and the next listing Request
        """
        for li in response.xpath('//*[@id="J_goodsList"]/ul/li'):
            item = JdItem()
            item['title'] = li.xpath('div/div/a/em/text()').extract_first("")
            item['price'] = li.xpath('div/div/strong/i/text()').extract_first("")
            item['sku_id'] = li.xpath('./@data-sku').extract_first("")
            # Initialize unconditionally so every item has the key even
            # before comment_parse fills it (the original only set it on
            # the protocol-relative branch below).
            item['info'] = None
            # Detail-page link to follow; listing pages emit
            # protocol-relative URLs ("//item.jd.com/...").
            detail_url = li.xpath('./div/div[@class="p-img"]/a/@href').extract_first("")
            if not detail_url.startswith("https:"):
                detail_url = "https:" + detail_url
            item['url'] = detail_url
            yield scrapy.Request(detail_url, callback=self.info_parse,
                                 meta={"item": item})
        if self.page <= 10:
            self.page += 2
            self.s += 60
            yield scrapy.Request(url=self.next_url.format(self.page, self.s),
                                 callback=self.parse)

    def info_parse(self, response):
        """On a product detail page, request the product's comment feed.

        :param response: detail-page response carrying the item in meta
        :return: yields the comment-API Request
        """
        item = response.meta['item']
        yield scrapy.Request(self.comment_url.format(item.get('sku_id')),
                             callback=self.comment_parse, meta={"item": item})

    def comment_parse(self, response):
        """Extract comments and yield the finished product item.

        The endpoint returns JSONP — `fetchJSON_comment98({...});` — so we
        strip the callback wrapper and parse the body with `json.loads`
        instead of scraping structured JSON with a fragile regex.

        :param response: comment-API response carrying the item in meta
        :return: yields the JdItem with `info` set to a list of CommentItem
        """
        match = re.search(r'fetchJSON_comment98\((.*)\)\s*;?\s*$',
                          response.text, re.S)
        info = []
        if match:
            try:
                data = json.loads(match.group(1))
            except json.JSONDecodeError:
                data = {}
            for comment in data.get('comments', []):
                comment_item = CommentItem()
                comment_item['content'] = comment.get('content', '')
                comment_item['comment_time'] = comment.get('creationTime', '')
                comment_item['reply_count'] = comment.get('replyCount', 0)
                comment_item['score'] = comment.get('score', 0)
                comment_item['vote_count'] = comment.get('usefulVoteCount', 0)
                comment_item['image_count'] = comment.get('imageCount', 0)
                info.append(comment_item)
        item = response.meta['item']
        item['info'] = info
        yield item
5.只在pipelines.py中进行了简单的打印
6.执行: python -m scrapy crawl jd_category
效果: