目标:抓取京东商品详情页的商品标题、商店名、商店链接、价格、好评率和评论数。
随意进入京东的一个商品详情页面,查看网页源码,可以看到商品标题、商店名、商店链接都在源码里,可以直接提取;但源码里没有商品的价格,说明价格是通过额外的请求动态加载的,需要进行抓包分析。
商品的链接:
https://item.jd.com/100003395443.html
抓包之后得到下面这两个链接:
https://p.3.cn/prices/mgets?callback=jQuery8092423&type=1&area=18_1522_29460_31350&pdtk=&pduid=1094136628&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0&skuIds=J_100003395443%2C&ext=11100000&source=item-pc
https://club.jd.com/comment/productCommentSummaries.action?referenceIds=100003395443&callback=jQuery2313070&_=1564707435635
第一个链接返回商品价格。分析这个URL可以知道,100003395443就是商品的ID,更换这个ID就可以得到其他商品的价格URL。第二个链接返回好评率和评论数。代码如下:
items.py:
import scrapy
class JingdongItem(scrapy.Item):
    """Container for the data scraped from one JD product-detail page."""

    # Taken directly from the page's HTML.
    title = scrapy.Field()            # product title
    shop = scrapy.Field()             # shop name
    shoplink = scrapy.Field()         # link to the shop

    # Taken from the separate price / comment-summary endpoints.
    price = scrapy.Field()            # product price
    GoodRateShow = scrapy.Field()     # positive-review rate
    CommentCountStr = scrapy.Field()  # number of comments
编写spider程序:
写一个开始的函数
def start_requests(self):
    """Yield the first request of the crawl: the JD home page, sent with a Cookie header."""
    # The cookie string is a placeholder; substitute your own cookie value.
    cookie_header = {"Cookie": "你的cookie值"}
    yield Request(url="http://www.jd.com/", headers=cookie_header)
# Crawl rules: `allow` is left empty so that every extracted link is followed
# and passed to parse_item; product-detail pages are picked out there by URL.
rules = (
    Rule(LinkExtractor(allow=r''), callback='parse_item', follow=True),
)
每个商品详情页面的URL形如:https://item.jd.com/100003395443.html
中间的数字就是商品的ID。因此我们让爬虫自动进入每个页面,再用正则表达式判断该页面的URL是否符合详情页面的URL规则,只处理符合规则的页面。
编写页面规则函数:
def parse_item(self, response):
    """Build a JingdongItem from a JD product-detail page.

    Pages whose URL is not of the form ``item.jd.com/<digits>.html`` are
    ignored.  Title, shop name and shop link are read from the page itself;
    price, positive-review rate and comment count come from two additional
    JSONP endpoints keyed by the product ID.

    Returns the populated item, or None when the page is not a product
    page, when the extra data could not be loaded, or when the price
    endpoint reports "-1.00".
    """
    # Dots are escaped so they only match a literal "."; the product ID in
    # the URL is numeric.
    match = re.search(r"item\.jd\.com/(\d+)\.html", response.url)
    if match is None:
        return None
    sku_id = match.group(1)

    price_url = (
        "https://p.3.cn/prices/mgets?callback=jQuery8092423&type=1"
        "&area=18_1522_29460_31350&pdtk=&pduid=1094136628"
        "&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0"
        "&skuIds=J_" + sku_id + "%2C&ext=11100000&source=item-pc"
    )
    summary_url = (
        "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
        + sku_id
        + "&callback=jQuery2313070&_=1564707435635"
    )

    # NOTE(review): urlopen is a blocking call inside a Scrapy callback; the
    # timeout bounds how long one slow endpoint can stall the crawl.
    try:
        with request.urlopen(price_url, timeout=10) as reply:
            price_text = reply.read().decode("utf-8", "ignore")
        with request.urlopen(summary_url, timeout=10) as reply:
            summary_text = reply.read().decode("utf-8", "ignore")
        # [0] raises IndexError when the expected field is absent.
        price = re.findall(r'"p":"(.*?)"', price_text)[0]
        good_rate = re.findall(r'GoodRateShow":(.*?),', summary_text)[0]
        comment_count = re.findall(r'CommentCountStr":"(.*?)"', summary_text)[0]
    except Exception as exc:
        # Best effort: one broken product must not stop the crawl, but the
        # failure is logged instead of being silently discarded.
        self.logger.warning(
            "Skipping %s: price/comment data unavailable (%s)", response.url, exc
        )
        return None

    # The price endpoint answers "-1.00" for some products; skip those.
    if price == "-1.00":
        return None

    item = JingdongItem()
    item["title"] = response.xpath("//html/head/title/text()").extract()
    item["shop"] = response.xpath(
        "//div[@class='name']/a[@target='_blank']/text()"
    ).extract()
    item["shoplink"] = response.xpath("//div[@class='name']/a/@href").extract()
    item["price"] = price
    item["GoodRateShow"] = good_rate
    item["CommentCountStr"] = comment_count
    return item
取到值后把item返回到pipelines里面去处理,整个spider的代码为:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
from scrapy.http import Request
from urllib import request
import re
class JdSpider(CrawlSpider):
    """Crawl jd.com and produce one JingdongItem per product-detail page.

    The crawl starts at the JD home page, follows every link, and lets
    parse_item decide by URL which responses are product pages.
    """

    name = 'jd'
    allowed_domains = ['jd.com']

    # start_urls is intentionally not set: start_requests() issues the first
    # request itself so that a Cookie header can be attached.

    rules = (
        # `allow` is empty so every extracted link is followed and handed to
        # parse_item, which filters for product-detail URLs.
        Rule(LinkExtractor(allow=r''), callback='parse_item', follow=True),
    )

    def start_requests(self):
        """Yield the initial request for the JD home page with a Cookie header."""
        # The cookie string is a placeholder; substitute your own cookie value.
        headers = {"Cookie": "你的cookie值"}
        url = "https://www.jd.com"
        yield Request(url=url, headers=headers)

    def parse_item(self, response):
        """Build a JingdongItem from a JD product-detail page.

        Pages whose URL is not of the form ``item.jd.com/<digits>.html`` are
        ignored.  Title, shop name and shop link are read from the page
        itself; price, positive-review rate and comment count come from two
        additional JSONP endpoints keyed by the product ID.

        Returns the populated item, or None when the page is not a product
        page, when the extra data could not be loaded, or when the price
        endpoint reports "-1.00".
        """
        # Dots are escaped so they only match a literal "."; the product ID
        # in the URL is numeric.
        match = re.search(r"item\.jd\.com/(\d+)\.html", response.url)
        if match is None:
            return None
        sku_id = match.group(1)

        price_url = (
            "https://p.3.cn/prices/mgets?callback=jQuery8092423&type=1"
            "&area=18_1522_29460_31350&pdtk=&pduid=1094136628"
            "&pdpin=jd_66b27ab550846&pin=jd_66b27ab550846&pdbp=0"
            "&skuIds=J_" + sku_id + "%2C&ext=11100000&source=item-pc"
        )
        summary_url = (
            "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
            + sku_id
            + "&callback=jQuery2313070&_=1564707435635"
        )

        # NOTE(review): urlopen is a blocking call inside a Scrapy callback;
        # the timeout bounds how long one slow endpoint can stall the crawl.
        try:
            with request.urlopen(price_url, timeout=10) as reply:
                price_text = reply.read().decode("utf-8", "ignore")
            with request.urlopen(summary_url, timeout=10) as reply:
                summary_text = reply.read().decode("utf-8", "ignore")
            # [0] raises IndexError when the expected field is absent.
            price = re.findall(r'"p":"(.*?)"', price_text)[0]
            good_rate = re.findall(r'GoodRateShow":(.*?),', summary_text)[0]
            comment_count = re.findall(
                r'CommentCountStr":"(.*?)"', summary_text
            )[0]
        except Exception as exc:
            # Best effort: one broken product must not stop the crawl, but
            # the failure is logged instead of being silently discarded.
            self.logger.warning(
                "Skipping %s: price/comment data unavailable (%s)",
                response.url,
                exc,
            )
            return None

        # The price endpoint answers "-1.00" for some products; skip those.
        if price == "-1.00":
            return None

        item = JingdongItem()
        item["title"] = response.xpath("//html/head/title/text()").extract()
        item["shop"] = response.xpath(
            "//div[@class='name']/a[@target='_blank']/text()"
        ).extract()
        item["shoplink"] = response.xpath(
            "//div[@class='name']/a/@href"
        ).extract()
        item["price"] = price
        item["GoodRateShow"] = good_rate
        item["CommentCountStr"] = comment_count
        return item
pipelines.py中直接写入数据库就行了:
import pymysql
class JingdongPipeline(object):
    """Scrapy item pipeline that inserts each scraped product into MySQL.

    One connection is opened when the spider starts and reused for every
    item, instead of reconnecting per item.  Rows are written with a
    parameterized INSERT so that quotes or other special characters in the
    scraped text cannot break or alter the SQL statement.
    """

    # Shared MySQL connection; set by open_spider(), cleared by close_spider().
    db = None

    def open_spider(self, spider):
        """Open the MySQL connection (the credential strings are placeholders)."""
        self.db = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user="数据库名",   # placeholder: replace with the MySQL user name
            passwd="密码",     # placeholder: replace with the password
            db="表名",         # placeholder: replace with the database name
            # The stored text is Chinese, so request a Unicode charset
            # explicitly rather than relying on the driver default.
            charset="utf8mb4",
        )

    def close_spider(self, spider):
        """Close the MySQL connection when the spider finishes."""
        if self.db is not None:
            self.db.close()
            self.db = None

    @staticmethod
    def _first(values):
        """Return the first element of an extracted list, or "" if it is empty."""
        return values[0] if values else ""

    def process_item(self, item, spider):
        """Print the item's fields, insert them into table `shangping`, return the item.

        Raises whatever the database driver raises if the insert fails; the
        transaction is rolled back first.
        """
        # Fallback so the method also works if open_spider() was never called.
        if self.db is None:
            self.open_spider(spider)

        shoplink = self._first(item["shoplink"])
        # Protocol-relative links ("//host/path") need a scheme; links that
        # already carry one are left untouched.
        if shoplink.startswith("//"):
            shoplink = "http:" + shoplink

        row = (
            self._first(item["title"]),
            self._first(item["shop"]),
            shoplink,
            item["price"],
            item["GoodRateShow"],
            item["CommentCountStr"],
        )
        for value in row:
            print(value)
        print("-"*20)

        # Values are passed separately from the SQL text so the driver
        # escapes them.
        sql = (
            "insert into shangping"
            "(title,shop,shoplink,price,GoodRateShow,CommentCountStr) "
            "values(%s,%s,%s,%s,%s,%s)"
        )
        cursor = self.db.cursor()
        try:
            cursor.execute(sql, row)
            self.db.commit()
        except Exception:
            # Leave the shared connection in a clean state before re-raising.
            self.db.rollback()
            raise
        finally:
            cursor.close()
        return item
注意:记得在settings.py里完成相应的配置项,比如把ROBOTSTXT_OBEY改成False,并在ITEM_PIPELINES中启用JingdongPipeline,否则item不会被送到pipelines里处理。