The key point is that the comment count on the page is loaded dynamically: the page takes the unique id on a tag, splices it into a URL, fetches a JSON file, and displays the comment count from that response.
After sniffing the traffic for a while, I found an odd-looking file.
Searching the page source shows that this number is the product id, so all we need to do is extract it, build the URL, request it, and pull the comment count out of the JSON.
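As a rough illustration of that id-splicing step (the concrete value below is made up; only the "J_comment_&lt;sku&gt;" shape is implied by the split used in the script further down), the idea is:

# hypothetical example value; the <a> tag under p-commit carries an id of this shape
tag_id = 'J_comment_12345678'
sku_id = tag_id.split('_')[2]        # -> '12345678', the product id
summary_url = ('https://sclub.jd.com/comment/productCommentSummaries.action'
               '?referenceIds={}'.format(sku_id))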
After so many days of writing Scrapy, time for a change of pace~~ (grin)
import csv
import json
import re
import time
import requests
from lxml import etree
from requests.exceptions import RequestException
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
fp = open('D:/京东图书.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('书名', '价格', '评论数', '经营店', '作者', '出版社', '发行时间'))
def get_html(url):
    # Fetch one search result page; return the decoded HTML, or None on failure.
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode()
        else:
            return None
    except RequestException:
        return None
def get_info(html):
    # Parse one search result page and write each book's fields to the CSV.
    selector = etree.HTML(html)
    goods = selector.xpath("//div[@class='gl-i-wrap']")
    for good in goods:
        try:
            price = good.xpath("div[@class='p-price']/strong/i/text()")[0] + '元'
            name = good.xpath("div[@class='p-name']/a/em/text()")[0]
            # the comment <a> tag carries an id like "J_comment_<sku>"; the trailing part is the product id
            comment_link = good.xpath("div[@class='p-commit']/strong/a/@id")[0]
            jquery_id = comment_link.split('_')[2]
            comment_number = get_comment_number(jquery_id)
            manufacturer = good.xpath("div[@class='p-shopnum']/*[@class='curr-shop']/text()")[0]
            author = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-name']/a/@title")[0]
            maker = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-store']/a/@title")[0]
            release_time = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-date']/text()")[0]
            now = (name, price, comment_number, manufacturer, author, maker, release_time)
            writer.writerow(now)
            print(now)
        except Exception:
            # some items (ads, bundles, etc.) are missing one of the fields above; skip them
            pass
def get_comment_number(jquery_id):
    # The summary endpoint returns JSONP: jQueryxxxx({...}); strip the callback wrapper before parsing.
    url = 'https://sclub.jd.com/comment/productCommentSummaries.action?referenceIds={}&callback=jQuery4085889&_=1550374407814'.format(
        jquery_id)
    data = requests.get(url, headers=headers).content.decode('gbk')
    real_data = re.findall(r'\((.*?)\)', data, re.S)[0]
    json_data = json.loads(real_data)
    comment_info = json_data['CommentsCount'][0]['CommentCountStr']
    return comment_info
if __name__ == '__main__':
    urls = ['https://search.jd.com/Search?keyword=machine%20learning&page={}'.format(i) for i in range(1, 34, 2)]
    for url in urls:
        html = get_html(url)
        if html:  # skip pages that failed to download
            get_info(html)
        time.sleep(2)
    fp.close()
It's best to keep the crawl rate under control; I managed one full pass, but when I tried a second run it stopped working, presumably because my IP got banned.
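If you want to be gentler with the requests, one option is to randomize the delay and retry a failed page a few times before giving up. A minimal sketch (the delay range, retry count, and the helper name polite_get are my own choices, not part of the script above):

import random
import time
import requests

def polite_get(url, headers, retries=3):
    # Try the request a few times, backing off with a randomized pause
    # so the crawl rate stays low; returns None if every attempt fails.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            pass
        time.sleep(random.uniform(3, 8))
    return None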