Python 爬取京东(JD)商品评论数据
1.导入包
# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request as req
import json
import sys
import time
import random
# Print the interpreter's default text encoding — a sanity check before
# decoding/encoding Chinese (GBK) comment data below.
print(sys.getdefaultencoding())
2.声明商品类
class JDCommentsCrawler:
    """Crawl product comments from JD.com's productPageComments JSONP endpoint
    and export them to a GBK-encoded CSV file."""

    # CSV column headers (Chinese labels kept for the exported report).
    COLUMNS = ['nickname(用户昵称)', 'create_time(评论时间)', 'content(评论内容)', 'score(评分)',
               'hCreate_time(追评时间)', 'hContent(追评)',
               'reference_time(购买时间)',
               'productColor(购买产品颜色)', 'productSize(购买产品尺寸)', 'userImageUrl(用户头像)']

    def __init__(self, score=0, startPage=1, toPage=2, productId=None, callback=None,
                 sortType=5, pageSize=10, maxPage=100):
        """Store the request parameters.

        score:     comment filter (good:3, neutral:2, bad:1, follow-up:5, all:0)
        startPage: first page to fetch (1-based)
        toPage:    last page to fetch (inclusive, 1-based)
        productId: JD product id (int)
        callback:  JSONP callback name — differs per product page
        sortType:  sort order (recommended:5, by time:6)
        pageSize:  comments per page (JD default: 10)
        maxPage:   upper bound on pages; refreshed from each server response
        """
        self.productId = productId
        self.score = int(score)
        self.startPage = int(startPage)
        self.toPage = int(toPage)
        self.sortType = sortType
        self.pageSize = pageSize
        self.callback = callback
        self.maxPage = maxPage
        # Static part of the endpoint URL (no query string).
        self.locationLink = 'https://sclub.jd.com/comment/productPageComments.action'
        # Query-string parameters sent with every request.
        self.paramValue = {
            'callback': self.callback,
            'productId': self.productId,
            'score': self.score,
            'sortType': self.sortType,
            'pageSize': self.pageSize,
        }
        self.locationUrl = None

    def paramDictStr(self, params):
        """Serialize *params* as 'key=value&key=value&...' — note the trailing '&',
        which concatLinkParam relies on."""
        return ''.join('%s=%s&' % (key, value) for key, value in params.items())

    def concatLinkParam(self):
        """Build the full request URL, ending in 'page=0'.
        requestMethodPage later patches the trailing digit with the real page number."""
        self.locationUrl = (self.locationLink + '?' + self.paramDictStr(self.paramValue)
                            + 'isShadowSku=0&fold=1&page=0')
        print(self.locationUrl)

    def requestMethodPage(self, p):
        """Return a urllib Request for comment page *p* with browser-like headers
        (Referer/Host are required, otherwise JD rejects the request)."""
        headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Referer': 'https://item.jd.com/%d.html' % (self.productId),
            'Host': 'sclub.jd.com'
        }
        # locationUrl ends in 'page=0'; replace the trailing '0' with page p.
        url = self.locationUrl[:-1] + str(p)
        print('url : ', url)
        return req.Request(url, headers=headers)

    def showListPage(self, p):
        """Fetch comment page *p* and return the parsed JSON payload as a dict."""
        conn = req.urlopen(self.requestMethodPage(p))
        # JD serves this endpoint as GBK, not UTF-8.
        return_str = conn.read().decode('gbk')
        # The response is JSONP: '<callback>({...});' — strip the wrapper
        # (callback name + '(' at the front, ');' at the end) before parsing.
        return json.loads(return_str[len(self.callback) + 1:-2])

    def save_csv(self, df, p, path):
        """Write *df* to '<path><p>.csv', GBK-encoded so Chinese text opens cleanly in Excel."""
        df.to_csv(path_or_buf=path + '%s.csv' % p, encoding='gbk')

    def _comment_row(self, com):
        """Flatten one comment dict into a CSV row.
        Follow-up-review fields default to 'null' when absent."""
        after = com.get('afterUserComment')
        return [com['nickname'], com['creationTime'],
                str(com['content']).replace('\n', ' '), com['score'],
                after['created'] if after else 'null',
                after['content'] if after else 'null',
                com['referenceTime'],
                com['productColor'], com['productSize'], com['userImageUrl']]

    def crawler(self, path):
        """Crawl pages startPage..min(toPage, maxPage), pausing randomly between
        requests to avoid being blocked, and save all rows as one CSV under *path*."""
        dfs = []
        # Fallback product name for the output filename if no comment carries one.
        product_name = str(self.productId)
        first = max(self.startPage - 1, 0)       # the API's page numbers are 0-based
        last = min(self.toPage, self.maxPage)
        pages_done = 0
        for p in range(first, last):
            json_info = self.showListPage(p)
            # Server-reported total page count (informational; the range above is fixed).
            self.maxPage = json_info['maxPage']
            comments = json_info.get('comments', [])
            if comments:
                product_name = comments[0]['referenceName'].split(' ')[0]
            # Build the page DataFrame once, after collecting all rows
            # (the original rebuilt it per comment and crashed on empty pages).
            rows = [self._comment_row(com) for com in comments]
            dfs.append(pd.DataFrame(rows, columns=self.COLUMNS))
            pages_done += 1
            print("第{0}页已完成".format(p + 1))
            # Random delay between requests to avoid being rate-limited.
            time.sleep(random.randint(5, 7))
        if not dfs:
            # Empty page range (e.g. startPage > maxPage): nothing to save.
            return
        final_df = pd.concat(dfs, ignore_index=True)
        self.save_csv(final_df,
                      "JD_{2}_score{0}_num{1}".format(self.score, self.toPage * self.pageSize,
                                                      product_name),
                      path)
        print("已爬取{}页".format(pages_done))
# --------------------------------------------------------------------------------------------
3.设置爬取调用方法。关键变量可在京东商品页面对应的评论请求 URL 中查到,从而实现不同商品的爬取:打开商品评论 → 按 F12 → Network → 找到对应请求并查看其返回的 .json 文件。
例如,通过查看京东荣耀官方旗舰店下荣耀30的评价 json 请求,得到其评论请求 URL:
https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100012597526&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
def jdComment():
    """Configure and launch one JD comment crawl (edit the constants below per product)."""
    # Page range to fetch (inclusive, 1-based).
    start_page = 1
    end_page = 1
    # Other product ids for the same listing:
    #   100012545832 太空银 / 100012597526 幻夜黑 / 100012545854 绿野仙踪 /
    #   100006875109 霓影紫 / 100012545856 流光幻境
    product_id = 100006875109
    jsonp_callback = 'fetchJSON_comment98'
    # Comment filter: follow-up:5, all:0, good/neutral/bad: 3/2/1.
    score_filter = 0
    # Output directory for the CSV (e.g. 'D:\\PASS\\JD_HONOR30\\').
    out_dir = 'D:\\'
    crawler_obj = JDCommentsCrawler(score_filter, start_page, end_page,
                                    product_id, jsonp_callback)
    crawler_obj.concatLinkParam()
    print("可爬取最大页数:{0},从第{1}页爬取到第{2}页".format(crawler_obj.maxPage,
                                                             start_page, end_page))
    # Crawl from start_page; pages beyond maxPage are clipped inside crawler().
    crawler_obj.crawler(out_dir)
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == '__main__':
    jdComment()
4.结果如图
参考链接:https://blog.csdn.net/uvyoaa/article/details/80575503