京东商品评论是动态加载的,通过get请求获取;但默认返回的是带回调函数包裹的JSONP文本而不是纯json,所以要么去掉url中的callback参数让接口直接返回json,要么按text取回后用切片提取出其中的json
1、更改url参数返回json
url获取方法:打开京东商品页面,打开谷歌开发者工具的Network面板后刷新页面,搜索comments,找到返回评论数据的那个请求。复制该请求的url,去掉其中的callback参数,即可直接返回json
(1)获取
import requests
def get_comments(url, timeout=10):
    """Fetch one page of product comments and decode the response as JSON.

    The URL is expected to have its ``callback`` parameter removed so that
    the endpoint answers with plain JSON instead of a JSONP wrapper.

    Args:
        url: Comment-page URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded JSON payload, or ``""`` when the request or the decoding
        failed (the error is printed rather than raised).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.json()
    except Exception as e:
        # Best-effort fetch: report the problem and hand back an empty value
        # so the caller can decide whether to skip this page.
        print("Error", e)
        return ""
(2)解析内容
import pandas as pd
def parse_comments(data):
    """Convert a decoded comment payload into a tabular DataFrame.

    Args:
        data: Decoded JSON payload holding a ``'comments'`` list. A falsy
            value (for example the ``""`` produced by a failed fetch) is
            treated as "no comments".

    Returns:
        A ``pandas.DataFrame`` with one row per comment and the columns
        用户名, 评价内容, 星级评分, 评价日期 and 评价视频url. The frame is
        empty (but still has these columns) when ``data`` is falsy.

    Raises:
        KeyError: If a non-empty payload lacks ``'comments'`` or a comment
            lacks one of the required fields.
    """
    columns = ['用户名', '评价内容', '星级评分', '评价日期', '评价视频url']
    if not data:
        return pd.DataFrame(columns=columns)

    rows = []
    for comment in data['comments']:
        # 'videos' may be absent or an empty list; both mean "no video".
        videos = comment.get('videos') or []
        rows.append({
            '用户名': comment['nickname'],
            '评价内容': comment['content'],
            '星级评分': comment['score'],
            # Keep only the date part of "YYYY-MM-DD HH:MM:SS".
            '评价日期': comment['creationTime'].split(" ")[0],
            '评价视频url': videos[0]['remark'] if videos else '',
        })
    return pd.DataFrame(rows, columns=columns)
(3)保存
import os
def save_comments(data_df, path='评论信息.csv', encoding='utf-8'):
    """Append a DataFrame of comments to a CSV file.

    The header row is written only when the target file does not exist yet
    or is empty, so repeated calls build up one table.

    Args:
        data_df: DataFrame to write; the index is not written.
        path: Destination CSV file. Defaults to ``评论信息.csv`` in the
            current working directory.
        encoding: Text encoding of the file. Excel reads Chinese text
            correctly from gb2312, gbk, gb18030 or utf_8_sig files, while
            plain utf-8 may show garbled characters; pass ``'utf_8_sig'``
            if the file is meant to be opened in Excel.
    """
    # Append mode also creates the file when it is missing, so one call
    # covers both the "new file" and the "existing file" case.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    data_df.to_csv(path, mode='a', header=needs_header, index=False,
                   encoding=encoding)
    print("保存成功!")
(4)主函数
if __name__ == '__main__':
    # Ask for an inclusive page range, then fetch, parse and save each page.
    begin = int(input('请输入起始页:'))
    end = int(input('请输入结束页:'))
    # Only the page number varies between requests, so build the template once.
    url_template = (
        'https://club.jd.com/comment/productPageComments.action'
        '?productId=100004770263&score=0&sortType=5&page={}'
        '&pageSize=10&isShadowSku=0&fold=1'
    )
    for i in range(begin, end + 1):
        data = get_comments(url_template.format(i))
        if not data:
            # get_comments returns an empty value on failure; skip this page
            # instead of letting the parser crash and abort the whole run.
            print('第{}页获取失败,已跳过'.format(i))
            continue
        result = parse_comments(data)
        save_comments(result)
2.不更改url,返回text,然后对返回数据切片,得到json
只有获取数据的函数get_comments需要修改,其余代码与方法1相同
def _strip_jsonp(text):
    """Return the JSON document wrapped inside a JSONP response body."""
    text = text.strip()
    # Already plain JSON (no callback wrapper): nothing to strip.
    if text.startswith(("{", "[")):
        return text
    # The payload sits between the first "(" and the last ")" of
    # ``callbackName(...);`` whatever the callback name's length is.
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("response is not a JSONP payload")
    return text[start + 1:end]


def get_comments(url, timeout=10):
    """Fetch one page of product comments from an unmodified (JSONP) URL.

    The response is read as text, the callback wrapper is removed and the
    remaining JSON document is decoded.

    Args:
        url: Comment-page URL to request, ``callback`` parameter included.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded JSON payload, or ``""`` when the request or the decoding
        failed (the error is printed rather than raised).
    """
    # Imported locally so the function does not depend on a module-level
    # ``import json`` being present.
    import json

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        # The stripped body is a plain str, which has no .json() method, so
        # it has to be decoded with json.loads.
        return json.loads(_strip_jsonp(r.text))
    except Exception as e:
        # Best-effort fetch: report the problem and hand back an empty value
        # so the caller can decide whether to skip this page.
        print("Error", e)
        return ""