导入模块并输入要下载评论的商品编号
import requests
import pandas as pd
import json
from lxml import etree
import time
# Ask the user for the product id whose comments should be downloaded.
c_num = input(r"请输入要下载评论的商品编号:")
爬取产品总评论数
def d_cont(c_num, num):
    """Fetch one page of JD.com comments and compute the total page count.

    Args:
        c_num: Product id, interpolated into the ``productId`` query parameter.
        num: Zero-based page index; coerced with ``int()`` before use.

    Returns:
        A ``(num1, dt)`` tuple. ``num1`` is the integer number of pages
        needed to cover ``commentCount`` at 10 comments per page, and ``dt``
        is the decoded JSON payload of the requested page.

    Raises:
        json.JSONDecodeError: If the body is not valid JSON once the JSONP
            wrapper has been removed.
        KeyError: If the payload has no
            ``productCommentSummary.commentCount`` entry.

    Errors raised by ``requests.get`` (for example on timeout) propagate
    to the caller unchanged.
    """
    page_size = 10
    callback = "fetchJSON_comment98"
    url = (
        "https://club.jd.com/comment/productPageComments.action"
        f"?callback={callback}&productId={c_num}&score=0&sortType=5"
        f"&page={int(num)}&pageSize={page_size}&isShadowSku=0&rid=0&fold=1"
    )
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/81.0.4044.92 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers, timeout=5).text

    # Remove only the outer JSONP wrapper. A blanket str.replace would also
    # delete any ");" that happens to occur inside a comment's text.
    payload = response.strip()
    prefix = callback + "("
    if payload.startswith(prefix):
        payload = payload[len(prefix):].rstrip(";").rstrip()
        if payload.endswith(")"):
            payload = payload[:-1]

    dt = json.loads(payload)
    c_cont = dt['productCommentSummary']['commentCount']
    # Ceiling division keeps the result an int that range() accepts and
    # adds exactly one page for a partial last page.
    num1 = -(-c_cont // page_size)
    return num1, dt
获取产品评论、型号、颜色并保存
def download_cont(c_num, num1):
    """Download every comment page of a product and append it to a CSV file.

    Args:
        c_num: Product id forwarded to ``d_cont``.
        num1: Number of pages to fetch. Coerced with ``int()`` so that a
            float page count is still accepted by ``range``.

    Side effects:
        Appends one batch of rows per page (comment text, colour, model) to
        ``e:/京东评论.csv`` without a header row, prints a progress line per
        page, and sleeps 3 seconds after each page.
    """
    for page in range(int(num1)):
        # Use the payload returned for this page; reading a module-level
        # ``dt`` would write the first page's comments over and over.
        _, page_data = d_cont(c_num, page)
        dt_list = page_data['comments']
        cont = [item["content"] for item in dt_list]
        color = [item["productColor"] for item in dt_list]
        size = [item["productSize"] for item in dt_list]
        date = pd.DataFrame({"评价": cont, "颜色": color, "型号": size})
        # Number the rows of each page starting from 1 instead of 0.
        date.index = date.index + 1
        # NOTE(review): the "ANSI" codec name and the e:/ path look
        # Windows-specific -- confirm before running on another platform.
        date.to_csv("e:/京东评论.csv", mode="a", header=0, encoding="ANSI")
        # Derive the page number from the loop index so it actually advances.
        print(f"第{page + 1}页下载完成")
        time.sleep(3)
调用函数
# Fetch page 0 first to learn the total page count; dt is bound at module level.
num1,dt = d_cont(c_num,0)
# Loop over num1 pages and append their comments to the CSV file.
download_cont(c_num,num1)
完整代码
import requests
import pandas as pd
import json
from lxml import etree
import time
# Ask the user for the product id whose comments should be downloaded.
c_num = input(r"请输入要下载评论的商品编号:")
def d_cont(c_num, num):
    """Fetch one page of JD.com comments and compute the total page count.

    Args:
        c_num: Product id, interpolated into the ``productId`` query parameter.
        num: Zero-based page index; coerced with ``int()`` before use.

    Returns:
        A ``(num1, dt)`` tuple. ``num1`` is the integer number of pages
        needed to cover ``commentCount`` at 10 comments per page, and ``dt``
        is the decoded JSON payload of the requested page.

    Raises:
        json.JSONDecodeError: If the body is not valid JSON once the JSONP
            wrapper has been removed.
        KeyError: If the payload has no
            ``productCommentSummary.commentCount`` entry.

    Errors raised by ``requests.get`` (for example on timeout) propagate
    to the caller unchanged.
    """
    page_size = 10
    callback = "fetchJSON_comment98"
    url = (
        "https://club.jd.com/comment/productPageComments.action"
        f"?callback={callback}&productId={c_num}&score=0&sortType=5"
        f"&page={int(num)}&pageSize={page_size}&isShadowSku=0&rid=0&fold=1"
    )
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/81.0.4044.92 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers, timeout=5).text

    # Remove only the outer JSONP wrapper. A blanket str.replace would also
    # delete any ");" that happens to occur inside a comment's text.
    payload = response.strip()
    prefix = callback + "("
    if payload.startswith(prefix):
        payload = payload[len(prefix):].rstrip(";").rstrip()
        if payload.endswith(")"):
            payload = payload[:-1]

    dt = json.loads(payload)
    c_cont = dt['productCommentSummary']['commentCount']
    # Ceiling division keeps the result an int that range() accepts and
    # adds exactly one page for a partial last page.
    num1 = -(-c_cont // page_size)
    return num1, dt
def download_cont(c_num, num1):
    """Download every comment page of a product and append it to a CSV file.

    Args:
        c_num: Product id forwarded to ``d_cont``.
        num1: Number of pages to fetch. Coerced with ``int()`` so that a
            float page count is still accepted by ``range``.

    Side effects:
        Appends one batch of rows per page (comment text, colour, model) to
        ``e:/京东评论.csv`` without a header row, prints a progress line per
        page, and sleeps 3 seconds after each page.
    """
    for page in range(int(num1)):
        # Use the payload returned for this page; reading a module-level
        # ``dt`` would write the first page's comments over and over.
        _, page_data = d_cont(c_num, page)
        dt_list = page_data['comments']
        cont = [item["content"] for item in dt_list]
        color = [item["productColor"] for item in dt_list]
        size = [item["productSize"] for item in dt_list]
        date = pd.DataFrame({"评价": cont, "颜色": color, "型号": size})
        # Number the rows of each page starting from 1 instead of 0.
        date.index = date.index + 1
        # NOTE(review): the "ANSI" codec name and the e:/ path look
        # Windows-specific -- confirm before running on another platform.
        date.to_csv("e:/京东评论.csv", mode="a", header=0, encoding="ANSI")
        # Derive the page number from the loop index so it actually advances.
        print(f"第{page + 1}页下载完成")
        time.sleep(3)
# Fetch page 0 first to learn the total page count; dt is bound at module level.
num1,dt = d_cont(c_num,0)
# Loop over num1 pages and append their comments to the CSV file.
download_cont(c_num,num1)
项目思路
#1. 评论页面为动态页面
#2. JSON 数据有时返回不全,考虑为请求设置 timeout,并在翻页之间加延迟
#3. 函数1(d_cont)需要同时返回总页数和解析后的 JSON 数据