通过分析Ajax请求可以看到天猫评论的地址规律变化:
http://rate.tmall.com/list_detail_rate.htm?itemId=560016036932&sellerId=2103295383&currentPage=1
其中itemId是商品id,sellerId是卖家id,currentPage是页码
那么拿到一个商品的URL后,通过分析页面源代码可以筛选出itemId和sellerId
得到itemId和sellerId后,把它们填入评论地址模板中对应的位置
然后得到评论信息的源代码
通过正则表达式得到评论内容
import requests
from requests.exceptions import RequestException #添加异常
import re
def find_one_url(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Best-effort fetch: network failures are reported as "no document".
        return None
    if response.status_code == 200:
        return response.text
    return None
def put_out_informetion(url):
    """Find the item and seller ids in a product page and print its reviews.

    The page at ``url`` is downloaded with ``find_one_url``; the first
    ``itemId=<digits>`` and ``userId=<digits>`` occurrences in its source are
    printed and then passed to ``anlay_the_juge``.

    Args:
        url: Address of the product page.

    Returns:
        None. If the page cannot be fetched or either id is missing, a short
        message is printed and the function returns without raising.
    """
    page = find_one_url(url)
    if page is None:
        # find_one_url signals any failed request with None; re.search would
        # raise TypeError on it.
        print('failed to fetch product page: {}'.format(url))
        return

    # Raw strings keep ``\d`` from being treated as a string escape sequence.
    item_match = re.search(r'itemId=(\d+)', page)
    seller_match = re.search(r'userId=(\d+)', page)
    if item_match is None or seller_match is None:
        print('itemId/userId not found in product page: {}'.format(url))
        return

    item_id = item_match.group(1)
    seller_id = seller_match.group(1)
    print(item_id)
    print(seller_id)
    anlay_the_juge(item_id, seller_id)
def anlay_the_juge(itemID, userID):
    """Download the first review page for an item and print the review texts.

    Args:
        itemID: Item id, placed in the ``itemId`` query parameter.
        userID: Seller id, placed in the ``sellerId`` query parameter.

    Returns:
        None. The list of matched review strings is printed; if the review
        page cannot be fetched, a short message is printed instead.
    """
    # The previous template read ``sellerId={}3¤tPage=1``: a stray "3" was
    # appended to the seller id and "&currentPage" had been mangled into
    # "¤tPage", so the page parameter was never sent.
    url = (
        'http://rate.tmall.com/list_detail_rate.htm'
        '?itemId={}&sellerId={}&currentPage=1'
    ).format(itemID, userID)

    body = find_one_url(url)
    if body is None:
        # find_one_url returns None on any failed request.
        print('failed to fetch reviews: {}'.format(url))
        return

    # re.S lets ``.`` span newlines inside a review's text.
    review_pattern = re.compile('rateContent.*?:"(.*?)","rateDate"', re.S)
    print(review_pattern.findall(body))
def main():
    """Run the scraper against a fixed sample product page."""
    # Adjacent literals concatenate to the same single URL string.
    product_url = (
        'https://chaoshi.detail.tmall.com/item.htm'
        '?spm=a230r.1.14.6.338a6494ARFiQO'
        '&id=522151264418'
        '&cm_id=140105335569ed55e27b'
        '&abbucket=9'
    )
    put_out_informetion(product_url)
# Called at module level: the scrape runs whenever this file is executed or
# imported (there is no ``__name__ == "__main__"`` guard).
main()