技术:
- 浏览器伪装
- Fiddler解析url的js文件
代码如下:
'''爬取腾讯评论
技术:浏览器伪装、每次加载评论的对应url也做规律性改变
网址url通过Fiddler抓包分析获取(https)
最后打算通过正则的方式剔除网页中一些不太友好的内容 但不是很理想'''
import json
import re
import urllib.error
import urllib.request
# Send a desktop-browser User-Agent with every request made through urllib.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Characters removed from every comment before it is printed.
pattern = '[<=>]+'
# Comment id used as the cursor of the very first request.
comid = "6165793094371986503"

# The request URL only differs in the ``commentid`` cursor between pages.
URL_PREFIX = "http://video.coral.qq.com/filmreviewr/c/upcomment/0dfpyvfa7tp0ewe?commentid="
URL_SUFFIX = "&reqnum=3&callback=jQuery1120026430801920245595_1478436999932&_=1478436999935"
url = URL_PREFIX + comid + URL_SUFFIX

# Compiled once instead of on every loop iteration.
LAST_RE = re.compile(r'"last":"(.*?)"')
# Matches a complete quoted string value, stepping over backslash escapes so
# that an escaped quote inside a comment does not end the match early.
CONTENT_RE = re.compile(r'"content":"((?:[^"\\]|\\.)*)"')
STRIP_RE = re.compile(pattern)


def decode_comment(raw):
    """Return *raw* with its JSON string escapes (``\\uXXXX`` etc.) decoded.

    SECURITY NOTE: the original code decoded escapes with
    ``eval('u"' + raw + '"')``, which executes text received from the
    network as Python code. ``json.loads`` decodes the same escapes without
    executing anything and also joins surrogate pairs into one character.

    If *raw* is not a valid JSON string body it is returned unchanged.
    """
    try:
        # strict=False tolerates raw control characters inside the string.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        return raw


def clean_comment(raw):
    """Decode *raw* and strip the characters matched by ``pattern``."""
    return STRIP_RE.sub('', decode_comment(raw))


def parse_page(data):
    """Extract the next cursor and the raw comment bodies from one response.

    Returns a ``(next_id, comments)`` tuple. ``next_id`` is the first
    ``"last"`` value found in *data*, or ``None`` when there is none;
    ``comments`` is the list of still-escaped ``"content"`` values.
    """
    cursor = LAST_RE.search(data)
    next_id = cursor.group(1) if cursor else None
    return next_id, CONTENT_RE.findall(data)


def main(pages=100):
    """Fetch up to *pages* batches of comments and print each comment.

    Each response supplies the cursor for the following request. The crawl
    stops early when a request fails or a response carries no cursor.
    """
    page_url = url
    for i in range(pages):
        try:
            # The context manager closes the connection; the timeout keeps a
            # stalled server from hanging the crawl forever.
            with urllib.request.urlopen(page_url, timeout=30) as response:
                data = response.read().decode("utf-8", errors="replace")
        except OSError as err:
            # urllib.error.URLError/HTTPError and socket timeouts are all
            # OSError subclasses.
            print("request failed, stopping:", err)
            break

        next_id, comments = parse_page(data)
        for j, raw in enumerate(comments):
            print("------第" + str(i) + str(j) + "条评论内容是:")
            print(clean_comment(raw))

        if not next_id:
            # No cursor in the response: nothing further to request.
            break
        page_url = URL_PREFIX + next_id + URL_SUFFIX


if __name__ == "__main__":
    main()