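# Automatically fetching Ajax (asynchronously loaded) data, and a Tencent Video comment crawler.
# Two ways to build a crawler: low-level development (close to the raw requests) or framework-based development,
# e.g. with the Scrapy framework.
# For the hard parts of a crawl, use PhantomJS (it copes with complex pages, but it is slow); afterwards hand the
# resulting cookies and URLs over to Scrapy or urllib, because those run much more efficiently.
# XPath expressions are also used for information extraction. They are more efficient than regular expressions
# but less powerful, and are generally only suited to extracting data from XML-formatted documents.
# To handle asynchronously loaded data, the first thing to try is packet capture: with Fiddler running, scroll the
# page down so that the follow-up request URLs appear, then work out the URL pattern from the captured requests.
# 1. Tencent Video comment crawler (single page of comments)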
import json
import re
import urllib.request
# Request parameters for the single-page crawl.
vid = "1743283224"  # video id
cid = "6399981406690050721"  # comment id, sent as the "cursor" query parameter
num = "20"  # sent as the "orinum" query parameter (presumably comments per page)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type": "application/javascript",
}

# Captures the body of every "content" string in the response. Unlike a lazy
# ``(.*?)``, this does not stop at an escaped quote (\") inside a comment.
COMMENT_PATTERN = re.compile(r'"content":"((?:[^"\\]|\\.)*)"')


def build_comment_url(vid, cursor, num):
    """Return the comment-list URL for video *vid* starting at *cursor*.

    Args:
        vid: Video id, as a string.
        cursor: Comment id used as the "cursor" query parameter.
        num: Value for the "orinum" query parameter, as a string.
    """
    return (
        "https://video.coral.qq.com/varticle/" + vid
        + "/comment/v2?cursor=" + cursor + "&orinum=" + num
    )


def install_headers(headers):
    """Install a global urllib opener that sends *headers* on every request.

    Returns:
        The opener that was installed.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    return opener


def fetch_text(url):
    """Download *url* and return the body decoded as UTF-8.

    The response is closed as soon as the body has been read.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def extract_comments(data):
    """Return the raw (still escaped) text of every "content" field in *data*."""
    return COMMENT_PATTERN.findall(data)


def decode_comment(raw):
    """Turn JSON escapes (``\\uXXXX``, ``\\n``, ``\\"`` ...) in *raw* into text.

    The captured text is parsed as a JSON string literal, so surrogate pairs
    (emoji) are combined correctly. If *raw* is not a valid JSON string body
    it is returned unchanged rather than raising.
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return raw


def crawl_single_page(vid, cid, num):
    """Fetch one page of comments, print the raw response and each comment.

    Args:
        vid: Video id.
        cid: Comment id to start from (the page cursor).
        num: Value for the "orinum" query parameter.

    Returns:
        The list of decoded comment strings found on the page.
    """
    data = fetch_text(build_comment_url(vid, cid, num))
    print(data)
    comments = [decode_comment(raw) for raw in extract_comments(data)]
    for comment in comments:
        try:
            print("评论内容是:" + comment)
            print("_______________")
        except UnicodeError as err:
            # Best effort: a console that cannot encode a character (e.g. an
            # emoji on a GBK terminal) must not abort the remaining comments.
            print(err)
    return comments


# Only crawl when executed as a script, so importing this file has no network
# side effects.
if __name__ == "__main__":
    install_headers(headers)
    crawl_single_page(vid, cid, num)
# 2. Tencent Video comment crawler (automatically advances to the next page of comments): fetches the short-comment text
import urllib.request
import re
# Request parameters for the paging crawl.
vid = "1743283224"  # video id
cid = "0"  # initial value of the "cursor" query parameter
num = "15"  # sent as the "orinum" query parameter (presumably comments per page)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type": "application/javascript",
}

# Captures the body of every "content" string; escaped quotes (\") inside a
# comment do not end the match.
PAGE_COMMENT_PATTERN = re.compile(r'"content":"((?:[^"\\]|\\.)*)"')
# Captures the value of the first "last" field, used as the next page cursor.
LAST_CURSOR_PATTERN = re.compile(r'"last":"(.*?)"')


def comment_page_url(vid, cursor, num):
    """Return the comment-list URL for video *vid* starting at *cursor*."""
    return (
        "https://video.coral.qq.com/varticle/" + vid
        + "/comment/v2?cursor=" + cursor + "&orinum=" + num
    )


def use_headers(headers):
    """Install a global urllib opener that sends *headers* on every request.

    Returns:
        The opener that was installed.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    return opener


def read_page(url):
    """Download *url* and return the body as text.

    Bytes that are not valid UTF-8 are dropped ("ignore"), and the response
    is closed as soon as the body has been read.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def find_comments(data):
    """Return the raw (still escaped) text of every "content" field in *data*."""
    return PAGE_COMMENT_PATTERN.findall(data)


def find_next_cursor(data):
    """Return the value of the first "last" field in *data*, or None if absent."""
    match = LAST_CURSOR_PATTERN.search(data)
    return match.group(1) if match else None


def unescape_comment(raw):
    """Turn JSON escapes (``\\uXXXX``, ``\\n``, ``\\"`` ...) in *raw* into text.

    This replaces the previous ``eval('u"' + raw + '"')``: evaluating text
    downloaded from the network as Python source is unsafe, and it left
    surrogate pairs (emoji) as two lone surrogates that cannot be printed.
    Parsing the text as a JSON string literal is safe and combines surrogate
    pairs. Text that is not a valid JSON string body is returned unchanged.
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return raw


def crawl_all_pages(vid, cursor, num, max_pages=100):
    """Print the comments of up to *max_pages* consecutive comment pages.

    After each page the cursor is replaced by the page's "last" value.
    Crawling stops early when the response carries no "last" value, an empty
    one, or the cursor that was just requested, instead of raising IndexError
    or requesting the same page again.

    Args:
        vid: Video id.
        cursor: Cursor of the first page to request.
        num: Value for the "orinum" query parameter.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        The number of pages that were fetched.
    """
    fetched = 0
    for page in range(max_pages):
        print("第" + str(page) + "页")
        data = read_page(comment_page_url(vid, cursor, num))
        fetched += 1
        for raw in find_comments(data):
            try:
                print("评论内容是:" + unescape_comment(raw))
                print("________")
            except UnicodeError as err:
                # Best effort: a console that cannot encode a character must
                # not abort the remaining comments.
                print(err)
        next_cursor = find_next_cursor(data)
        if not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor
    return fetched


# Only crawl when executed as a script, so importing this file has no network
# side effects.
if __name__ == "__main__":
    use_headers(headers)
    crawl_all_pages(vid, cid, num)