微信爬虫实战
'''
搜狗微信爬虫参考学习
搜狗反爬机制做得很好,所以容易触发验证码审核
# 封装的用户代理和IP代理方法,方法参数为访问链接,方法内部最多循环5次访问,若换用IP和用户代理后仍不能成功访问网站则停止
def ua_ip(myurl):
    """Fetch *myurl* through a random User-Agent / HTTP-proxy pair.

    Retries up to 5 times, re-rolling the UA/proxy combination on each
    attempt.  Returns the page text decoded as UTF-8, or "" when every
    attempt failed (the original returned datas[0] unconditionally,
    which raised IndexError on total failure).
    """
    import urllib.request
    import random
    # NOTE(review): the UA strings in the notes had their spaces stripped
    # by copy/paste; restored here so the header looks like a real browser.
    uapools = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    ]
    ippools = [
        "107.0.68.29:3128",
        "66.228.54.238:8080",
        "142.54.191.252:8080",  # fixed: the closing quote was missing here
    ]

    def api(ippools, uapools):
        # Install a fresh global opener using a random proxy + User-Agent.
        thisua = random.choice(uapools)
        print(thisua)
        thisip = random.choice(ippools)
        print(thisip)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        opener.addheaders = [("User-Agent", thisua)]
        urllib.request.install_opener(opener)

    datas = []
    for _ in range(5):
        try:
            api(ippools, uapools)
            data = urllib.request.urlopen(myurl).read().decode("utf-8", "ignore")
            datas.append(data)
            break
        except Exception as err:
            print(err)
    return datas[0] if datas else ""
# Business logic: search Sogou-WeChat for articles matching `key`,
# then fetch every article page through ua_ip().
import urllib.request, re
key = "Python"
# Quote the keyword ONCE, outside the loop: the original re-quoted it on
# every iteration, which double-encodes it as soon as range() covers more
# than one page.
key = urllib.request.quote(key)
# Hoisted: the link-extraction pattern is loop-invariant.
linkpat = re.compile('<div class="txt-box">.*?href="(.*?)"', re.S)
for i in range(0, 1):
    thispageurl = ("https://weixin.sogou.com/weixin?query=" + key
                   + "&type=2&page=" + str(i + 1) + "&ie=utf8")
    thispagedata = ua_ip(thispageurl)
    print(len(thispagedata))
    rsttmp = linkpat.findall(thispagedata)
    if not rsttmp:
        print("本次没有爬取成功!")
        continue
    for thisurl in rsttmp:
        print("原始链接:" + thisurl)
        # Result hrefs are relative ("/link?..."); make them absolute.
        thisurl = thisurl.replace("/link", "https://weixin.sogou.com/link")
        print("完善后链接:" + thisurl)
        thisdata = ua_ip(thisurl)
        print("文章爬取成功,长度为:" + str(len(thisdata)))
# fh = open("./sgwxpage.html", "w")
# fh.write(thispagedata)
# fh.close()
'''
抓包分析实战
'''
抓包分析就是使用软件将访问网页的加载地址行为都获得到
Windows一般使用Fiddler
Mac一般使用Charles
相应软件使用和HTTPS内容抓取,详情看:Python网络爬虫-抓包工具篇(Charles安装证书抓取HTTPS)和其他网上公开视频博客资料
'''
自动进行ajax异步请求数据
- 自动进行ajax异步请求数据思路
- 腾讯视频评论爬虫实战
'''
如果要通过爬虫实现Ajax异步请求,可以通过抓包分析进行实现
例如:向下拉取页面,重新加载出数据,这时观察Charles里面触发的新网址,从中寻找规律。
'''
'''
腾讯视频老版深度解读评论链接格式:
https://video.coral.qq.com/filmreviewr/c/upcomment/[视频id]?commentid=[评论id]&reqnum=[每次提取的评论数]
当前版深度解读评论链接格式:
https://video.coral.qq.com/filmreviewr/c/upcomment/[视频id]?callback=_filmreviewrcupcommentihhsfwvvhcm16nd&reqnum=[每次提取的评论数]&commentid=[评论id]
老版短评链接格式:
https://video.coral.qq.com/filmreviewr/c/upcomment/[视频id]?commentid=[评论id]&reqnum=[每次提取的评论的个数]
时间有限当前版没做分析
'''
'''
# 1. Tencent Video in-depth-review comments: single-page crawler
import urllib.request
import re

vid = "ihhsfwvvhcm16nd"      # video id
cid = "6637814391144053894"  # starting comment id
num = "5"                    # number of comments per request
# Build the comment-API url (same byte content as the original literal)
url = ("https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid
       + "?callback=_filmreviewrcupcommentihhsfwvvhcm16nd&reqnum=" + num
       + "&commentid=" + cid)
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
           "Content-Type":"application/javascript",
           }
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())  # replaces the manual tuple-append loop
urllib.request.install_opener(opener)

def _unescape(s):
    # Decode the \uXXXX escapes carried in the JSON payload WITHOUT eval():
    # the original eval('u"' + s + '"') executes arbitrary text coming off
    # the network, and also breaks whenever a comment contains a quote.
    return s.encode("latin-1", "backslashreplace").decode("unicode_escape")

# Fetch the current page of comments
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
titleall = re.findall('"title":"(.*?)"', data, re.S)
commentall = re.findall('"content":"(.*?)"', data, re.S)
# zip() pairs titles with comments safely even if the lists differ in
# length (indexing commentall[i] could raise IndexError in the original).
for title, comment in zip(titleall, commentall):
    try:
        print("评论标题是:" + _unescape(title))
        print("评论内容是:" + _unescape(comment))
        print("=======================")
    except Exception as err:
        print(err)
'''
'''
# 2. Same crawler, but auto-advances pages: each response carries a
# "last" field holding the comment id to use as `commentid` next time.
import urllib.request
import re

vid = "ihhsfwvvhcm16nd"
cid = "6637814391144053894"  # advanced after every page via "last"
num = "3"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
           "Content-Type":"application/javascript",
           }
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

# Hoisted loop-invariant compiled patterns
titlepat = re.compile('"title":"(.*?)"', re.S)
commentpat = re.compile('"content":"(.*?)"', re.S)
lastpat = re.compile('"last":"(.*?)"', re.S)

def _unescape(s):
    # Decode \uXXXX escapes without eval() — eval'ing network data is unsafe
    # and breaks when a comment contains a quote character.
    return s.encode("latin-1", "backslashreplace").decode("unicode_escape")

# Crawl up to 5 pages of comments
for i in range(0, 5):
    print("第" + str(i) + "页")
    thisurl = ("https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid
               + "?callback=_filmreviewrcupcommentihhsfwvvhcm16nd&reqnum=" + num
               + "&commentid=" + cid)
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    titleall = titlepat.findall(data)
    commentall = commentpat.findall(data)
    lasts = lastpat.findall(data)
    for title, comment in zip(titleall, commentall):
        try:
            print("评论标题是:" + _unescape(title))
            print("评论内容是:" + _unescape(comment))
            print("=======================")
        except Exception as err:
            print(err)
    if not lasts:
        # Robustness fix: the original did findall(...)[0] unconditionally
        # and crashed with IndexError when the response had no "last" marker.
        break
    cid = lasts[0]
'''
'''
# 3. Same paging crawler against the OLD comment-url format
# (plain commentid/reqnum query params, no callback wrapper).
import urllib.request
import re

vid = "j6cgzhtkuonf6te"
cid = "6233603654052033588"  # advanced after every page via "last"
num = "3"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
           "Content-Type":"application/javascript",
           }
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

# Hoisted loop-invariant compiled patterns
commentpat = re.compile('"content":"(.*?)"', re.S)
lastpat = re.compile('"last":"(.*?)"', re.S)

def _unescape(s):
    # Decode \uXXXX escapes without eval() — eval'ing network data is unsafe
    # and breaks when a comment contains a quote character.
    return s.encode("latin-1", "backslashreplace").decode("unicode_escape")

# Crawl up to 5 pages of comments
for i in range(0, 5):
    print("第" + str(i) + "页")
    thisurl = ("https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid
               + "?commentid=" + cid + "&reqnum=" + num)
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    commentall = commentpat.findall(data)
    lasts = lastpat.findall(data)
    for comment in commentall:
        try:
            print("评论内容是:" + _unescape(comment))
            print("=======================")
        except Exception as err:
            print(err)
    if not lasts:
        # Robustness fix: the original indexed findall(...)[0] unconditionally
        # and crashed with IndexError when the response had no "last" marker.
        break
    cid = lasts[0]
'''