# WeChat crawler: automatically collects WeChat article pages, spoofing a
# browser User-Agent and routing all requests through a proxy IP.
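# Flow: walk the first ten result pages of Sogou's Weixin search for a
# keyword, pull each article URL out of the page's data-share attributes,
# and save every article's HTML to disk.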
import urllib.request
import urllib.error
import time
import re
def use_proxy(url, proxy_addr):
    """Fetch url through the given HTTP proxy, spoofing a browser User-Agent."""
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36')
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(5)
    except Exception as e:
        print('exception: ' + str(e))
        time.sleep(1)
    return None  # signal failure to the caller
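# A minimal per-request variant (a sketch, not part of the original script):
# the opener returned by build_opener can be used directly via opener.open(),
# which avoids the process-wide state that install_opener sets. The name
# use_proxy_local and the 10-second timeout are assumptions for illustration.
def use_proxy_local(url, proxy_addr):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy_addr}))
    return opener.open(req, timeout=10).read().decode('utf-8', 'ignore')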
key = urllib.request.quote("python")  # URL-encode the search keyword once, up front
proxy = "58.244.59.185:8080"  # proxy server address (a sample; replace with a live proxy)
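# Quick liveness probe for the proxy (a sketch; httpbin.org is a public echo
# service assumed reachable). Calling proxy_alive(proxy) before the crawl
# fails fast on a dead proxy instead of erroring on every page.
def proxy_alive(proxy_addr):
    # use_proxy returns None on any error, so a non-None result means the
    # proxy relayed the request successfully
    return use_proxy("http://httpbin.org/ip", proxy_addr) is not None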
for i in range(0, 10):
    thispageurl = ("https://weixin.sogou.com/weixin?query=" + key +
                   "&_sug_type_=&sut=1044&lkt=7%2C1567498024564%2C1567498025603"
                   "&s_from=input&_sug_=y&type=2&sst0=1567498025704"
                   "&page=" + str(i) + "&ie=utf8&w=01019900&dr=1")
    print(thispageurl)
    thispagedata = use_proxy(thispageurl, proxy)
    print(len(str(thispagedata)))
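    # Each search hit carries its article URL in a data-share attribute (an
    # assumption about Sogou's result markup as of this writing); a non-greedy
    # regex pulls all of them from the raw HTML.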
    pat1 = 'data-share="(.*?)"'
    rs1 = re.compile(pat1, re.S).findall(str(thispagedata))
    if len(rs1) == 0:
        print("page " + str(i) + " returned no results")
        continue
    for j in range(0, len(rs1)):
        thisurl = rs1[j].replace('&amp;', '&')  # unescape HTML entities in the URL
        file = "d:/25/page" + str(i) + "_article" + str(j) + ".html"  # d:/25/ must already exist
        thisdata = use_proxy(thisurl, proxy)
        if thisdata is None:
            print("page " + str(i) + " article " + str(j) + " failed to download")
            continue
        try:
            fh = open(file, 'w', encoding='utf-8')  # use_proxy returns str, so write in text mode
            fh.write(thisdata)
            fh.close()
            print("page " + str(i) + " article " + str(j) + " saved")
        except Exception as e:
            print(e)
            print("page " + str(i) + " article " + str(j) + " failed to save")