Crawling WeChat Articles
import urllib.request
import re
import time

# Simulate a browser and go through a proxy when fetching data
def use_proxy(proxy_addr, url):
    # Pretend to be a normal browser
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    # Set the HTTP proxy (ProxyHandler, not HTTPHandler, takes the proxy dict)
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [headers]
    # Install the opener globally so plain urlopen() calls use it
    urllib.request.install_opener(opener)
    # Send the request and decode the response
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
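Note that install_opener() replaces the global opener, so every urlopen() call in the process afterwards goes through this proxy. If you would rather keep the proxy local to a single request, a minimal sketch (use_proxy_local is a name I made up; the stdlib calls are otherwise the same) is to call the opener directly:

# A minimal sketch: use the opener directly instead of installing it globally.
def use_proxy_local(proxy_addr, url):
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    # opener.open() routes only this call through the proxy; the global opener is untouched
    return opener.open(url).read().decode('utf-8')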
# Collect the article links on each result page
def getListUrl(keywords, pagestart, pageend, proxy):
    # Holds the article links gathered per page
    listUrls = []
    # URL-encode the search keyword
    keywords_encode = urllib.request.quote(keywords)
    # Fetch every result page and save its article links
    try:
        for page in range(pagestart, pageend):
            # "&page=" must stay unencoded, or the query-string separators break
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keywords_encode + "&page=" + str(page)
            print(url)
            data = use_proxy(proxy, url)
            print(data)
            # Filter out the article links on this page
            pattern = '<a data-z="art" target="_blank" id=".*?" href="(http://.*?)"'
            results = re.compile(pattern, re.S).findall(data)
            # Save this page's links
            listUrls.append(results)
        return listUrls
    except urllib.request.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On a URLError, back off for 10 seconds
        time.sleep(10)
    except Exception as e:
        print("An exception occurred:", e)
        # On any other exception, wait 1 second
        time.sleep(1)
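For reference, the page URL above can also be assembled with urllib.parse.urlencode, which percent-encodes the values while leaving the '&' and '=' separators intact; a small sketch (build_search_url is a name of my own, not from the original):

from urllib.parse import urlencode

def build_search_url(keywords, page):
    # urlencode quotes each value but keeps the separators between parameters
    query = urlencode({'type': 2, 'query': keywords, 'page': page})
    return "http://weixin.sogou.com/weixin?" + query

# build_search_url("物联网", 1)
# -> http://weixin.sogou.com/weixin?type=2&query=%E7%89%A9%E8%81%94%E7%BD%91&page=1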
# Fetch each article's title and content
def getContent(listUrls, proxy):
    # HTML header of the output file
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat Articles</title>
</head>
<body>'''
    dataStart = open("D:/python/file/9.html", "wb")
    dataStart.write(html1.encode('utf-8'))
    dataStart.close()
    # listUrls is a two-dimensional list: listUrls[i] holds page i's article links
    for i in range(len(listUrls)):
        for j in range(len(listUrls[i])):
            try:
                url = listUrls[i][j]
                # The scraped links contain extra "amp;" fragments that the browser's
                # address bar does not show, so strip them out before requesting the page
                deal_url = url.replace("amp;", "")
                # Fetch the article page through the proxy
                articleData = use_proxy(proxy, deal_url)
                # Regex for the article title
                title_pattern = '<title>(.*?)</title>'
                title = re.compile(title_pattern, re.S).findall(articleData)
                # Regex for the article content
                content_pattern = 'id="js_content">(.*?)id="js_sg_bar"'
                content = re.compile(content_pattern, re.S).findall(articleData)
                articleTitle = "No title found"
                articleContent = "No content found"
                if title != []:
                    articleTitle = title[0]
                if content != []:
                    articleContent = content[0]
                print("Title: " + articleTitle)
                print("Content: " + articleContent)
                detail = "<p>Title: " + articleTitle + "</p><p>Content: " + articleContent + "</p><br>"
                detailContent = open("D:/python/file/9.html", "ab")
                detailContent.write(detail.encode('utf-8'))
                detailContent.close()
                print("Processed link " + str(j + 1) + " of page " + str(i + 1))
            except urllib.request.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                # On a URLError, back off for 10 seconds
                time.sleep(10)
            except Exception as e:
                print("An exception occurred:", e)
                # On any other exception, wait 1 second
                time.sleep(1)
    # HTML footer of the output file
    html2 = '''</body>
</html>'''
    dataEnd = open("D:/python/file/9.html", "ab")
    dataEnd.write(html2.encode('utf-8'))
    dataEnd.close()
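The title and content lookups above repeat the same "first match or fallback" pattern; as a sketch, it can be factored into a small helper (extract_first is a hypothetical name, not part of the original code):

def extract_first(pattern, text, default):
    # Return the first regex capture in text, or the default when nothing matches
    matches = re.compile(pattern, re.S).findall(text)
    return matches[0] if matches else default

# Usage inside the loop above:
# articleTitle = extract_first('<title>(.*?)</title>', articleData, "No title found")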
# Proxy settings (these free proxies are long dead; substitute your own)
proxy1 = "344.48.5.154:8989"
proxy2 = "303.213.247.12:8989"
# range(pageStart, pageEnd) fetches page 1 only; raise pageEnd to crawl more pages
pageStart = 1
pageEnd = 2
# The search keyword ("Internet of Things")
keywords = "物联网"
listUrls = getListUrl(keywords, pageStart, pageEnd, proxy2)
getContent(listUrls, proxy2)
If you want to run this yourself, swap in your own proxy IPs; the free ones I used here have already expired.
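Since free proxies expire quickly, it can save time to test a proxy before starting a crawl. A minimal sketch using only the standard library (check_proxy and the test URL are my own choices, not from the original):

def check_proxy(proxy_addr, test_url="http://weixin.sogou.com", timeout=5):
    # True if the proxy can fetch the test URL within the timeout, False otherwise
    try:
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy)
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

# Only start crawling when check_proxy(proxy2) returns True.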