python 微信爬虫_Python爬虫实战(三) — 微信文章爬虫

最新推荐文章于 2024-06-18 13:56:08 发布

weixin_39880623

最新推荐文章于 2024-06-18 13:56:08 发布

阅读量435

点赞数 2

文章标签： python 微信爬虫

前言

最近烦心事挺多的，能让我得到快乐的是一行行能够运行的代码，那么今天为大家带来微信文章爬取实战。

本篇目标

根据关键词搜索微信文章，并提取文章链接

自动保存微信文章，并保存为HTML格式

实现设置提取文章数目，并提供有关交互操作

快速开始

1.确定URL链接格式

首先打开搜狗微信搜索平台，任意搜索一个感兴趣的关键词，观察网址http://weixin.sogou.com/weixin?type=2&query=%E5%B0%8F%E7%B1%B3&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&w=01015002&oq=&ri=0&sourceid=sugg&sut=0&sst0=1520605349656&lkt=0%2C0%2C0&p=40040108

这里我输入的是小米，可以观察到，链接中type字段控制检索信息的类型，query字段则对应于我们的关键词。因为关键词会被自动进行URL编码，所以我们看到的是%E5...这样的信息。其他字段与本文无关，这里不做讨论。下面点击下一页，链接变为：

http://weixin.sogou.com/weixin?oq=&query=%E5%B0%8F%E7%B1%B3&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=0&_sug_=n&type=2&sst0=1520605349656&page=2&ie=utf8&p=40040108&dp=1&w=01015002&dr=1

观察到链接中多出了一个page字段，对应于数字2，猜测这就是页码，尝试之后，果不其然，我们的猜测是正确的，下面我们来爬取文章的具体内容

2.提取文章地址

按F12打开开发者工具，查看页面元素，可以发现文章网址位于class为txt-box的元素之下，如图

由此，我们很容易的写出我们的匹配模式代码

'<div class="txt-box">.*?(http://.*?)"'

对应完整代码如下：

def getlisturl(key, pagestart, pageend):
    """Collect article links from Sogou WeChat search result pages.

    Args:
        key: Search keyword; it is URL-quoted before being sent.
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).

    Returns:
        A list with one entry per successfully downloaded page; each entry is
        the list of article URLs matched on that page. The list is empty when
        no page could be fetched or when ``pagestart > pageend``.
    """
    keycode = urllib.request.quote(key)
    # Compiled once outside the loop. Article links sit inside the "txt-box"
    # element; re.S lets ".*?" span the line breaks between tag and link.
    listurlpat = re.compile('<div class="txt-box">.*?(http://.*?)"', re.S)
    # Local accumulator, so the function does not rely on a module-level list.
    listurl = []
    for page in range(pagestart, pageend + 1):
        # "&page=" must stay unquoted: quoting it would turn it into part of
        # the query value and the page number would never reach the server.
        url = ("http://weixin.sogou.com/weixin?type=2&query="
               + keycode + "&page=" + str(page))
        try:
            data1 = downloader(url)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(10)
            continue
        except Exception as e:
            print("exception:" + str(e))
            time.sleep(1)
            continue
        if data1 is None:
            # The download failed; skip this page but keep earlier results.
            continue
        listurl.append(listurlpat.findall(data1))
    print("共获取到" + str(len(listurl)) + "页")
    return listurl

其中，key对应于关键词，pagestart对应于起始页码，默认为第一页，pageend对应于终止页码。

下载模块

我们需要从对应链接中下载内容，才能做到后面内容的提取，这里，下载模块我提供了两种方案，一种使用了代理服务器，一种没有使用，下面是没有使用代理服务器的代码：

def downloader(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is attached to the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request or the decoding
        fails. On failure the error is printed and the function pauses
        (10 seconds for URL errors, 1 second otherwise) before returning.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
               "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
               "Core/1.63.4793.400 QQBrowser/10.0.702.400")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installed globally so that urlopen() below sends the header as well.
    urllib.request.install_opener(opener)
    try:
        # The context manager closes the response even if read()/decode() fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    # Reached only after a handled error.
    return None

这里使用了一些简单的伪装，以防止微信对爬虫的阻止，但这样并不安全，所以在完整代码中还提供了代理服务器模块，但是因为免费的代理服务器几乎都不能用，所以在此不多讲解。

完整代码如下

import re

import urllib.request

import time

import urllib.error

# Disabled proxy-based variant of the downloader, kept for reference only.
# The whole definition lives inside a string literal, so it is never executed.
'''
def use_proxy(proxy_addr, url):
    """Download ``url`` through the HTTP proxy ``proxy_addr`` ("host:port").

    Returns the page decoded as UTF-8, or None when the request fails.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
               "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
               "Core/1.63.4793.400 QQBrowser/10.0.702.400")
    try:
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Attach the header to the proxy opener itself; building a second
        # opener afterwards would silently drop the User-Agent.
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return None
'''

def downloader(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is attached to the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request or the decoding
        fails. On failure the error is printed and the function pauses
        (10 seconds for URL errors, 1 second otherwise) before returning.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
               "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
               "Core/1.63.4793.400 QQBrowser/10.0.702.400")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installed globally so that urlopen() below sends the header as well.
    urllib.request.install_opener(opener)
    try:
        # The context manager closes the response even if read()/decode() fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    # Reached only after a handled error.
    return None

def getlisturl(key, pagestart, pageend):
    """Collect article links from Sogou WeChat search result pages.

    Args:
        key: Search keyword; it is URL-quoted before being sent.
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).

    Returns:
        A list with one entry per successfully downloaded page; each entry is
        the list of article URLs matched on that page. The list is empty when
        no page could be fetched or when ``pagestart > pageend``.
    """
    keycode = urllib.request.quote(key)
    # Compiled once outside the loop. Article links sit inside the "txt-box"
    # element; re.S lets ".*?" span the line breaks between tag and link.
    listurlpat = re.compile('<div class="txt-box">.*?(http://.*?)"', re.S)
    # Local accumulator, so the function does not rely on a module-level list.
    listurl = []
    for page in range(pagestart, pageend + 1):
        # "&page=" must stay unquoted: quoting it would turn it into part of
        # the query value and the page number would never reach the server.
        url = ("http://weixin.sogou.com/weixin?type=2&query="
               + keycode + "&page=" + str(page))
        try:
            data1 = downloader(url)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(10)
            continue
        except Exception as e:
            print("exception:" + str(e))
            time.sleep(1)
            continue
        if data1 is None:
            # The download failed; skip this page but keep earlier results.
            continue
        listurl.append(listurlpat.findall(data1))
    print("共获取到" + str(len(listurl)) + "页")
    return listurl

def getcontent(listurl, outpath="C://Users/59574/Desktop/python/test/WeChatSpider/1.html"):
    """Download every article in ``listurl`` and save them into one HTML file.

    Args:
        listurl: List of pages, each page being a list of article URLs.
            ``None`` is treated as an empty list.
        outpath: File that receives the generated HTML document. Any existing
            file at this path is overwritten.

    Returns:
        None. Progress and errors are printed; an article that cannot be
        downloaded is skipped.
    """
    html1 = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>微信文章页面</title>
</head>
<body>'''
    html2 = '''</body>
</html>
'''
    # Patterns are loop-invariant, so compile them once.
    titlepat = re.compile("<title>(.*?)</title>")
    contentpat = re.compile('id="js_content">(.*?)id="js_sg_bar"', re.S)

    # A single handle, closed by the context manager even if an error escapes.
    with open(outpath, "wb") as fh:
        fh.write(html1.encode("utf-8"))
        for i, pageurls in enumerate(listurl or []):
            for j, url in enumerate(pageurls):
                try:
                    # Links are taken from HTML source, where "&" is written
                    # as "&amp;"; dropping "amp;" restores the plain "&".
                    url = url.replace("amp;", "")
                    data = downloader(url)
                    if data is None:
                        # Download failed; nothing to extract for this link.
                        continue
                    title = titlepat.findall(data)
                    content = contentpat.findall(data)
                    # Fall back to empty text instead of reusing the values
                    # left over from the previous article.
                    thistitle = title[0] if title else ""
                    thiscontent = content[0] if content else ""
                    dataall = ("<p>标题为:" + thistitle + "</p><p>内容为:"
                               + thiscontent + "</p><br>")
                    fh.write(dataall.encode("utf-8"))
                    print("第" + str(i + 1) + "个网页第" + str(j + 1) + "条内容保存")
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
        fh.write(html2.encode("utf-8"))

if __name__ == '__main__':
    # Module-level list: getlisturl may append to a global named ``listurl``,
    # so the name has to exist before the call.
    listurl = []
    key = input('请输入要查询的关键词：')
    # proxy = "115.223.209.169:9000"  # only needed by the proxy downloader
    pagestart = 1
    # Keep asking until a usable page number is entered instead of crashing
    # with a traceback on non-numeric input.
    while True:
        try:
            pageend = int(input('请输入结束页码(每页保存10条内容)'))
        except ValueError:
            print('请输入整数页码')
            continue
        if pageend >= pagestart:
            break
        print('结束页码不能小于' + str(pagestart))
    listurl = getlisturl(key, pagestart, pageend)
    if listurl is None:
        # The search step reported a failure; save an empty document.
        listurl = []
    getcontent(listurl)

Github版本

运行结果

小结

开学了，事情有些多，个人的一些事情也让人难过，不过我会尽力更新，下一次我会为大家带来该代码多线程运行版本。

最后，希望大家都能尊重我的付出，转载我的文章请注明出处；

也希望大家关注我的个人博客 HD Blog

谢谢~

weixin_39880623

关注

2
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python 微信爬虫_Python爬虫实战(三) — 微信文章爬虫

前言最近烦心事挺多的，能让我得到快乐的是一行行能够运行的代码，那么今天为大家带来微信文章爬取实战。本篇目标根据关键词搜索微信文章，并提取文章链接自动保存微信文章，并保存为HTML格式实现设置提取文章数目，并提供有关交互操作快速开始1.确定URL链接格式首先打开搜狗微信搜索平台，任意搜索一个感兴趣的关键词，观察网址http://weixin.sogou.com/weixin?type=2&q...
复制链接

扫一扫