Scraping WeChat Official Account Articles with Python

This article shares working code for scraping WeChat official account articles with Python, for your reference. The details are as follows.

Put two files in one Python project directory: 采集公众号文章.py (the crawler) and config.py (the settings). The code is as follows:

1. 采集公众号文章.py

from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq

from config import *

# Configure MongoDB
client = pymongo.MongoClient(mongo_uri)
db = client[mongo_db]

base_url = 'http://weixin.sogou.com/weixin?'

# Request headers. The Cookie below was captured from a logged-in browser session
# and will have expired; replace it with your own before running.
headers = {
    'Cookie': 'usid=s-pkm6vw_ac4ktr1; suv=00a75e9078efd9f75a6573ecad0ec883; wuid=aagcxershqaaaaqrgn4soagaaaa=; iploc=cn4414; suid=767beab73220910a000000005aa9e2aa; pgv_pvi=159197184; pgv_si=s8252565504; abtest=0|1521083055|v1; weixinindexvisited=1; sct=1; jsessionid=aaalxqkrp6jjs8ac4hwhw; ppinf=5|1521083238|1522292838|dhj1c3q6mtoxfgnsawvudglkojq6mjaxn3x1bmlxbmftzto2oiuzqsuyoxxjcnq6mta6mtuymta4mzizohxyzwzuawnrojy6jtnbjti5fhvzzxjpzdo0ndpvoxqybhvoaexncs1vlw1zbjmxmmnmskp4ogpzqhdlaxhpbi5zb2h1lmnvbxw; pprdig=tbvf7qlzddmjpcn4jtf3dg8c8nerx-ygdi8kucezn0rtewuhkgu4xmnaxzbakvquswboigl_rd-34abu6vy9jkv7me3bypigydniv2ljuchgco7gk58m9qhrm3aa7nhlhjfvyoaqkqgbsykpatxmnpe3tm57zdlzdpg_8mbmbnq; sgid=23-30671195-avqp42zctqiacybbdvvfwno4; phpsessid=4jjk2a9rv6kq7m50f42r92u3r3; suir=d2df4e12a5a1c3ce1a8ad7f2a5fe18fe; ppmdig=1521087492000000855f9824f94abe82b25d2839135ad3a8; snuid=fef36d3f8882efec4fcf61e68801da49; seccoderight=success; successcount=1|thu, 15 mar 2018 04:23:23 gmt',
    'Host': 'weixin.sogou.com',
    'Referer': 'http://weixin.sogou.com/antispider/?from=%2fweixin%3fquery%3d%e9%a3%8e%e6%99%af%26type%3d2%26page%3d95%26ie%3dutf8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# Start with no proxy (requests go out from the local IP)
proxy = None

# Fetch a fresh proxy from the local proxy pool
def get_proxy():
    try:
        response = requests.get(proxy_pool_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None
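get_proxy assumes a proxy pool service is listening at proxy_pool_url and returns a bare ip:port string as plain text. The article does not include that service; if you do not already have one running, a minimal stand-in could look like the hypothetical sketch below (assumes Flask is installed; the PROXIES list is a placeholder you must fill in yourself):

# proxy_pool_stub.py -- hypothetical stand-in for the proxy pool that get_proxy() expects
import random

from flask import Flask

app = Flask(__name__)

# Placeholder list: fill in working ip:port proxies of your own
PROXIES = ['127.0.0.1:8888']

@app.route('/get')
def get():
    # Return a single proxy as plain text, which is what get_proxy() reads via response.text
    return random.choice(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)  # matches proxy_pool_url in config.py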

# Fetch a page, switching to a proxy once Sogou starts blocking requests
def get_html(url, count=1):
    print('Crawling', url)
    print('Trying count', count)
    global proxy
    if count >= max_count:
        print('Tried too many counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 redirect means Sogou's anti-spider page was triggered; get a proxy and retry
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using proxy', proxy)
                return get_html(url)
            else:
                print('Get proxy failed')
                return None
    except ConnectionError as e:
        print('Error occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)

# Build the search URL and fetch an index (search-results) page
def get_index(keyword, page):
    data = {
        'query': keyword,
        'type': 2,  # type=2 searches articles rather than accounts
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html
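urlencode percent-encodes the keyword, so the request URL that get_index builds looks like this:

from urllib.parse import urlencode

params = {'query': '计算机等级二级', 'type': 2, 'page': 1}
print('http://weixin.sogou.com/weixin?' + urlencode(params))
# http://weixin.sogou.com/weixin?query=%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%AD%89%E7%BA%A7%E4%BA%8C%E7%BA%A7&type=2&page=1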

# Parse the index page and extract the detail-page URLs
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')

# Fetch a detail (article) page
def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

# Parse the detail page and return the article title, content, date, account name, etc.
def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#post-date').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None

# Save to MongoDB, upserting on title so repeated articles are not duplicated
def save_to_mongo(data):
    # update_one with upsert=True replaces the deprecated Collection.update call
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to mongo', data['title'])
    else:
        print('Saved to mongo failed', data['title'])
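Because the upsert keys on title, saving the same article twice leaves a single document. A quick check, assuming a local MongoDB instance and pymongo 3.7+ (for count_documents); the sample values are made up for illustration:

sample = {'title': 'demo', 'content': '...', 'date': '2018-03-15',
          'nickname': 'test', 'wechat': 'test_account'}
save_to_mongo(sample)
save_to_mongo(sample)
print(db['articles'].count_documents({'title': 'demo'}))  # prints 1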

# Main loop: walk the first 100 result pages, fetch and parse each article, then store it
def main():
    for page in range(1, 101):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)

if __name__ == '__main__':
    main()

2. config.py

# Settings for the WeChat article crawler
proxy_pool_url = 'http://127.0.0.1:5000/get'  # local proxy pool API endpoint
keyword = '计算机等级二级'  # search keyword; change as needed
mongo_uri = 'localhost'  # MongoDB connection URI
mongo_db = 'data'  # database name
max_count = 5  # maximum retries per URL

Here, keyword in config.py is the search term and can be changed as needed. In testing, 采集公众号文章.py ran successfully; if a run fails because of Sogou's anti-spider limits, try running it a few more times.
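After a run, you can inspect what was stored. A short sketch, assuming MongoDB is running on localhost with the database and collection names used above:

import pymongo

client = pymongo.MongoClient('localhost')  # mongo_uri from config.py
# 'data' and 'articles' match mongo_db in config.py and the collection used by save_to_mongo
for doc in client['data']['articles'].find({}, {'title': 1, 'date': 1}).limit(5):
    print(doc.get('title'), doc.get('date'))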


That's all for this article. I hope it helps with your learning, and please continue to support 萬仟网.
