找了好多程序,终于找到一个能用的;经过简单调试,确认可以正常运行。
运行软件是 PyCharm 2021
环境是 Python 3.6
文章链接来源:搜狗微信文章搜索(搜狗微信搜索_订阅号及文章内容独家收录,一搜即达)
import requests
from lxml import etree
import os
def _safe_filename(title, fallback='untitled'):
    """Turn an article title into a string that is safe to use as a file name."""
    # Characters that Windows refuses in file names; control characters
    # (including newlines) are rejected as well.
    forbidden = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if (ch in forbidden or ord(ch) < 32) else ch for ch in title
    )
    # Windows also rejects names that end in a dot or a space.
    cleaned = cleaned.strip().rstrip('. ')
    return cleaned or fallback


def get_con(url, save_dir='D:/python/weixin', timeout=10):
    """Download a WeChat official-account article and save it as a text file.

    The title is read from ``<h1 class="rich_media_title">`` and the body from
    every ``<p>`` and ``<section>/<span>`` element of the page.  The result is
    written to ``<save_dir>/<title>.txt`` encoded as UTF-8.

    Args:
        url: Address of the article page to fetch.
        save_dir: Directory the text file is written to; it is created when
            missing.  Change the default to suit your own machine.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The response could not be parsed as HTML.
    """
    # Without a timeout a stalled connection would block forever, and without
    # the status check an error page would be saved as if it were the article.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    con = etree.HTML(response.text)
    if con is None:
        raise ValueError(f'no HTML content could be parsed from {url!r}')

    # Title: the <h1> text is usually surrounded by newlines and indentation,
    # so collapse whitespace in every text node and drop the empty ones
    # before joining them into a single line.
    title_nodes = con.xpath('//h1[@class="rich_media_title"]/text()')
    pieces = [' '.join(str(node).split()) for node in title_nodes]
    title = ','.join(piece for piece in pieces if piece)

    # Body: '|' in the XPath unions several selectors; each matched element
    # contributes its full text content followed by a newline.
    paragraphs = con.xpath('//p | //section/span')
    body = ''.join(node.xpath('string(.)') + '\n' for node in paragraphs)

    # Save the content.  The title is sanitised for the file name only; the
    # text written to the file keeps the title as extracted.
    con_text = f'{title}\n{body}\n'
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, _safe_filename(title) + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(con_text)
    print(f'《{title}》 文章保存成功!')
if __name__ == '__main__':
    # Script entry point: ask for the article address on the console, then
    # download that article and save it to disk.
    article_url = input("请输入要采集的微信公众号文章地址:")
    get_con(article_url)
运行截图:
文件获取如下: