python快速保存微信公众号文章中的图片(可指定多个文章)

若竹之心

已于 2022-07-02 14:15:41 修改

阅读量622

点赞数 1

分类专栏： Python、源码、微信公众号文章　｜　文章标签： python、微信公众号文章

于 2022-06-22 14:30:23 首次发布

本文链接：https://blog.csdn.net/qq_45365214/article/details/125405386

版权

Python 同时被 3 个专栏收录

106 篇文章 29 订阅

订阅专栏

源码

57 篇文章 5 订阅

订阅专栏

微信公众号文章

1 篇文章 0 订阅

订阅专栏

系列文章目录

文章目录

系列文章目录
前言
一、实现效果(以槿泉壁纸为例)
二、实现过程
三、源码
- 1.source code
四、Python正则表达式匹配日期与时间
五、单个链接下载图片
六、多个链接下载文章
七、单链接下载音频
总结

前言

一、实现效果(以槿泉壁纸为例)

在这里插入图片描述

二、实现过程

1.新建一个 link.txt 文本文件，将需要下载的文章链接逐行保存（每行一个链接）；

在这里插入图片描述

2.新建一个.py文件，将下面的源码复制进去；

在这里插入图片描述

3.新建一个pic文件夹，用来保存图片；

在这里插入图片描述

4.运行即可；

三、源码

1.source code

代码如下（示例）：

import requests
from re import findall
from bs4 import BeautifulSoup
import time
import os
import sys


# Module-level state filled in by get_weixin_html() and read by the caller.
weixin_title = ""
weixin_time = ""


def get_weixin_html(url):
    """Fetch a WeChat article and return its body HTML with images enabled.

    Side effects: stores the article title in the module-level
    ``weixin_title`` and the first ``YYYY-M-D`` style date found in the page
    source in ``weixin_time`` (empty string when no such date is present).

    Args:
        url: Address of the article page.

    Returns:
        The markup of the ``js_content`` element with its first ``<div>``
        forced visible and every ``data-src="<url>"`` attribute rewritten to
        ``src="<url>"``.

    Raises:
        ValueError: If the page has no ``<h1>`` or no ``js_content`` element.
    """
    global weixin_time, weixin_title
    # A timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, "html.parser")

    title_tag = soup.find('h1')
    if title_tag is None:
        raise ValueError("未找到文章标题：" + str(url))
    # get_text() also works when the <h1> holds several children, a case in
    # which .string can be None.
    weixin_title = title_tag.get_text().strip()

    # The first date-looking string in the raw page is taken as the time.
    dates = findall(r"(\d{4}-\d{1,2}-\d{1,2})", res.text)
    weixin_time = dates[0] if dates else ""

    content = soup.find(id='js_content')
    if content is None:
        raise ValueError("未找到文章正文：" + str(url))
    soup2 = BeautifulSoup(str(content), "html.parser")
    if soup2.div is not None:
        soup2.div['style'] = 'visibility: visible;'
    html = str(soup2)

    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    # Each distinct URL needs only one replace() pass over the document.
    for link in set(findall(pattern, html)):
        html = html.replace('data-src="' + link + '"', 'src="' + link + '"')

    return html

def download_pic(content):
    """Download every URL found in *content* and link the HTML to local copies.

    Files are written to ``pic/<path>/<n>.<ext>``, where ``path`` is the
    module-level variable read at call time and ``n`` is the 1-based position
    of the URL in *content*. The extension is ``.png`` or ``.gif`` when that
    word occurs in the URL (not at index 0), otherwise ``.jpg``.

    Each URL is tried up to 10 times with a one-second pause between tries;
    a URL that never succeeds is reported and left untouched in the HTML.

    Args:
        content: HTML text to scan for ``http``/``https`` URLs.

    Returns:
        *content* with every successfully downloaded URL replaced by the
        relative path of the saved file.
    """
    pic_path = 'pic/' + str(path) + '/'
    os.makedirs(pic_path, exist_ok=True)

    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    pic_list = findall(pattern, content)

    for index, item in enumerate(pic_list, 1):
        pic_url = str(item)

        if pic_url.find('png') > 0:
            file_name = str(index) + '.png'
        elif pic_url.find('gif') > 0:
            file_name = str(index) + '.gif'
        else:
            file_name = str(index) + '.jpg'
        local_path = pic_path + file_name

        for _attempt in range(10):
            try:
                data = requests.get(pic_url, timeout=30)
                with open(local_path, "wb") as f:
                    f.write(data.content)
            except (requests.RequestException, OSError):
                # Only network and file errors are retried; Ctrl-C and
                # programming errors are no longer swallowed.
                time.sleep(1)
                continue

            content = content.replace(pic_url, local_path)
            print('已下载第' + str(index) + '张图片.')
            time.sleep(1)  # pause between downloads
            break
        else:
            # Reached only when all 10 attempts failed, so a success on the
            # last attempt is no longer reported as an error.
            print("下载出错：", pic_url)
    return content


def get_link(dir):
    """Read article links from a text file, one link per line.

    Args:
        dir: Path of the text file to read (a file, despite the name).

    Returns:
        The non-empty lines of the file in order, with surrounding
        whitespace removed. Blank lines are skipped so they are never
        handed to the downloader as URLs.
    """
    with open(dir, 'r') as file_to_read:
        stripped = (line.strip() for line in file_to_read)
        return [line for line in stripped if line]

# ``path`` first names the link file; the loop below later rebinds it to the
# article number, which download_pic() reads to pick its output sub-folder.
path = 'link.txt'
linklist = get_link(path)
print(linklist)
s = len(linklist)


if __name__ == "__main__":
    # Characters that cannot appear in a Windows file name are replaced by
    # "_" before the article title is used as one.
    illegal_chars = str.maketrans('\\/:*?"<>|', '_' * 9)

    # A single pass over the links: an empty or all-invalid list no longer
    # spins forever, and invalid links are skipped instead of being fetched.
    for number, weixin_url in enumerate(linklist, 1):
        path = number

        matched = findall(r'http[s]?:\/\/mp.weixin.qq.com\/s\/[0-9a-zA-Z_]+', weixin_url)
        if not matched:
            print("链接有误，请重新输入!")
            continue

        content = get_weixin_html(weixin_url)
        content = download_pic(content)

        # Save the same markup under both extensions.
        safe_title = weixin_title.translate(illegal_chars)
        with open(safe_title + '.txt', 'w+', encoding="utf-8") as f:
            f.write(content)
        with open(safe_title + '.html', 'w+', encoding="utf-8") as f:
            f.write(content)

        print()
        print("标题：《" + weixin_title + "》")
        print("发布时间：" + weixin_time)

四、Python正则表达式匹配日期与时间


import re
from datetime import datetime

test_date = '小明的生日是2016-12-12 14:34,小张的生日是2016-12-21 11:34 .'
test_datetime = '小明的生日是2016-12-12 14:34,.小晴的生日是2016-12-21 11:34,好可爱的.'

# Date only: search() gives the first match, findall() gives all of them.
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
print(mat.groups())
# ('2016-12-12',)
print(mat.group(0))
# 2016-12-12

date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
for item in date_all:
    print(item)
# 2016-12-12
# 2016-12-21

# Date followed by whitespace and an H:M time.
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
print(mat.groups())
# ('2016-12-12 14:34',)
print(mat.group(0))
# 2016-12-12 14:34

date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
for item in date_all:
    print(item)
# 2016-12-12 14:34
# 2016-12-21 11:34
## Valid dates

# The pattern only checks the digit layout, so an impossible date such as
# 2016-12-35 is matched as well, as shown below.
test_err_date = '如这样的日期2016-12-35也可以匹配到.测试如下.'
print(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0))
# 2016-12-35

# 可以加个判断
def validate(date_text):
    """Return True only for a real calendar date written exactly as YYYY-MM-DD.

    The text is parsed and formatted back; it is accepted only when the
    round trip reproduces it, so unpadded forms such as ``2016-1-5`` are
    rejected along with impossible dates.
    """
    try:
        canonical = datetime.strptime(date_text, "%Y-%m-%d").strftime('%Y-%m-%d')
    except ValueError:
        return False
    return canonical == date_text

print(validate(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0)))
# False

# Other layouts: accept both 2016-12-24 and 2016/12/24.
date_reg_exp = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')

test_str= """
     平安夜圣诞节2016-12-24的日子与去年2015/12/24的是有不同哦.
     """
# Collect every date the pattern finds in the text.
matches_list = date_reg_exp.findall(test_str)

# Print each matched date.
for match in matches_list:
    print(match)

# 2016-12-24
# 2015/12/24

五、单个链接下载图片

代码如下：

import requests
from re import findall
from bs4 import BeautifulSoup
import time
import os

# Module-level state filled in by get_weixin_html() and read by the caller.
weixin_title = ""
weixin_time = ""


def get_weixin_html(url):
    """Fetch a WeChat article and return its body HTML with images enabled.

    Side effects: stores the article title in the module-level
    ``weixin_title`` and the first ``YYYY-M-D`` style date found in the page
    source in ``weixin_time`` (empty string when no such date is present).

    Args:
        url: Address of the article page.

    Returns:
        The markup of the ``js_content`` element with its first ``<div>``
        forced visible and every ``data-src="<url>"`` attribute rewritten to
        ``src="<url>"``.

    Raises:
        ValueError: If the page has no ``<h1>`` or no ``js_content`` element.
    """
    global weixin_time, weixin_title
    # A timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, "html.parser")

    title_tag = soup.find('h1')
    if title_tag is None:
        raise ValueError("未找到文章标题：" + str(url))
    # get_text() also works when the <h1> holds several children, a case in
    # which .string can be None.
    weixin_title = title_tag.get_text().strip()

    # The first date-looking string in the raw page is taken as the time.
    dates = findall(r"(\d{4}-\d{1,2}-\d{1,2})", res.text)
    weixin_time = dates[0] if dates else ""

    content = soup.find(id='js_content')
    if content is None:
        raise ValueError("未找到文章正文：" + str(url))
    soup2 = BeautifulSoup(str(content), "html.parser")
    if soup2.div is not None:
        soup2.div['style'] = 'visibility: visible;'
    html = str(soup2)

    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    # Each distinct URL needs only one replace() pass over the document.
    for link in set(findall(pattern, html)):
        html = html.replace('data-src="' + link + '"', 'src="' + link + '"')

    return html

def download_pic(content):
    """Download every URL found in *content* and link the HTML to local copies.

    Files are written to ``pic/<n>.<ext>``, where ``n`` is the 1-based
    position of the URL in *content*. The extension is ``.png`` or ``.gif``
    when that word occurs in the URL (not at index 0), otherwise ``.jpg``.

    Each URL is tried up to 10 times with a one-second pause between tries;
    a URL that never succeeds is reported and left untouched in the HTML.

    Args:
        content: HTML text to scan for ``http``/``https`` URLs.

    Returns:
        *content* with every successfully downloaded URL replaced by the
        relative path of the saved file.
    """
    pic_path = 'pic/'
    os.makedirs(pic_path, exist_ok=True)

    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    pic_list = findall(pattern, content)

    for index, item in enumerate(pic_list, 1):
        pic_url = str(item)

        if pic_url.find('png') > 0:
            file_name = str(index) + '.png'
        elif pic_url.find('gif') > 0:
            file_name = str(index) + '.gif'
        else:
            file_name = str(index) + '.jpg'
        local_path = pic_path + file_name

        for _attempt in range(10):
            try:
                data = requests.get(pic_url, timeout=30)
                with open(local_path, "wb") as f:
                    f.write(data.content)
            except (requests.RequestException, OSError):
                # Only network and file errors are retried; Ctrl-C and
                # programming errors are no longer swallowed.
                time.sleep(1)
                continue

            content = content.replace(pic_url, local_path)
            print('已下载第' + str(index) + '张图片.')
            time.sleep(1)  # pause between downloads
            break
        else:
            # Reached only when all 10 attempts failed, so a success on the
            # last attempt is no longer reported as an error.
            print("下载出错：", pic_url)
    return content

if __name__ == "__main__":
    # Keep prompting until the input contains a WeChat article link.
    link_pattern = r'http[s]?:\/\/mp.weixin.qq.com\/s\/[0-9a-zA-Z_]+'
    while True:
        # strip() drops the stray spaces that tend to come with a pasted link.
        weixin_url = input('请输入微信文章链接后按Enter：').strip()
        if findall(link_pattern, weixin_url):
            break
        print("链接有误，请重新输入!")

    content = get_weixin_html(weixin_url)
    content = download_pic(content)

    # Characters that cannot appear in a Windows file name are replaced by
    # "_" before the article title is used as one.
    safe_title = weixin_title.translate(str.maketrans('\\/:*?"<>|', '_' * 9))

    # Save the same markup under both extensions.
    with open(safe_title + '.txt', 'w+', encoding="utf-8") as f:
        f.write(content)
    with open(safe_title + '.html', 'w+', encoding="utf-8") as f:
        f.write(content)

    print()
    print("标题：《" + weixin_title + "》")
    print("发布时间：" + weixin_time)

六、多个链接下载文章

在这里插入图片描述

代码如下：

import requests
import re
from bs4 import BeautifulSoup
import time
# Request headers sent with every call; the User-Agent value is assembled
# from adjacent string literals purely to keep the source lines short.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 '
        'QBCore/4.0.1295.400 QQBrowser/9.0.2524.400 '
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 '
        'NetType/WIFI MicroMessenger/7.0.5 WindowsWechat'
    ),
}
# Silence urllib3 warnings for the whole process.
requests.packages.urllib3.disable_warnings()
def get_weixin_artical_content(profile_url):
    """Request *profile_url* as JSON and extract the article's text fields.

    The request carries the module-level ``headers`` plus a fixed set of
    query parameters (``__biz``, ``appmsg_token`` and so on, hard-coded below).

    Args:
        profile_url: Address to request.

    Returns:
        A tuple ``(nick_name, title, text, pub_time, data)`` where ``text``
        is the plain text of the ``content_noencode`` field, ``pub_time`` is
        the ``svr_time`` field formatted as ``YYYY-MM-DD`` in local time, and
        ``data`` is the full decoded JSON object.

    Raises:
        KeyError: If the JSON lacks one of the fields read here.
    """
    biz = 'MzAwNTA5NTYxOA=='
    pass_ticket = ''
    appmsg_token = '1055_YAmuAw2QG7dM3aTwSVZVqgtRdct6ilAMTwlz7g'
    params = {
                'action': 'getmsg',
                '__biz': biz,
                'f': 'json',
                'offset': '0',
                'count': '10',
                'is_ok': '1',
                'scene': '123',
                'uin': '777',
                'key': '777',
                'pass_ticket': pass_ticket,
                'wxtoken': '',
                'appmsg_token': appmsg_token,
                'x5': '0'
            }
    # The context manager releases the session's connections when done.
    with requests.Session() as session:
        session.headers.update(headers)
        # NOTE(review): verify=False turns off TLS certificate checking;
        # kept as in the original, but confirm this is really wanted.
        res = session.get(profile_url, params=params, verify=False, timeout=30)
        # Decode the body once instead of re-parsing it for every field.
        data = res.json()

    nick_name = data['nick_name']
    title = data['title']
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(data['content_noencode'], "html.parser")
    text = soup.get_text().replace('　　','\n')

    pub_time = time.strftime("%Y-%m-%d", time.localtime(data['svr_time']))
    return nick_name, title, text, pub_time, data

def save_wechat_content(cache_value):
    """Append *cache_value* followed by a newline to ``out.txt`` (UTF-8)."""
    with open('out.txt', mode='a', encoding='utf-8') as out_file:
        record = "".join((cache_value, '\n'))
        out_file.write(record)
if __name__ == '__main__':
    import sys
    # One link per line; blank lines are skipped so an empty string is
    # never sent out as a URL.
    with open(r'link.txt', 'r', encoding='utf-8') as f:
        url = [line.strip() for line in f if line.strip()]

    for profile_url in url:
        nick_name, title, text, pub_time, res = get_weixin_artical_content(profile_url)
        cache_value = '文章标题：'+title+'\n公众号名称：'+nick_name+'\n发布时间：'+pub_time+'\n正文内容：\n'+text+'\n'
        save_wechat_content(cache_value)
        print(str(profile_url) + "抓取完毕！")

七、单链接下载音频

代码如下：


import os
import time
import requests
from bs4 import BeautifulSoup

######## Parameters
# Address of the article page that contains the audio, asked for at start-up.
subpage =input('请输入下载链接：')
#subpage ='https://mp.weixin.qq.com/s/c_cwAc32MM7etO7VfcxauQ'
# Output directory; must end with a "/" because the article title is
# appended to it directly.
basedir =r'E:\py\python3.7\test2\test40wechartphoto\Audio\MP/'
################


########### 采集函数定义
def getaudio(res, nub=0):
    """Download every ``<mpvoice>`` clip of an article page as an MP3 file.

    Files are written to ``basedir + <title>/<name>.mp3`` where ``basedir``
    is the module-level output directory and ``<title>`` is the page's
    ``h1.rich_media_title`` text with ``|`` replaced by ``_``. Clips without
    a ``name`` attribute are saved as ``save_<n>.mp3``. Existing files are
    skipped, and a clip whose download returns an HTTP error is reported and
    not written.

    Args:
        res: Address of the article page.
        nub: Ordinal used only in the final progress message.

    Raises:
        ValueError: If the page has no ``h1.rich_media_title`` element.
    """
    response = requests.get(res, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    title_tag = soup.find('h1', attrs={u"class": u"rich_media_title"})
    if title_tag is None:
        raise ValueError("未找到文章标题：" + str(res))
    # get_text() also works when the <h1> holds several children, a case in
    # which .string can be None.
    pagesubject = title_tag.get_text().strip().replace("|", "_")
    savedir = basedir + pagesubject

    mpvoices = soup.find_all('mpvoice')

    # Create the directory once, and only when there is something to save.
    if mpvoices and not os.path.exists(savedir):
        print('创建目录：%s' % savedir)
        os.makedirs(savedir)

    for n, mid in enumerate(mpvoices, 1):
        # Drop any ".mp3" in the name because the extension is added below.
        raw_name = mid.get('name') or ''
        mp3name = raw_name.replace(".mp3", "") if raw_name else 'save_%s' % n
        filesavepath = savedir + '/' + mp3name + '.mp3'

        if os.path.exists(filesavepath):
            print('---跳过文件：%s.mp3' % mp3name)
            continue

        print('正在下载：%s.mp3' % mp3name)
        req = requests.get(
            'https://res.wx.qq.com/voice/getvoice?mediaid=' + mid['voice_encode_fileid'],
            timeout=60,
        )
        if not req.ok:
            # Do not save an HTTP error body under an .mp3 name.
            print('下载失败：%s.mp3' % mp3name)
            continue
        with open(filesavepath, 'wb') as f:
            f.write(req.content)
        time.sleep(1)

    print("第%d个主题【%s】完成" % (nub, pagesubject))
##############函数定义结束

getaudio(subpage, 1)

print('\n下载任务完成-------------')
# quit() is added by the ``site`` module for interactive sessions and may be
# absent (e.g. under ``python -S``); raising SystemExit ends the script the
# same way without relying on it.
raise SystemExit