爬取微信公众号详情页

最新推荐文章于 2024-06-05 10:58:23 发布

sxn777

最新推荐文章于 2024-06-05 10:58:23 发布

阅读量1k

点赞数

分类专栏： python 爬虫学习

本文链接：https://blog.csdn.net/sxn777/article/details/118676280

版权

python 爬虫学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

需求：爬取一篇微信公众号文章的详情页，把文章里的图片和视频下载到本地，再上传到阿里云 OSS，最后把原文章中的图片、视频地址替换为 OSS 上的地址。

import requests
import re
import os
import json
import oss2
from lxml import etree

# Request headers sent with the article and video-metadata requests;
# the value is a desktop Edge/Chrome user-agent string.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
    ),
}

# Aliyun OSS settings consumed by upload_file_to_oss2().
# The 'XXX' values are placeholders and must be filled in before running.
oss2_ak = 'XXX'        # access key id passed to oss2.Auth
oss2_sk = 'XXX'        # access key secret passed to oss2.Auth
oss2_endpoint = 'XXX'  # endpoint passed to oss2.Bucket, also used in the public URL
oss2_bucket = 'XXX'    # bucket name passed to oss2.Bucket, also used in the public URL

def zhuaqu(url=None, timeout=10):
    """Download a WeChat official-account article page and return its HTML.

    Args:
        url: Article URL to fetch. When omitted, the sample article that this
            script was written against is used.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the script forever.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other network failure (including timeout).
    """
    # Other sample articles from the same album, kept for manual testing:
    # 'https://mp.weixin.qq.com/s?__biz=MzUzMzQ0NjQ2OA==&mid=2247508308&idx=3&sn=60effeb3984b66d3de01f00b226de8ff&chksm=faa10b7fcdd682695d6445f89c7591bcab88300d77f0e62b73e46561ee76ba7455876a837773&cur_album_id=1862926854316163084&scene=189#rd'
    # 'https://mp.weixin.qq.com/s?__biz=MzUzMzQ0NjQ2OA==&mid=2247507973&idx=1&sn=58f00d880e831455e9af194438916da2&chksm=faa10a2ecdd683386c89d2c4e7a45c43288ccb154011b81d836c52bceee7d784fd3215452516&cur_album_id=1862926854316163084&scene=189#rd'
    # 'https://mp.weixin.qq.com/s?__biz=MzUzMzQ0NjQ2OA==&mid=2247507973&idx=3&sn=40463d1cd426ce18823031be5f797754&chksm=faa10a2ecdd683383a36e13295fdf3daf2fbf05d54049e0737f108c4dffb4f8eacf96bd1a5a6&cur_album_id=1862926854316163084&scene=189#rd'
    if url is None:
        url = 'https://mp.weixin.qq.com/s?__biz=MzUzMzQ0NjQ2OA==&mid=2247505777&idx=1&sn=1fc52faec6399ce72692ba28cc16920b&chksm=faa17d5acdd6f44c8e8c9a2ad456f8adf1aa029f0588c5e9cbdd3efcbb1e81a77aa7e26ed0ca&cur_album_id=1862926854316163084&scene=189#rd'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of saving an error page as if it were the article.
    response.raise_for_status()
    return response.content.decode('utf8')


def save_html(res):
    """Write the page markup to ``test.html`` in the current working directory.

    Args:
        res: The HTML text to store. Any existing ``test.html`` is overwritten.
    """
    target_name = "test.html"
    out = open(target_name, "w", encoding='utf-8')
    try:
        out.write(res)
    finally:
        # Always release the handle, even if the write fails.
        out.close()


def open_html():
    """Extract image URLs and video ids from the saved ``./test.html``.

    Returns:
        A ``(html_pic_list, html_video_list)`` tuple:

        * ``html_pic_list`` -- every ``data-src="..."`` value that contains the
          substring ``'yiaush'``, in document order (duplicates kept).
        * ``html_video_list`` -- the distinct ``wxv_`` + 19-character tokens
          found in the page, in order of first appearance.

    Raises:
        OSError: ``./test.html`` cannot be opened.
    """
    # Use a context manager so the handle is closed as soon as the text is read.
    with open('./test.html', 'r', encoding="utf-8") as htmlf:
        htmldetail = htmlf.read()

    # NOTE(review): 'yiaush' presumably marks image ids belonging to the target
    # account -- confirm before reusing this on other articles.
    html_pic_list = [
        item
        for item in re.findall(r'data-src="(.*?)"', htmldetail)
        if 'yiaush' in item
    ]

    # dict.fromkeys de-duplicates while keeping a stable, first-seen order
    # (a plain set would return the ids in arbitrary order).
    html_video_list = list(dict.fromkeys(re.findall(r"wxv_.{19}", htmldetail)))
    return html_pic_list, html_video_list


def save_pic(pic_url, save_dir='D:\\pic\\', timeout=10):
    """Download one article image into ``save_dir`` unless it is already there.

    The file is stored as ``<save_dir><image id>.<extension>``, where both
    parts are taken from the ``.../mmbiz_<extension>/<image id>/...`` segment
    of the URL.

    Args:
        pic_url: Image URL taken from a ``data-src`` attribute.
        save_dir: Target directory. It is joined by plain concatenation, so it
            must end with a path separator.
        timeout: Seconds to wait for the image server.

    Returns:
        A ``(image_id, extension)`` tuple. It is returned even when the
        download failed, so callers should check that the file exists.

    Raises:
        ValueError: ``pic_url`` has no ``mmbiz_<ext>/<id>/`` segment.
    """
    # Parse extension and id from one anchored pattern instead of guessing the
    # id from the last letter of the extension ('g/' or 'f/').
    match = re.search(r'mmbiz_([^/]+)/([^/]+)/', pic_url)
    if match is None:
        raise ValueError('unrecognised image URL: {!r}'.format(pic_url))
    ext, image_id = match.group(1), match.group(2)
    path = save_dir + image_id + '.' + ext
    try:
        os.makedirs(save_dir, exist_ok=True)
        if os.path.exists(path):
            print("图片已存在")
        else:
            r = requests.get(pic_url, timeout=timeout)
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
            print("图片保存成功")
    except (requests.RequestException, OSError) as exc:
        # Best effort: report network/filesystem problems and carry on, but do
        # not hide programming errors the way a bare ``except`` would.
        print("图片获取失败", exc)
    return image_id, ext


def save_video(wxv, save_dir='D:\\video\\', timeout=30):
    """Download one article video into ``save_dir`` unless it is already there.

    The play URL is looked up through the ``get_mp_video_play_url`` endpoint
    and the first entry of its ``url_info`` list is downloaded to
    ``<save_dir><wxv>.mp4``.

    Args:
        wxv: Video id string (``wxv_...``) as found in the article HTML.
        save_dir: Target directory. It is joined by plain concatenation, so it
            must end with a path separator.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        ``wxv`` unchanged. It is returned even when the download failed, so
        callers should check that the file exists.
    """
    video_path = save_dir + str(wxv) + '.mp4'
    # Check the cache first so an existing file costs no network round trip.
    if os.path.exists(video_path):
        print("视频已存在")
        return wxv

    # NOTE(review): __biz and mid are hard-coded here and differ from the
    # article fetched by zhuaqu() -- confirm the endpoint ignores them.
    video_url_temp = "https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzI4MzkzMTc3OA==&mid=2247488495&idx=4&vid=" + wxv
    try:
        response = requests.get(video_url_temp, headers=headers, timeout=timeout)
        response.raise_for_status()
        content = json.loads(response.content.decode())
        url_info = content.get("url_info")
        if not url_info:
            # No playable stream was returned; treat it like any other failure
            # instead of crashing on ``None[0]``.
            print("视频获取失败")
            return wxv
        video_url2 = url_info[0].get("url")

        os.makedirs(save_dir, exist_ok=True)
        html = requests.get(video_url2, timeout=timeout)
        html.raise_for_status()
        with open(video_path, 'wb') as f:
            f.write(html.content)
        print("视频保存成功")
    except (requests.RequestException, OSError, ValueError) as exc:
        # ValueError covers a non-JSON metadata response.
        print("视频获取失败", exc)
    return wxv


def upload_file_to_oss2(oss2_file_name, local_file_name):
    """Upload a local file to the configured Aliyun OSS bucket.

    Args:
        oss2_file_name: Object name inside the bucket (including its path).
        local_file_name: Path of the local file to upload.

    Returns:
        The ``https://<bucket>.<endpoint>/<object name>`` URL when the upload
        response status is 200, otherwise ``None``.
    """
    credentials = oss2.Auth(oss2_ak, oss2_sk)
    target_bucket = oss2.Bucket(credentials, oss2_endpoint, oss2_bucket)
    result = target_bucket.put_object_from_file(oss2_file_name, local_file_name)
    if result.status != 200:
        return None
    return ''.join(['https://', oss2_bucket, '.', oss2_endpoint, '/', oss2_file_name])


def _upload_local_file(oss2_file_name, local_path):
    """Upload ``local_path`` if it exists; return its OSS URL or ``None``."""
    if not os.path.exists(local_path):
        # The download step reports failures but does not raise, so the file
        # may be missing here.
        return None
    return upload_file_to_oss2(oss2_file_name, local_path)


def main():
    """Fetch the article, mirror its media to OSS and rewrite ``test.html``.

    Steps:
        1. Download the article and save it as ``test.html``.
        2. Collect image URLs and video ids from the saved page.
        3. Download each media file, upload it to OSS, and point the matching
           HTML attribute at the uploaded copy.
        4. Save the rewritten HTML back to ``test.html``.

    Media that cannot be downloaded or uploaded is reported and left
    untouched in the HTML.
    """
    content = zhuaqu()
    save_html(content)

    html_pic_list, html_video_list = open_html()
    if not (html_pic_list or html_video_list):
        return

    # Read the page once and close the handle before save_html() rewrites it.
    with open('./test.html', 'r', encoding="utf-8") as htmlf:
        htmldetail = htmlf.read()

    for item in html_pic_list:
        new_path, ext = save_pic(item)
        local_path = 'D:\\pic\\' + str(new_path) + '.' + ext
        oss2_file_name = 'images/' + str(new_path) + '.' + ext
        url = _upload_local_file(oss2_file_name, local_path)
        if url is None:
            print("图片上传失败", item)
            continue
        # ``item`` is the exact attribute value captured from the page, so
        # replacing it directly works for every URL variant.
        htmldetail = htmldetail.replace(
            'data-src="{}"'.format(item),
            'src="{}"'.format(url),
        )

    for item in html_video_list:
        new_path = save_video(item)
        local_path = 'D:\\video\\' + str(new_path) + '.mp4'
        oss2_file_name = 'images/' + str(new_path) + '.mp4'
        url = _upload_local_file(oss2_file_name, local_path)
        if url is None:
            print("视频上传失败", item)
            continue
        # Use the id we already have (not digits scraped from the upload URL)
        # and point ``src`` at the file that was actually uploaded.
        htmldetail = htmldetail.replace(
            ';vid={}"'.format(item),
            ';vid={}" src="{}" width=100% height="300px"'.format(item, url),
        )

    save_html(htmldetail)
    # TODO: persist the result to a database.


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a crawl.
    main()

sxn777

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取微信公众号详情页

需求：爬取一个微信公众号，获取文章里面的图片并把它下载到本地，上传到阿里云，修改原文章的图片地址。代码摘要：import requests; import re; import os; import json; import oss2; from lxml import etree; headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro…
复制链接

扫一扫