# 注:实现解析本地HTML文档,将其中的网络图片下载到本地,并将其中的网络图片地址,改为本地地址
import requests
import os
from bs4 import BeautifulSoup
def getContent(url):
    """Download *url* and return the response body as bytes.

    Returns empty bytes on any network/HTTP failure, so the result is
    always safe to hand to a binary file write (see writeFile).
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()  # treat 4xx/5xx as failure instead of saving an error page
        return r.content
    except requests.RequestException:
        # Was a bare `except:` returning "" (str) — a str mixed badly with the
        # bytes success path and would crash writeFile's "wb" write.
        return b""
def writeFile(path, content):
    """Write *content* (bytes) to *path*, never overwriting an existing file.

    Keeps the first downloaded copy of each image; silently skips
    paths that already exist (the original `else: pass` made that
    explicit but added nothing).
    """
    if os.path.exists(path):
        return  # already downloaded — keep the existing copy
    with open(path, "wb") as file:
        file.write(content)
def dealUrl(url, dir):
    """Map an image URL to a local file path inside *dir*.

    The local filename is the last "/"-separated segment of *url*;
    *dir* is expected to already end with a path separator (that is how
    main() calls it).  Parameter names are kept for compatibility even
    though `dir` shadows a builtin.
    """
    # Was `str = url.split("/")`, which shadowed the builtin `str`.
    filename = url.split("/")[-1]
    return dir + filename
def readUrl():
    """Print the saved image-URL list (D:\\pa_chong\\new.txt), if present.

    Does nothing when the listing file has not been written yet.
    """
    listing = "D:\\pa_chong\\new.txt"
    if not os.path.exists(listing):
        return
    with open(listing) as fh:
        contents = fh.read()
    print(contents)
def findSrc(html):
    """Collect the unique image URLs referenced by the local HTML file *html*.

    Gathers every <img src> value plus the welcome section's
    background image (stored in its data-image-src attribute), writes
    them one per line to D:\\pa_chong\\new.txt, and returns them as a list.
    """
    srcs = []
    with open(html, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        # Original did img["src"] — KeyError on an <img> without src.
        if src and src not in srcs:
            srcs.append(src)
    bgimg = soup.find("section", {"class": "section section_welcome"})
    # Original indexed bgimg unconditionally — TypeError when the section
    # (or its attribute) is missing from the page.
    if bgimg is not None and bgimg.get("data-image-src"):
        srcs.append(bgimg["data-image-src"])
    with open("D:\\pa_chong\\new.txt", "w", encoding="utf-8") as f:
        for src in srcs:
            f.write(src)
            f.write("\n")
    return srcs
def changehtml(html, list):
    """Rewrite image references in the HTML file *html* to bare local filenames.

    Replaces each <img src> URL, and the welcome section's
    data-image-src, with its final "/"-separated segment, then saves the
    modified document back over *html*.  *list* is unused but kept so the
    call signature stays compatible with main().
    """
    with open(html, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for img in soup.find_all("img"):
        if img.get("src"):  # skip <img> tags without a src attribute
            img["src"] = img["src"].split("/")[-1]
    bgimg = soup.find("section", {"class": "section section_welcome"})
    if bgimg is not None and bgimg.get("data-image-src"):
        # Guard: original raised TypeError when the section was absent.
        bgimg["data-image-src"] = bgimg["data-image-src"].split("/")[-1]
    # Original routed the soup through a needless `global fx`; write it directly.
    with open(html, "w", encoding="utf-8") as f:
        f.write(str(soup))
def main():
    """Download every image referenced by the local page and localize it.

    Steps: read image URLs from the page, rewrite the page to use local
    filenames, download each image next to the page (skipping ones
    already on disk), then print the saved URL list.
    """
    html = "D:\\pa_chong\\index.html"
    # Locals renamed from `list`/`dir`, which shadowed builtins.
    srcs = findSrc(html)        # collect remote image URLs
    changehtml(html, srcs)      # point the page at local filenames
    out_dir = "D:\\pa_chong\\"
    for url in srcs:
        content = getContent(url)
        path = dealUrl(url, out_dir)
        writeFile(path, content)
    readUrl()  # echo the saved URL list as a quick sanity check
# Guard the entry point so importing this module does not trigger the crawl.
if __name__ == "__main__":
    main()