# install beautifulsoup4
import os
import re
import chardet
import requests
from bs4 import BeautifulSoup
def IsArrayEmpty(items):
    """Return True when *items* is empty or falsy (e.g. [], None), else False.

    Note: the original parameter was named ``list``, shadowing the builtin;
    renamed — the only caller in this file passes it positionally.
    """
    return not items
def DownloadFile(path, url, strFilePath):
    """Download *url* and save the raw bytes to *strFilePath*.

    *path* is only used in the error report, to identify which source
    HTML file referenced the URL.  Best-effort: any failure is caught
    and printed so a batch run keeps going.
    """
    try:
        # timeout so one dead server cannot hang the whole crawl
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(strFilePath, "wb") as file:
            file.write(response.content)
    except Exception as e:
        print('---------------------------------------------')
        print('文件路径:', path)
        print('文件下载失败:', url)
        print('失败原因:', str(e))
def ReadFile(strFilePath):
    """Read a text file with unknown encoding and return its content as str.

    Uses chardet to guess the encoding from the raw bytes.  Falls back to
    UTF-8 when detection fails (chardet reports ``encoding=None`` for e.g.
    an empty file, which previously crashed ``bytes.decode``).  The original
    trailing ``return ""`` was unreachable and has been removed.
    """
    with open(strFilePath, 'rb') as f:
        varContent = f.read()
    encoding = chardet.detect(varContent)['encoding'] or 'utf-8'
    return varContent.decode(encoding)
def ParseFile(path, strReplace):
    """Rewrite every ``<img src>`` in the HTML file at *path* in place.

    Each src is replaced by ``strReplace + <original basename>``, so the
    images point at a new host while keeping their file names.  The file
    is re-saved pretty-printed as UTF-8.  Files containing no ``<img>``
    tag are left untouched.
    """
    strData = ReadFile(path)
    soup = BeautifulSoup(strData, 'html.parser')
    images = soup.find_all('img')
    if IsArrayEmpty(images):
        return  # nothing to rewrite; do not reformat the file needlessly
    for img in images:
        href = img.get('src')  # .get(): an <img> without src must not crash
        if not href:
            continue
        # basename after the last '/'; rfind == -1 still yields href[0:]
        imageName = href[href.rfind("/") + 1:]
        img['src'] = strReplace + imageName
    # NOTE(review): prettify() reformats the whole document, as the
    # original code did.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
def enumerate_folder(strFolder, strReplace):
    """Recursively walk *strFolder* and rewrite img links in HTML files.

    Only ``.html``/``.htm`` files are parsed: the original passed every
    file (including binaries such as the images themselves) through the
    HTML parser and rewrote it, which could corrupt non-HTML content.
    The original's empty ``for dir in dirs: pass`` loop was dead code
    and has been removed.
    """
    for root, _dirs, files in os.walk(strFolder):
        for name in files:
            if name.lower().endswith(('.html', '.htm')):
                ParseFile(os.path.join(root, name), strReplace)
# Guarded entry point: the original ran unconditionally at import time,
# which made the module unusable as a library.
if __name__ == "__main__":
    # Point all <img> tags under the current directory at the new host.
    enumerate_folder("./", "http://*.*.*.*:4004/test/")
# Python: parse HTML and save (article title from the original post)
# First published 2024-07-27 16:33:27