# 注:实现解析本地HTML文档,将其中的网络图片下载到本地,并将其中的网络图片地址,改为本地地址
import requests
import os
from bs4 import BeautifulSoup
def getContent(url):
    """Download *url* and return the response body as bytes.

    Returns empty bytes on any network/HTTP failure, so the result is
    always safe to hand to a binary file write (see writeFile).
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()  # treat 4xx/5xx as failure instead of saving an error page
        return r.content
    except requests.RequestException:
        # Was a bare `except:` returning "" (str) — a str mixed badly with the
        # bytes success path and would crash writeFile's "wb" write.
        return b""
def writeFile(path, content):
    """Write *content* (bytes) to *path*, never overwriting an existing file.

    Keeps the first downloaded copy of each image; silently skips
    paths that already exist (the original `else: pass` made that
    explicit but added nothing).
    """
    if os.path.exists(path):
        return  # already downloaded — keep the existing copy
    with open(path, "wb") as file:
        file.write(content)
def dealUrl(url, dir):
    """Map an image URL to a local file path inside *dir*.

    The local filename is the last "/"-separated segment of *url*;
    *dir* is expected to already end with a path separator (that is how
    main() calls it).  Parameter names are kept for compatibility even
    though `dir` shadows a builtin.
    """
    # Was `str = url.split("/")`, which shadowed the builtin `str`.
    filename = url.split("/")[-1]
    return dir + filename
def readUrl():
    """Print the saved image-URL list (D:\\pa_chong\\new.txt), if present.

    Does nothing when the listing file has not been written yet.
    """
    listing = "D:\\pa_chong\\new.txt"
    if not os.path.exists(listing):
        return
    with open(listing) as fh:
        contents = fh.read()
    print(contents)
def findSrc(html):
    """Collect the unique image URLs referenced by the local HTML file *html*.

    Gathers every <img src> value plus the welcome section's
    background image (stored in its data-image-src attribute), writes
    them one per line to D:\\pa_chong\\new.txt, and returns them as a list.
    """
    srcs = []
    with open(html, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        # Original did img["src"] — KeyError on an <img> without src.
        if src and src not in srcs:
            srcs.append(src)
    bgimg = soup.find("section", {"class": "section section_welcome"})
    # Original indexed bgimg unconditionally — TypeError when the section
    # (or its attribute) is missing from the page.
    if bgimg is not None and bgimg.get("data-image-src"):
        srcs.append(bgimg["data-image-src"])
    with open("D:\\pa_chong\\new.txt", "w", encoding="utf-8") as f:
        for src in srcs:
            f.write(src)
            f.write("\n")
    return srcs
def changehtml(html, list):
    """Rewrite image references in the HTML file *html* to bare local filenames.

    Replaces each <img src> URL, and the welcome section's
    data-image-src, with its final "/"-separated segment, then saves the
    modified document back over *html*.  *list* is unused but kept so the
    call signature stays compatible with main().
    """
    with open(html, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for img in soup.find_all("img"):
        if img.get("src"):  # skip <img> tags without a src attribute
            img["src"] = img["src"].split("/")[-1]
    bgimg = soup.find("section", {"class": "section section_welcome"})
    if bgimg is not None and bgimg.get("data-image-src"):
        # Guard: original raised TypeError when the section was absent.
        bgimg["data-image-src"] = bgimg["data-image-src"].split("/")[-1]
    # Original routed the soup through a needless `global fx`; write it directly.
    with open(html, "w", encoding="utf-8") as f:
        f.write(str(soup))
def main():
    """Download every image referenced by the local page and localize it.

    Steps: read image URLs from the page, rewrite the page to use local
    filenames, download each image next to the page (skipping ones
    already on disk), then print the saved URL list.
    """
    html = "D:\\pa_chong\\index.html"
    # Locals renamed from `list`/`dir`, which shadowed builtins.
    srcs = findSrc(html)        # collect remote image URLs
    changehtml(html, srcs)      # point the page at local filenames
    out_dir = "D:\\pa_chong\\"
    for url in srcs:
        content = getContent(url)
        path = dealUrl(url, out_dir)
        writeFile(path, content)
    readUrl()  # echo the saved URL list as a quick sanity check
# Guard the entry point so importing this module does not trigger the crawl.
if __name__ == "__main__":
    main()