"""Extract base64-embedded images from saved HTML pages.

Every file matched by ``HTML_GLOB`` is parsed with lxml.  Each ``<img>``
element whose ``src`` attribute is a base64 ``data:`` URI is decoded and
written to ``OUTPUT_DIR`` as ``<page file name without dots><index>.jpg``,
where ``index`` is the 1-based position of the ``<img>`` within the page.
"""
import base64
import os
from glob import glob

from lxml.html import parse

HTML_GLOB = "D:/DataSet/liusheng/blur/url_blur/*.htm"
OUTPUT_DIR = "D:/DataSet/liusheng/blur/url_blur_img/"


def _page_stem(path):
    """Return the file name of *path* with every dot removed (``a.htm`` -> ``ahtm``)."""
    # os.path.basename copes with whichever separator glob() produced,
    # unlike splitting on a hard-coded backslash.
    return os.path.basename(path).replace(".", "")


def _decode_data_uri(src):
    """Return the decoded bytes of a base64 ``data:`` URI.

    Returns ``None`` when *src* is missing, is not a ``data:`` URI, is not
    base64-encoded, or carries a payload that cannot be decoded.
    """
    if not src:
        return None
    header, sep, payload = src.strip().partition(",")
    header = header.lower()
    if not sep or not header.startswith("data:") or ";base64" not in header:
        # Ordinary URL (or a non-base64 data URI): nothing to extract.
        return None
    try:
        return base64.b64decode(payload)
    except ValueError:  # binascii.Error is a ValueError subclass
        return None


def main():
    """Decode and save the embedded images of every page matched by ``HTML_GLOB``.

    Prints each page path before processing it and the number of ``<img>``
    elements found in it afterwards.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for html_path in glob(HTML_GLOB):
        print(html_path)
        stem = _page_stem(html_path)
        root = parse(html_path).getroot()
        # Guard against a document without a root element.
        images = root.findall(".//img") if root is not None else []
        # The index counts every <img>, including skipped ones, so the file
        # names of extracted images do not depend on which siblings are
        # embedded and which are plain links.
        for index, img in enumerate(images, start=1):
            image_data = _decode_data_uri(img.get("src"))
            if image_data is None:
                continue
            out_path = os.path.join(OUTPUT_DIR, stem + str(index) + ".jpg")
            with open(out_path, "wb") as out_file:
                out_file.write(image_data)
        print(len(images))


if __name__ == "__main__":
    main()
# Reference articles:
# https://www.cnblogs.com/changdasheng/p/11496289.html
# https://www.cnblogs.com/wwwwwei/p/10728060.html