# Web page image scraper (网页图片爬取)
import re
import time
from pathlib import Path

import requests
# Address of the page to crawl; image paths found on the page are appended to it.
url = "http://10.4.7.173/python-spider/"

# HTTP headers passed to every request made by get_html.
headers = {
    # Browser-like User-Agent string (desktop Chrome on Windows).
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.102 Safari/537.36",
}
def get_html(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Despite the name, the body is returned undecoded, so the function is
    used both for the HTML page and for binary image data.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on error statuses so an error page is never treated as
    # page content or saved to disk as an image.
    res.raise_for_status()
    return res.content
def get_img_path_list(html):
    """Extract the ``style/<name>.jpg`` image paths from page text.

    Args:
        html: Page source as a ``str``.

    Returns:
        A list of the distinct matching paths, in order of first
        appearance. An empty list is returned when nothing matches.
    """
    matches = re.findall(r"style/\w*\.jpg", html)
    # The same image may be referenced several times on a page; dropping
    # repeats (dict keys keep insertion order) avoids redundant downloads.
    img_path_list = list(dict.fromkeys(matches))
    return img_path_list
def img_download(img_save_path, img_url):
    """Download ``img_url`` and write its bytes to ``img_save_path``.

    Args:
        img_save_path: Destination file path; missing parent directories
            are created.
        img_url: Address of the image to download.

    Raises:
        Whatever ``get_html`` raises for a failed request, and ``OSError``
        if the file cannot be written.
    """
    # Fetch before opening the file so a failed download does not leave an
    # empty file behind.
    img_bytes = get_html(url=img_url)
    # Opening a file for writing fails if its directory does not exist yet.
    Path(img_save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(img_save_path, "wb") as f:
        f.write(img_bytes)
# Fetch the page and decode the bytes (UTF-8 by default) so the regex in
# get_img_path_list can scan it as text.
html = get_html(url=url).decode()
img_path_list = get_img_path_list(html=html)
for index, img_path in enumerate(img_path_list):
    # Each extracted path is appended to the page address to form the image URL.
    img_url = url + img_path
    # A timestamp alone can repeat when two downloads finish within one
    # clock tick, silently overwriting the earlier image; the loop index
    # keeps every file name unique.
    img_save_path = f"./images/{time.time()}_{index}.jpg"
    img_download(img_save_path=img_save_path, img_url=img_url)