本次爬取的网站是 https://www.qqtn.com/tx/weixintx_1.html ,小伙伴们也可以修改代码中的网址和正则表达式来爬取别的网站。
一、导包
import re
import requests
import os
import time
二、获取每一页网页的text,并利用正则提取图片地址
# Walk list pages 1-3; the page number is substituted into the URL.
for page in range(1, 4):
    url = f"https://www.qqtn.com/tx/weixintx_{page}.html"
    # Download the HTML of the current list page as text.
    page_text = requests.get(url=url).text
    # Non-greedy pattern: inside each <li>, capture the src value of the
    # <img> tag that is followed by an alt attribute.
    ex = '<li>.*?<img src="(.*?)" alt.*?li>'
    # re.S lets "." match newlines, so an <li> may span several lines.
    img_list = re.findall(ex, page_text, re.S)
三、利用for循环遍历图片地址列表,逐个请求图片地址,获取图片的二进制数据
# Request every image address found on the page; .content holds the raw
# bytes of the response, which is what gets written to the image file.
for post_url in img_list:
    img_data = requests.get(url=post_url).content
完整源码(注意缩进:Python 依靠缩进来区分代码块,复制后请确认缩进正确)
import re
import requests
import os
import time
# Download the avatar images from the first three list pages of qqtn.com
# into ./微信头像 and report how long the whole run took.

# Record the start time.
start = time.perf_counter()

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs("./微信头像", exist_ok=True)

# Request headers: a browser User-Agent, sent with every request below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Non-greedy pattern: inside each <li>, capture the src value of the <img>
# tag that is followed by an alt attribute. Loop-invariant, so defined once.
ex = '<li>.*?<img src="(.*?)" alt.*?li>'

for page in range(1, 4):
    url = f"https://www.qqtn.com/tx/weixintx_{page}.html"
    # The timeout keeps a stalled connection from hanging the script forever.
    page_response = requests.get(url=url, headers=headers, timeout=10)
    if not page_response.ok:
        # Nothing useful can be parsed out of an HTTP error page.
        continue
    page_text = page_response.text
    # re.S lets "." match newlines, so an <li> may span several lines.
    img_list = re.findall(ex, page_text, re.S)
    for post_url in img_list:
        # Build the image file name from the last URL path segment.
        img_name = post_url.split("/")[-1]
        if not img_name:
            # A URL ending in "/" has no file name to save under.
            continue
        img_response = requests.get(url=post_url, headers=headers, timeout=10)
        if not img_response.ok:
            # Do not save an HTTP error body as if it were an image.
            continue
        img_data = img_response.content
        # Destination path inside the output folder.
        imgpath = "./微信头像/" + img_name
        # Write the raw image bytes to disk.
        with open(imgpath, "wb") as fp:
            fp.write(img_data)

# Record the end time.
end = time.perf_counter()
# Compute the elapsed time.
runTime = end - start
print(f"下载完成,用时{runTime}秒")