直接上代码,后续有时间再单独发一篇文章进行讲解,文章底部有效果图。
仅供学习使用,如有侵权将立即删除。
有两个地方需要改一下
1、headers 改成自己浏览器的
2、path 改成自己本地任意路径
import os
import re
import threading

import requests
headers = {
"User-Agent": "xxxxxxxxxxxxxxxx"
}
def get_img(url):
    """Download every wallpaper referenced on one listing page.

    Fetches the listing page at ``url``, extracts the thumbnail URLs and
    the matching titles with regular expressions, strips the thumbnail
    suffix from each URL and hands the result to ``seva_img``.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. Failures are printed and the affected page/entry is skipped.
    """
    try:
        # A timeout keeps a stalled connection from blocking this thread forever.
        res_img = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as er:
        print(url, er)
        return
    res_img.encoding = 'utf-8'
    html = res_img.text

    # re.findall returns a list with every non-overlapping match of the
    # capture group found in the page source.
    img_urls = re.findall(r'lazysrc="(.*?)" lazysrc2x=', html)
    titles = re.findall(r'target="_blank" class="imgw" title="(.*?)">', html)

    for thumb_url, img_title in zip(img_urls, titles):
        # Cut the thumbnail marker together with the single character that
        # precedes it; entries without the marker are skipped instead of
        # raising ValueError and aborting the rest of the page.
        cut = thumb_url.rfind("220.320.jpg")
        if cut < 1:
            print(thumb_url, "unexpected thumbnail URL, skipped")
            continue
        seva_img(thumb_url[:cut - 1], img_title)
def seva_img(url, title, path='/Users/Bob/PycharmProjects/ModeTest/meinv'):
    """Download one image and store it as ``<path>/<title>.jpg``.

    Args:
        url: Address of the image to download.
        title: Image title, used as the file name (without extension).
        path: Target directory; created if it does not exist. Defaults to
            the location that was previously hard-coded in this function.

    Returns:
        None. This is a best-effort helper: any failure is printed together
        with the URL and the image is skipped.
    """
    try:
        print(f"{title} - {url}")
        res = requests.get(url=url, headers=headers, timeout=10)
        # requests.get never returns None; check the HTTP status instead so
        # that error pages are not written to disk as .jpg files.
        res.raise_for_status()
        # Characters such as "/" or ":" in a title would break the file path.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', str(title))
        os.makedirs(str(path), exist_ok=True)
        with open(os.path.join(str(path), f"{safe_title}.jpg"), 'wb') as f:
            f.write(res.content)
    except Exception as er:
        # Deliberately broad: one failed image must not kill the worker thread.
        print(url, er)
if __name__ == '__main__':
    # Crawl listing pages 1..30 with one thread per page, while letting at
    # most 5 of them work at the same time. The semaphore is what enforces
    # the limit, so every worker has to hold it while it runs.
    semaphore = threading.BoundedSemaphore(5)

    def _crawl_page(page_url):
        """Run get_img for one page while holding a semaphore slot."""
        with semaphore:
            get_img(page_url)

    threads = []
    for index in range(1, 30 + 1):
        t = threading.Thread(
            target=_crawl_page,
            args=(f"https://www.3gbizhi.com/meinv/index_{index}.html",),
        )
        t.start()
        threads.append(t)

    # join() blocks until each worker is done without spinning the CPU the
    # way polling threading.active_count() in a loop does.
    for t in threads:
        t.join()
    print("所有进程已结束!!!")

    # Plain sequential alternative (no threads), kept for reference:
    # url = 'https://www.3gbizhi.com/sjbz/index_{}.html'
    # for i in range(1, 118 + 1):
    #     get_img(f'https://www.3gbizhi.com/sjbz/index_{i}.html')
效果图
参考地址:https://dream.blog.csdn.net/article/details/117918309
在此感谢 梦想橡皮擦 大佬的优秀文章