# Paginated crawl of a website's image galleries with Python.
import requests
import re
import os
import time
"""Crawl image galleries: fetch the site index, follow each article link
(https://.../<digits>.html), and download every .jpeg an article references
into a folder named after the article title.
"""

WEB_PAGE = 'https://www.vmgirls.com/'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}


def extract_article_urls(html):
    """Return the unique article URLs found in *html* (order unspecified).

    Raw string and escaped '.' fix the original pattern 'https://.*?/\\d*.html',
    which used an invalid escape and let the dot match any character.
    """
    return list(set(re.findall(r'https://.*?/\d+\.html', html)))


def extract_title(html):
    """Return the last <h1>-tag text in *html*, used as the download folder name.

    NOTE(review): the original pattern was garbled by the scrape; this assumes
    the article title sits in an <h1> element — confirm against the live markup.
    """
    return re.findall(r'<h1.*?>(.*?)</h1>', html)[-1]


def extract_image_urls(html):
    """Return all https .jpeg URLs found in *html*, in document order."""
    return re.findall(r'https:.*?\.jpeg', html)


def main():
    """Fetch the index, then download every article's images one by one."""
    index_html = requests.get(WEB_PAGE, headers=HEADERS).text
    urls = extract_article_urls(index_html)

    counts = []  # number of images per article, for the final total
    for url in urls:
        html = requests.get(url, headers=HEADERS).text
        # Title lookup hoisted out of the inner loop (original recomputed it
        # once per image).
        dir_name = extract_title(html)
        wget_urls = extract_image_urls(html)
        print("\033[32;1m %s upload %s pictures\033[0m" % (dir_name, len(wget_urls)))
        counts.append(len(wget_urls))

        # exist_ok avoids the check-then-mkdir race of the original.
        os.makedirs(dir_name, exist_ok=True)
        for wget_url in wget_urls:
            time.sleep(1)  # be polite: throttle one request per second
            file_name = wget_url.split('/')[-1]
            print(file_name)
            response = requests.get(wget_url, headers=HEADERS)
            with open(os.path.join(dir_name, file_name), 'wb') as f:
                f.write(response.content)

    # sum() builtin replaces the original manual accumulator named `sum`,
    # which shadowed the builtin.
    print("\033[31;1mThere are %s pictures that need to be crawled\033[0m" % sum(counts))


if __name__ == '__main__':
    main()
# Source: 51CTO blog post by Leo_zhjl. Copyright remains with the original
# author; attribution is required for reposting.