# 爬取唯美小姐姐网站
# 链接: 源代码文件下载地址
# 展示 源代码
import requests
import re
import os
import time
# Request headers that make the scraper look like a regular desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Gallery index page whose thumbnail images are downloaded.
PAGE_URL = "https://www.tupianzj.com/meinv/mm/meizitu/"

# Local directory that receives the downloaded images.
dir_name = "download"

# Seconds to wait on the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Matches <img src="scheme://....jpg"> and captures only the URL.
# Sample of the markup being parsed:
# <LI><a href="/meinv/20200728/214746.html" target="_blank"><img src="https://img.tupianzj.com/uploads/allimg/202007/9999/rn815784ca39.jpg" alt="..." border="0" /></a>
# The extension is written as the literal `\.jpg` (a `[.jpg]` character class
# would match just one of those four characters), and `"` is excluded from the
# URL body so the match cannot run past the closing quote of the attribute.
IMAGE_URL_PATTERN = re.compile(r'<img src="([a-zA-Z]+://[^\s"]+?\.jpg)"')


def extract_image_urls(html):
    """Return the absolute ``.jpg`` URLs found in ``<img src="...">`` tags.

    Args:
        html: The page markup as text.

    Returns:
        A list of URL strings in page order (empty when nothing matches).
        The URLs are returned bare, without the ``<img src="`` prefix.
    """
    return IMAGE_URL_PATTERN.findall(html)


def image_file_name(url):
    """Return the last path segment of *url*, used as the local file name."""
    return url.rsplit('/', 1)[-1]


def download_image(url, target_dir):
    """Download *url* and store it inside *target_dir*.

    Args:
        url: Absolute URL of the image.
        target_dir: Existing directory the image is written to.

    Returns:
        The file name (not the full path) the image was saved under.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status.
        OSError: If the file cannot be written.
    """
    file_name = image_file_name(url)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail before writing so an HTTP error page is never saved as an image.
    response.raise_for_status()
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    return file_name


def main():
    """Fetch the gallery page, extract its image URLs and download them all."""
    # The short sleeps only pace the progress messages; they are not required
    # by the site or by the download logic.
    print("网页请求中...")
    time.sleep(0.5)
    response = requests.get(PAGE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html = response.text
    print("网页信息已获取...")
    time.sleep(0.5)

    print("网页信息解析中...")
    urls = extract_image_urls(html)
    time.sleep(0.5)

    print("建立下载目录...")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_name, exist_ok=True)
    time.sleep(0.5)
    print("下载目录:/",dir_name,'建立完成...')
    time.sleep(0.5)

    print("网页图片请求中...")
    for url in urls:
        file_name = image_file_name(url)
        try:
            download_image(url, dir_name)
        except (requests.RequestException, OSError) as exc:
            # One broken image should not abort the remaining downloads.
            print("图片文件: {0:25}{1}".format(file_name, " 下载失败: {0}".format(exc)))
            continue
        print("图片文件: {0:25}{1}".format(file_name," 成功下载..."))
    time.sleep(0.5)
    print("图片爬取完成...")
    print("当前时间: ",time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))


if __name__ == "__main__":
    main()
# 效果展示1:
# 效果展示2:
# 效果展示3: