Batch-fetching and downloading images from https://www.vilipix.com
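The scripts below need only requests and beautifulsoup4 (pip install requests beautifulsoup4). One caveat before starting: some sites reject clients that do not look like a browser, so if requests.get comes back with an error page, setting a User-Agent header usually helps. A minimal sketch of that workaround (the header value is just an example; none of the code below sets it):

import requests

# Hypothetical header: some sites block clients that do not look like a browser
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://www.vilipix.com/', headers=headers)
print(response.status_code)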
Fetching the site's data
Getting all the <a> tags
import requests
from bs4 import BeautifulSoup

def getImg(url, afile):
    # Request the page, e.g. url = 'https://www.vilipix.com/'
    response = requests.get(url)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect every <a> tag inside the <body>
    body = soup.find('body')
    a_tags = body.find_all('a')
    count = 0
    # Append each <a> tag to the output file, one per line
    with open(afile, 'a', encoding='utf-8') as f:
        for a in a_tags:
            f.write(str(a))
            f.write('\n')
            count += 1
    print(f"Wrote {count} <a> tags")
Data processing
Extracting the ids from the file
import re

def getImgId(input_url, output_url):
    count = 0
    # Read the saved <a> tags and pull each illust id out of href="/illust/<id>"
    with open(input_url, 'r', encoding='utf-8') as input_file, \
         open(output_url, 'a', encoding='utf-8') as output_file:
        for line in input_file:
            data = line.strip()  # drop the newline and surrounding whitespace
            result = re.search(r'href="/illust/(.*?)"', data)
            if result:
                illust_id = result.group(1)
                output_file.write(illust_id + '\n')  # write the extracted id
            else:
                count += 1
    print(f"{count} lines contained no /illust link")
Deduplicating the ids
def delRepeat(file_path, out_file_path):
    # A set keeps only unique entries, so duplicates disappear automatically
    content_set = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            content_set.add(line.strip())  # strip the newline before comparing
    # Write the deduplicated ids to the output file
    with open(out_file_path, 'a', encoding='utf-8') as output_file:
        for content in content_set:
            output_file.write(content + '\n')
    print("Duplicates removed; unique ids written to file")
Joining the ids into illust page links
def linkUrl(input_path, output_path):
    # Turn each bare id into a full illust page URL
    with open(input_path, 'r', encoding='utf-8') as file1, \
         open(output_path, 'w', encoding='utf-8') as file2:
        for line in file1:
            illust_id = line.strip()  # strip the newline
            file2.write("https://www.vilipix.com/illust/" + illust_id + '\n')
    print("Built the illust page links")
Reading the illust page links and fetching their data
import requests
import getimg

def getSrc(readurl):
    # Read the list of illust page URLs
    with open(readurl, 'r', encoding='utf-8') as file:
        urls = file.readlines()
    # Fetch each page and hand the response to getimg.gethtml_img
    for url in urls:
        response = requests.get(url.strip())  # strip stray whitespace from the URL
        getimg.gethtml_img(response)
    print('Collected the <img> tags')
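Fetching many pages in a tight loop can get a client rate-limited or blocked. A hedged variant that pauses between requests (the one-second default is arbitrary):

import time
import requests
import getimg

def getSrc_polite(readurl, delay=1.0):
    # Same flow as getSrc above, but with a pause between requests
    with open(readurl, 'r', encoding='utf-8') as file:
        for url in file:
            response = requests.get(url.strip())
            getimg.gethtml_img(response)
            time.sleep(delay)  # pause so requests are not back-to-back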
Processing the fetched pages to extract the <img> tags
from bs4 import BeautifulSoup

def gethtml_img(htmldata):
    # htmldata is a requests.Response; parse its HTML body
    soup = BeautifulSoup(htmldata.text, 'html.parser')
    # Find every <img> tag on the page
    img_tags = soup.find_all('img')
    # Keep only pages with at least two <img> tags, then drop the
    # first and last one (page chrome rather than artwork)
    if len(img_tags) >= 2:
        result = ""
        for img_tag in img_tags[1:-1]:
            result += str(img_tag) + "\n"  # keep the whole tag so alt/src can be parsed later
        with open('img.txt', 'a', encoding='utf-8') as file:
            file.write(result)
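Dropping the first and last <img> assumes they are page chrome such as a logo or avatar, which breaks if the layout changes. A more robust sketch filters on the src attribute instead (the extension list is an assumption about how the artwork is served, not confirmed site behavior):

def filter_artwork_imgs(img_tags):
    # Hypothetical filter: keep only tags whose src looks like an image file
    return [t for t in img_tags
            if str(t.get('src', '')).lower().endswith(('.jpg', '.jpeg', '.png'))]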
Converting the <img> tags into key-value pairs
from bs4 import BeautifulSoup

def extract_and_save(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    # Parse the saved <img> tags
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img')
    result = ""
    for img_tag in img_tags:
        alt = img_tag.get('alt', 'No Alt Text')  # fall back when alt is missing
        src = img_tag.get('src', 'No Src')       # fall back when src is missing
        # Join name and link with the separator '^/*\^ ' (a raw string avoids the \^ escape warning)
        result += alt + r'^/*\^ ' + src + '\n'
    with open('src.txt', 'w', encoding='utf-8') as output_file:
        output_file.write(result)
    print('img tag info converted')

# Pass in the path of the file holding the saved <img> tags
# extract_and_save('img.txt')
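The separator '^/*\^ ' was presumably chosen because it is unlikely to appear in a title or URL; the download step below splits on the same string. A quick round-trip check (the title and URL are made up):

line = 'some title' + r'^/*\^ ' + 'https://example.com/a.jpg\n'
name, url = line.split(r'^/*\^ ')
print(name)          # -> some title
print(url.strip())   # -> https://example.com/a.jpg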
Downloading the images
import os
import re
import requests

def download(file_path, save_folder):
    print('Downloading...')
    count = 0
    # Create the destination folder if it does not exist
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    def clean_filename(name):
        # Replace characters that Windows filenames do not allow
        return re.sub(r'[<>:"/\\|?*]', ' ', name).strip()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the separator written by extract_and_save
            name, url = line.split(r'^/*\^ ')
            name = name.strip()
            url = url.strip()  # drop the trailing newline
            response = requests.get(url)
            if response.status_code == 200:
                image_name = f'{clean_filename(name)}.jpg'
                image_path = os.path.join(save_folder, image_name)
                # Append a numeric suffix when the name is already taken
                counter = 1
                while os.path.exists(image_path):
                    new_name = f'{clean_filename(name)}_{counter}.jpg'
                    image_path = os.path.join(save_folder, new_name)
                    counter += 1
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                count += 1
            else:
                print(f'Failed to download the image for {name}')
    print(f'All downloads finished: {count} images')

# download('src.txt', r'D:\二次元照片\长腿')
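Two possible refinements: response.content loads the whole image into memory, and some image hosts refuse requests without a Referer header. If downloads fail or files are large, a hedged variant (the header value is a guess, not confirmed behavior of this site) streams the body to disk in chunks:

import requests

def download_one(url, image_path):
    # Hypothetical headers: some image CDNs check Referer before serving files
    headers = {'Referer': 'https://www.vilipix.com/'}
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(image_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)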
Clearing the working files
# Working files produced by the pipeline
file_list = ['atag.txt', 'delrepeat.txt', 'img.txt', 'imgid.txt', 'src.txt', 'url.txt']

def cleanAllFile():
    for file_name in file_list:
        # Opening in 'w' mode already truncates the file to zero length
        with open(file_name, 'w', encoding='utf-8'):
            pass
    print('All working files cleared')

# Guard the call so importing this module does not wipe the files
if __name__ == '__main__':
    cleanAllFile()
Main program
import getimg_atag
import delRepeat
import getimgId
import linkUrl
import getSrc
import download
import img_transform
import cleanAllFile
from urllib.parse import quote

if __name__ == '__main__':
    try:
        while True:
            print("Enter the save location (required)")
            savefilelocal = input()
            if savefilelocal != '':
                break
        print('Enter a keyword (optional)')
        title = input()
        if title:
            # Search page for the keyword (percent-encode non-ASCII characters)
            url = 'https://www.vilipix.com/tags/' + quote(title) + '/illusts'
        else:
            print('1. New works  2. Rankings  3. Features (optional)')
            choose = input()
            if choose == '1':  # input() returns a string, so compare against strings
                url = 'https://www.vilipix.com/new'
            elif choose == '2':
                url = 'https://www.vilipix.com/ranking'
            elif choose == '3':
                url = 'https://www.vilipix.com/p'
            else:
                url = 'https://www.vilipix.com'
        # File holding the scraped <a> tags
        afile = "atag.txt"
        # File holding the deduplicated ids
        atag_delrepeat = "delrepeat.txt"
        # File holding every extracted id
        imgid = "imgid.txt"
        # File holding the ids joined into page links
        imgurl = "url.txt"
        # Download destination
        downlocal = savefilelocal
        # File holding the collected <img> tags
        imglocal = 'img.txt'
        # File holding the "name^/*\^ link" pairs
        namePic = 'src.txt'
        # 1. Scrape all <a> tags
        getimg_atag.getImg(url, afile)
        # 2. Extract the illust ids
        getimgId.getImgId(afile, imgid)
        # 3. Deduplicate
        delRepeat.delRepeat(imgid, atag_delrepeat)
        # 4. Join the ids into page links
        linkUrl.linkUrl(atag_delrepeat, imgurl)
        # 5. Fetch each page and collect the <img> tags
        getSrc.getSrc(imgurl)
        # 6. Convert the <img> tags into name/link pairs
        img_transform.extract_and_save(imglocal)
        # 7. Download the images
        download.download(namePic, downlocal)
        # 8. Clear the working files if nothing went wrong
        cleanAllFile.cleanAllFile()
    except Exception as e:
        print(f"Exception occurred: {e}")