First, change the save path in GetSrc.py (the save_local variable at the top of get_src()) to your own directory, then carry out the steps below:
save_local = 'G:\\写真'
Run order: RequestUrl.py --> pagesize.py --> GetSrc.py
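If you would rather kick off the whole pipeline in one command than run the three scripts by hand, a minimal driver along these lines works (run_all.py is a hypothetical convenience script, not part of the original project):

# run_all.py -- hypothetical one-shot driver, not part of the original project
import subprocess
import sys

# Run the stages in the documented order. GetSrc.py prompts interactively
# and downloads one album per pass, so it stays last.
for script in ('RequestUrl.py', 'pagesize.py', 'GetSrc.py'):
    subprocess.run([sys.executable, script], check=True)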
Write out all of the code first, as follows:
download.py
import os
import threading

import requests
from tqdm import tqdm

def download_file(url, output_folder, pbar):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            filename = url.split('/')[-1]
            with open(os.path.join(output_folder, filename), 'wb') as f:
                # Stream the body in chunks so large images are not held in memory
                for data in response.iter_content(chunk_size=4096):
                    f.write(data)
    except Exception as e:
        print(f"Error while downloading {url}......")
        print(f'Exception: {e}')
    finally:
        pbar.update(1)  # the progress bar counts files, so advance by one per URL

def download_photos_from_file(file_path, output_folder):
    try:
        print("Downloading........")
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        with open(file_path, 'r') as file:
            lines = [line for line in file if line.strip()]  # skip blank lines
        count = 0
        with tqdm(total=len(lines), desc="Downloading images") as pbar:
            threads = []
            for line in lines:
                url = line.strip()
                thread = threading.Thread(target=download_file, args=(url, output_folder, pbar))
                threads.append(thread)
                thread.start()
            for thread in threads:
                thread.join()
                count += 1
        print(f"Total photos downloaded: {count}")
    except Exception as e:
        print("Error while downloading photos......")
        print(f'Exception: {e}')

# Call the function with the URL-list file and the output folder, e.g.:
# download_photos_from_file('src.txt', 'G:\\写真\\2024.01.18 NO.7979 大美妞儿')
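download_photos_from_file starts one thread per URL, so an album with hundreds of images opens hundreds of connections at once. If that becomes a problem, the same download_file/tqdm interface fits a bounded pool from the standard library. This is a sketch that reuses the os/tqdm imports above when dropped into download.py; download_photos_pooled and the 8-worker limit are my own choices, not the project's:

from concurrent.futures import ThreadPoolExecutor

def download_photos_pooled(file_path, output_folder, max_workers=8):
    # Same behaviour as download_photos_from_file, but never more than
    # max_workers downloads in flight at a time
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    with open(file_path, 'r') as file:
        urls = [line.strip() for line in file if line.strip()]
    with tqdm(total=len(urls), desc="Downloading images") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for url in urls:
                pool.submit(download_file, url, output_folder, pbar)
        # leaving the executor block waits for every queued download
    print(f"Total photos processed: {len(urls)}")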
cleanAllFile.py
def cleanAllFile(file_name):
    # Truncate the file so the next run starts from a clean slate
    with open(file_name, 'w', encoding='utf-8') as f:
        f.truncate(0)
    print(f'{file_name} has been emptied')
RequestUrl.py
import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

# Empty url.txt and pagesize.txt so they are rebuilt from scratch
cleanAllFile.cleanAllFile('url.txt')
cleanAllFile.cleanAllFile('pagesize.txt')

url = 'https://www.hh12345.cc'
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
atags = soup.find_all('a')
count = 0
for index, value in enumerate(atags):
    # Positional filter: only every second <a> tag after index 8 is an
    # album link (site-specific heuristic)
    if index > 8 and index % 2 == 0:
        link = value.get('href')
        if link:
            with open('url.txt', 'a') as f:
                f.write(link)
                f.write('\n')
            count += 1
print(f'Wrote {count} urls to url.txt')
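The index > 8 / every-second-tag filter is purely positional, so it breaks as soon as the site shuffles its navigation. A more robust variant tests the href itself; the endswith('.html') check below is an assumption about what album links look like on this site and would need verifying against the real markup:

# Sketch: pick album links by href shape instead of position.
# The '.html' test is an assumed marker of album pages.
count = 0
for a in soup.find_all('a', href=True):
    link = a['href']
    if link.endswith('.html'):
        with open('url.txt', 'a') as f:
            f.write(link + '\n')
        count += 1
print(f'Wrote {count} urls to url.txt')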
pagesize.py
import re

import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

with open('url.txt', 'r') as f:
    cleanAllFile.cleanAllFile('pagesize.txt')
    count = 0  # entries matched and written
    shang = 0  # entries whose title did not match (e.g. 尚物集 pages)
    for line in f:
        line = line.strip()
        if not line:
            continue
        req = requests.get(line, headers=headers)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'html.parser')
        title = soup.find('title').get_text()
        # Capture the album name between the bracketed prefix and the site suffix
        pattern = r'\[.*?\](.*?)_美女写真网_hh123\.cc'
        imgtag = soup.find_all(class_='page-list')
        for img in imgtag:
            imgss = img.find_all('a')
            for a_tag in imgss:
                number = re.search(r'\d+', a_tag.text)
                if not number:
                    continue
                number1 = number.group()
                match = re.findall(pattern, title)
                if match:
                    count += 1
                    # Record "pagecount@album url@album name" for GetSrc.py
                    with open('pagesize.txt', 'a') as out:
                        out.write(f'{number1}@{line}@{match}')
                        out.write('\n')
                else:
                    shang += 1
print(f'{count} entries matched a page count and name and were written; {shang} entries had no matching name and were skipped')
print('pagesize.txt is ready; run GetSrc to start downloading -- each run downloads one album')
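To check what the title pattern actually captures, it helps to run it against a title of the expected shape; the title below is invented, only its bracketed prefix and _美女写真网_hh123.cc suffix mirror the pattern:

import re

pattern = r'\[.*?\](.*?)_美女写真网_hh123\.cc'
title = '[写真]2024.01.18 NO.7979 大美妞儿_美女写真网_hh123.cc'  # invented example
print(re.findall(pattern, title))  # -> ['2024.01.18 NO.7979 大美妞儿']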
GetSrc.py
import os
import re
import sys

import requests
from bs4 import BeautifulSoup

import cleanAllFile
import download

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

def get_src():
    save_local = 'G:\\写真'  # change this to your own save path
    src_list = []  # collected image URLs
    name = None
    if os.path.getsize('pagesize.txt') > 0:
        with open('pagesize.txt', 'r') as f:
            for line in f:
                parts = line.strip().split('@')
                if len(parts) >= 3:
                    pagesize, url, cleaned_text = parts[0], parts[1], parts[2]
                    # pagesize.py wrote the name as a list literal; strip the decoration
                    name = cleaned_text.replace("[", "").replace("]", "").replace("'", "")
                else:
                    print("Error: line does not have enough elements to unpack:", line)
                    continue  # move on to the next line
                output_folder = os.path.join(save_local, name)
                if os.path.exists(output_folder) and os.listdir(output_folder):
                    print("This album has already been downloaded; skipping.")
                    cleanpage()
                    name = None  # forget this name so it is not returned by mistake
                    continue  # already downloaded, move on to the next line
                base_url = url
                print(f'{base_url} -- starting download of {name}')
                count = 0  # number of image URLs collected
                for i in range(int(pagesize)):
                    if i != 0:
                        # Page i of an album inserts '_i' before the file extension
                        last_dot = base_url.rfind('.')
                        if last_dot != -1:  # make sure a dot was found
                            url = base_url[:last_dot] + '_' + str(i) + base_url[last_dot:]
                    req = requests.get(url, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                    for img in soup.find_all('img', class_=True):
                        src_list.append(img.get('src', 'No Src'))
                        count += 1
                print(f'{count} image addresses collected')
                # Build the full folder path to hand back to the downloader
                if name:
                    name = os.path.join(save_local, name)
                else:
                    name = os.path.join(save_local, '无名')
                break  # one album per run
    # Write all collected image addresses to src.txt in one pass
    with open('src.txt', 'a') as f:
        for src in src_list:
            f.write(src)
            f.write('\n')
    print("All image addresses saved to src.txt")
    if name:  # return the folder of the album just processed
        return name
    print("pagesize.txt is empty -- everything has been downloaded; fetch fresh data")
    return None

def cleanpage():
    # Drop the first line of pagesize.txt (the album just handled)
    with open('pagesize.txt', 'r', encoding='gbk') as file:
        lines = file.readlines()
    with open('pagesize.txt', 'w', encoding='gbk') as file:
        file.write(''.join(lines[1:]))

name = get_src()
if name:
    download.download_photos_from_file('src.txt', name)
    cleanAllFile.cleanAllFile('src.txt')
while True:
    print('Quit? 1 = yes, anything else continues')
    judge = input()
    if judge != '1':
        cleanpage()
        name = get_src()
        if name:
            download.download_photos_from_file('src.txt', name)
            cleanAllFile.cleanAllFile('src.txt')
    else:
        cleanpage()
        sys.exit()
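The per-page URL scheme that get_src relies on, inserting _i before the file extension for every page after the first, is easy to sanity-check in isolation. Here page_url is a hypothetical helper and the example URL is made up to match the pattern:

def page_url(base_url, i):
    # Page 0 is the base URL; page i >= 1 inserts '_i' before the extension
    if i == 0:
        return base_url
    last_dot = base_url.rfind('.')
    if last_dot == -1:
        return base_url
    return f'{base_url[:last_dot]}_{i}{base_url[last_dot:]}'

print(page_url('https://www.hh12345.cc/photo/1234.html', 2))
# -> https://www.hh12345.cc/photo/1234_2.html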