Fetching image data with a web crawler

First change the save path on line 19 of GetSrc.py (the save_local assignment) to your own folder, then follow the steps below.

save_local = 'G:\\写真'
Run order: RequestUrl.py --> pagesize.py --> GetSrc.py
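
If you prefer not to launch the three stages by hand, a minimal driver sketch like the one below can run them in order. The file name run_all.py and the use of subprocess are assumptions for illustration, not part of the original project.

run_all.py

# Hypothetical helper: runs the three stages in the documented order
# using the current Python interpreter.
import subprocess
import sys

for script in ('RequestUrl.py', 'pagesize.py', 'GetSrc.py'):
    print(f'Running {script} ...')
    # check=True aborts the pipeline as soon as any stage fails
    subprocess.run([sys.executable, script], check=True)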

First, write out all of the code as follows:

download.py

import os
import threading

import requests
from tqdm import tqdm


def download_file(url, output_folder, pbar):
    """Download one image into output_folder and advance the shared progress bar."""
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            filename = url.split('/')[-1]
            with open(os.path.join(output_folder, filename), 'wb') as f:
                # stream the body in 4 KB chunks so large images are not held in memory
                for data in response.iter_content(chunk_size=4096):
                    f.write(data)
        pbar.update(1)  # the bar counts files (total=len(lines)), so advance by one per file
    except Exception as e:
        print(f"Error while downloading {url} ......")
        print(f'Exception: {e}')


def download_photos_from_file(file_path, output_folder):
    """Read one image URL per line from file_path and download them all concurrently."""
    count = 0  # defined before the try block so the except handler can always report it
    try:
        print("Downloading........")
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        with open(file_path, 'r') as file:
            lines = [line.strip() for line in file if line.strip()]

        with tqdm(total=len(lines), desc="Downloading images") as pbar:
            threads = []
            for url in lines:
                # one thread per URL; fine for the size of a single album
                thread = threading.Thread(target=download_file, args=(url, output_folder, pbar))
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()
                count += 1

        print(f"Total photos downloaded: {count}")
    except Exception as e:
        print(f"Error while downloading photos ({count} finished) ......")
        print(f'Exception: {e}')


# Call the function with the URL list file and the output folder
# download_photos_from_file('src.txt', 'G:\写真\ 2024.01.18 NO.7979 大美妞儿')
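
Starting one thread per URL is fine for a single album, but it puts no cap on concurrency. As a minimal sketch of an alternative (not part of the original code; the max_workers value is an arbitrary assumption), the same download_file helper can be driven from a bounded pool by adding this function to download.py:

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_photos_pooled(file_path, output_folder, max_workers=8):
    """Hypothetical variant of download_photos_from_file with bounded concurrency."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    with open(file_path, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    with tqdm(total=len(urls), desc="Downloading images") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(download_file, url, output_folder, pbar) for url in urls]
            for future in as_completed(futures):
                future.result()  # download_file logs its own errors; this surfaces anything unexpected
    print(f"Total photos downloaded: {len(urls)}")
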
cleanAllFile.py
def cleanAllFile(file_name):
    """Empty the given text file (opening it in 'w' mode already truncates it)."""
    with open(file_name, 'w', encoding='utf-8') as f:
        f.truncate(0)
    print(f'{file_name} has been emptied')
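
Note that open(..., 'w') creates the file if it does not exist yet, so the very first run works even before url.txt and pagesize.txt exist. The scripts below use the helper like this:

import cleanAllFile

cleanAllFile.cleanAllFile('url.txt')       # reset the list of album links
cleanAllFile.cleanAllFile('pagesize.txt')  # reset the list of page counts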

RequestUrl.py
import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

# Empty url.txt and pagesize.txt so every run starts from a clean state
cleanAllFile.cleanAllFile('url.txt')
cleanAllFile.cleanAllFile('pagesize.txt')

url = 'https://www.hh12345.cc'
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
atags = soup.find_all('a')
count = 0
# Site-specific filter: skip the first tags and keep every second one after index 8
with open('url.txt', 'a') as f:
    for index, value in enumerate(atags):
        if index > 8 and index % 2 == 0:
            link = value.get('href')
            if link:
                f.write(link)
                f.write('\n')
                count += 1
print(f'Wrote {count} URLs to url.txt')
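
The href values are written to url.txt exactly as they appear in the page. If the site ever emits relative links, pagesize.py could not request them directly; as an optional, assumed addition (not in the original script), urllib.parse.urljoin can normalize them against the site root before writing:

from urllib.parse import urljoin

base = 'https://www.hh12345.cc'
with open('url.txt', 'a') as f:
    for index, value in enumerate(atags):
        if index > 8 and index % 2 == 0:
            link = value.get('href')
            if link:
                f.write(urljoin(base, link) + '\n')  # absolute URL whether or not href was relative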

pagesize.py

import re

import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

with open('url.txt', 'r') as f:
    cleanAllFile.cleanAllFile('pagesize.txt')
    count = 0  # albums whose page count and name were found
    shang = 0  # albums whose title did not match the pattern (尚物集 sets)
    for line in f:
        line = line.strip()
        if not line:
            continue
        req = requests.get(line, headers=headers)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'html.parser')
        title = soup.find('title').get_text()
        pattern = r'\[.*?\](.*?)_美女写真网_hh123\.cc'  # extract the album name from the page title
        imgtag = soup.find_all(class_='page-list')
        for img in imgtag:
            imgss = img.find_all('a')
            number1 = None
            for a_tag in imgss:
                # the last numeric link in the pager is the total page count
                number = re.search(r'\d+', a_tag.text)
                if number:
                    number1 = number.group()
            match = re.findall(pattern, title)
            if number1 and len(match) > 0:
                count += 1
                # match is a list, e.g. ['album name']; GetSrc.py strips the
                # brackets and quotes when it reads the line back
                with open('pagesize.txt', 'a') as out:
                    out.write(f'{number1}@{line}@{match}')
                    out.write('\n')
            else:
                shang += 1
                continue
print(f'{count} entries matched a page count and name and were written; {shang} entries did not match a name and were skipped')
print('pagesize.txt is ready; run GetSrc.py to start downloading (each run downloads one album)')
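
For reference, this is what one line of pagesize.txt looks like and how GetSrc.py takes it apart. The page count, URL and album name below are made-up placeholders, not real data from the site:

# Hypothetical pagesize.txt line (placeholder values for illustration only)
sample = "5@https://www.hh12345.cc/example/12345.html@['Example Album']"

pagesize, url, cleaned_text = sample.split('@')
name = cleaned_text.replace('[', '').replace(']', '').replace("'", '')
print(pagesize)  # 5 -> number of pages in the album
print(url)       # https://www.hh12345.cc/example/12345.html
print(name)      # Example Album -> becomes the output folder name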

GetSrc.py

import os
import re
import sys

import requests
from bs4 import BeautifulSoup

import cleanAllFile
import download

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}


def get_src():
    """Collect the image URLs of the first album listed in pagesize.txt.

    Returns the output folder for that album, or None if there is nothing to do.
    """
    save_local = 'G:\\写真'   # base folder for all downloads; change this to your own path
    src_list = []             # list of collected image URLs
    name = None               # output folder of the album being processed
    if os.path.getsize('pagesize.txt') > 0:
        with open('pagesize.txt', 'r') as f:
            for line in f:
                lines = line.strip().split('@')
                if len(lines) >= 3:
                    pagesize = lines[0]
                    url = lines[1]
                    cleaned_text = lines[2]
                    # pagesize.py stored the name as a list literal; strip the brackets and quotes
                    name = cleaned_text.replace("[", "").replace("]", "").replace("'", "")
                else:
                    print("Error: Line does not have enough elements to unpack:", line)
                    continue  # skip malformed lines

                output_folder = save_local + '\\' + name
                if os.path.exists(output_folder) and os.listdir(output_folder):
                    print("This album has already been downloaded; no need to download it again.")
                    cleanpage()
                    name = None
                    continue  # move on to the next line

                urls = url  # remember the base URL of page 1
                print(f'{urls} starting download of {name}')
                count = 0  # number of image URLs collected
                for i in range(int(pagesize)):
                    if i != 0:
                        # page i of an album is <base>_<i>.html
                        last_dot = url.rfind('.')
                        if last_dot != -1:  # make sure a dot was found
                            url = url[:last_dot] + '_' + str(i) + url[last_dot:]
                    req = requests.get(url, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                    atag = soup.find_all('img', class_=True)
                    for src in atag:
                        srcs = src.get('src', 'No Src')
                        src_list.append(srcs)  # collect the image URL
                        count += 1
                    url = urls  # reset to the base URL before building the next page URL
                print(f'{count} image URLs have been collected')

                # if name is non-empty, build the full output path
                if name:
                    name = save_local + '\\' + name
                else:
                    name = save_local + '\\无名'
                break  # one album per run

    # write all collected image URLs to the file in one go
    with open('src.txt', 'a') as f:
        for src in src_list:
            f.write(src)
            f.write('\n')

    print("All image URLs have been saved to src.txt")

    if name:  # return the output folder of the album just processed
        return name
    else:
        print("The file is empty; everything has been downloaded, go fetch new data")
        return None
def cleanpage():
    """Remove the first line of pagesize.txt (the album that was just handled)."""
    with open('pagesize.txt', 'r', encoding='gbk') as file:
        lines = file.readlines()

    # drop the first line
    new_content = ''.join(lines[1:])

    # write the modified content back to the file
    with open('pagesize.txt', 'w', encoding='gbk') as file:
        file.write(new_content)
name = get_src()
if name:
    download.download_photos_from_file('src.txt', name)
    cleanAllFile.cleanAllFile('src.txt')
# print('Delete the links that have already been downloaded? y/n')
# judge = input()
# if judge == 'y':
#     # read the file contents
#     with open('pagesize.txt', 'r', encoding='gbk') as file:
#         lines = file.readlines()
#
#     # drop the first line
#     new_content = ''.join(lines[1:])
#
#     # write the modified content back to the file
#     with open('pagesize.txt', 'w', encoding='gbk') as file:
#         file.write(new_content)

while True:
    print('Quit? Enter 1 for yes (anything else continues)')
    judge = input()
    if judge != '1':
        cleanpage()
        name = get_src()
        if name:
            download.download_photos_from_file('src.txt', name)
            cleanAllFile.cleanAllFile('src.txt')
    else:
        cleanpage()
        sys.exit()
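
To make the page-URL construction in get_src concrete: for every page after the first, the script inserts '_<page index>' just before the file extension of the album's first-page URL. The URL below is a made-up placeholder:

# Illustration of the pagination rule used in get_src (placeholder URL)
url = 'https://www.hh12345.cc/example/12345.html'
last_dot = url.rfind('.')
for i in range(3):
    page_url = url if i == 0 else url[:last_dot] + '_' + str(i) + url[last_dot:]
    print(page_url)
# https://www.hh12345.cc/example/12345.html
# https://www.hh12345.cc/example/12345_1.html
# https://www.hh12345.cc/example/12345_2.html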
