Fetching image data with a web crawler

First change the save path on line 19 of GetSrc.py (the save_local assignment) to your own folder, then follow the steps below.

save_local = 'G:\\写真'
Run order: RequestUrl.py --> pagesize.py --> GetSrc.py
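
If you prefer not to launch the three stages by hand, a minimal driver sketch like the one below can run them in order. The file name run_all.py and the use of subprocess are assumptions for illustration, not part of the original project.

run_all.py

# Hypothetical helper: runs the three stages in the documented order
# using the current Python interpreter.
import subprocess
import sys

for script in ('RequestUrl.py', 'pagesize.py', 'GetSrc.py'):
    print(f'Running {script} ...')
    # check=True aborts the pipeline as soon as any stage fails
    subprocess.run([sys.executable, script], check=True)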

First, write out all of the code as follows:

download.py

import os
import threading

import requests
from tqdm import tqdm


def download_file(url, output_folder, pbar):
    """Download one image into output_folder and advance the shared progress bar."""
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            filename = url.split('/')[-1]
            with open(os.path.join(output_folder, filename), 'wb') as f:
                # stream the body in 4 KB chunks so large images are not held in memory
                for data in response.iter_content(chunk_size=4096):
                    f.write(data)
        pbar.update(1)  # the bar counts files (total=len(lines)), so advance by one per file
    except Exception as e:
        print(f"Error while downloading {url} ......")
        print(f'Exception: {e}')


def download_photos_from_file(file_path, output_folder):
    """Read one image URL per line from file_path and download them all concurrently."""
    count = 0  # defined before the try block so the except handler can always report it
    try:
        print("Downloading........")
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        with open(file_path, 'r') as file:
            lines = [line.strip() for line in file if line.strip()]

        with tqdm(total=len(lines), desc="Downloading images") as pbar:
            threads = []
            for url in lines:
                # one thread per URL; fine for the size of a single album
                thread = threading.Thread(target=download_file, args=(url, output_folder, pbar))
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()
                count += 1

        print(f"Total photos downloaded: {count}")
    except Exception as e:
        print(f"Error while downloading photos ({count} finished) ......")
        print(f'Exception: {e}')


# Call the function with the URL list file and the output folder
# download_photos_from_file('src.txt', 'G:\写真\ 2024.01.18 NO.7979 大美妞儿')
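
Starting one thread per URL is fine for a single album, but it puts no cap on concurrency. As a minimal sketch of an alternative (not part of the original code; the max_workers value is an arbitrary assumption), the same download_file helper can be driven from a bounded pool by adding this function to download.py:

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_photos_pooled(file_path, output_folder, max_workers=8):
    """Hypothetical variant of download_photos_from_file with bounded concurrency."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    with open(file_path, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    with tqdm(total=len(urls), desc="Downloading images") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(download_file, url, output_folder, pbar) for url in urls]
            for future in as_completed(futures):
                future.result()  # download_file logs its own errors; this surfaces anything unexpected
    print(f"Total photos downloaded: {len(urls)}")
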
cleanAllFile.py
def cleanAllFile(file_name):
    """Empty the given text file (opening it in 'w' mode already truncates it)."""
    with open(file_name, 'w', encoding='utf-8') as f:
        f.truncate(0)
    print(f'{file_name} has been emptied')
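
Note that open(..., 'w') creates the file if it does not exist yet, so the very first run works even before url.txt and pagesize.txt exist. The scripts below use the helper like this:

import cleanAllFile

cleanAllFile.cleanAllFile('url.txt')       # reset the list of album links
cleanAllFile.cleanAllFile('pagesize.txt')  # reset the list of page counts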

RequestUrl.py
import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

# Empty url.txt and pagesize.txt so every run starts from a clean state
cleanAllFile.cleanAllFile('url.txt')
cleanAllFile.cleanAllFile('pagesize.txt')

url = 'https://www.hh12345.cc'
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
atags = soup.find_all('a')
count = 0
# Site-specific filter: skip the first tags and keep every second one after index 8
with open('url.txt', 'a') as f:
    for index, value in enumerate(atags):
        if index > 8 and index % 2 == 0:
            link = value.get('href')
            if link:
                f.write(link)
                f.write('\n')
                count += 1
print(f'Wrote {count} URLs to url.txt')
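
The href values are written to url.txt exactly as they appear in the page. If the site ever emits relative links, pagesize.py could not request them directly; as an optional, assumed addition (not in the original script), urllib.parse.urljoin can normalize them against the site root before writing:

from urllib.parse import urljoin

base = 'https://www.hh12345.cc'
with open('url.txt', 'a') as f:
    for index, value in enumerate(atags):
        if index > 8 and index % 2 == 0:
            link = value.get('href')
            if link:
                f.write(urljoin(base, link) + '\n')  # absolute URL whether or not href was relative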

pagesize.py

import re

import requests
from bs4 import BeautifulSoup

import cleanAllFile

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}

with open('url.txt', 'r') as f:
    cleanAllFile.cleanAllFile('pagesize.txt')
    count = 0  # albums whose page count and name were found
    shang = 0  # albums whose title did not match the pattern (尚物集 sets)
    for line in f:
        line = line.strip()
        if not line:
            continue
        req = requests.get(line, headers=headers)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'html.parser')
        title = soup.find('title').get_text()
        pattern = r'\[.*?\](.*?)_美女写真网_hh123\.cc'  # extract the album name from the page title
        imgtag = soup.find_all(class_='page-list')
        for img in imgtag:
            imgss = img.find_all('a')
            number1 = None
            for a_tag in imgss:
                # the last numeric link in the pager is the total page count
                number = re.search(r'\d+', a_tag.text)
                if number:
                    number1 = number.group()
            match = re.findall(pattern, title)
            if number1 and len(match) > 0:
                count += 1
                # match is a list, e.g. ['album name']; GetSrc.py strips the
                # brackets and quotes when it reads the line back
                with open('pagesize.txt', 'a') as out:
                    out.write(f'{number1}@{line}@{match}')
                    out.write('\n')
            else:
                shang += 1
                continue
print(f'{count} entries matched a page count and name and were written; {shang} entries did not match a name and were skipped')
print('pagesize.txt is ready; run GetSrc.py to start downloading (each run downloads one album)')
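
For reference, this is what one line of pagesize.txt looks like and how GetSrc.py takes it apart. The page count, URL and album name below are made-up placeholders, not real data from the site:

# Hypothetical pagesize.txt line (placeholder values for illustration only)
sample = "5@https://www.hh12345.cc/example/12345.html@['Example Album']"

pagesize, url, cleaned_text = sample.split('@')
name = cleaned_text.replace('[', '').replace(']', '').replace("'", '')
print(pagesize)  # 5 -> number of pages in the album
print(url)       # https://www.hh12345.cc/example/12345.html
print(name)      # Example Album -> becomes the output folder name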

GetSrc.py

import os
import re
import sys

import requests
from bs4 import BeautifulSoup

import cleanAllFile
import download

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    'Referer': 'Your Referer',  # Referer header indicating the originating page
    'Cache-Control': 'no-cache',
    'Content-Security-Policy': "script-src 'self';",
    'Cross-Origin-Resource-Policy': 'cross-origin',
}


def get_src():
    """Collect the image URLs of the first album listed in pagesize.txt.

    Returns the output folder for that album, or None if there is nothing to do.
    """
    save_local = 'G:\\写真'   # base folder for all downloads; change this to your own path
    src_list = []             # list of collected image URLs
    name = None               # output folder of the album being processed
    if os.path.getsize('pagesize.txt') > 0:
        with open('pagesize.txt', 'r') as f:
            for line in f:
                lines = line.strip().split('@')
                if len(lines) >= 3:
                    pagesize = lines[0]
                    url = lines[1]
                    cleaned_text = lines[2]
                    # pagesize.py stored the name as a list literal; strip the brackets and quotes
                    name = cleaned_text.replace("[", "").replace("]", "").replace("'", "")
                else:
                    print("Error: Line does not have enough elements to unpack:", line)
                    continue  # skip malformed lines

                output_folder = save_local + '\\' + name
                if os.path.exists(output_folder) and os.listdir(output_folder):
                    print("This album has already been downloaded; no need to download it again.")
                    cleanpage()
                    name = None
                    continue  # move on to the next line

                urls = url  # remember the base URL of page 1
                print(f'{urls} starting download of {name}')
                count = 0  # number of image URLs collected
                for i in range(int(pagesize)):
                    if i != 0:
                        # page i of an album is <base>_<i>.html
                        last_dot = url.rfind('.')
                        if last_dot != -1:  # make sure a dot was found
                            url = url[:last_dot] + '_' + str(i) + url[last_dot:]
                    req = requests.get(url, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                    atag = soup.find_all('img', class_=True)
                    for src in atag:
                        srcs = src.get('src', 'No Src')
                        src_list.append(srcs)  # collect the image URL
                        count += 1
                    url = urls  # reset to the base URL before building the next page URL
                print(f'{count} image URLs have been collected')

                # if name is non-empty, build the full output path
                if name:
                    name = save_local + '\\' + name
                else:
                    name = save_local + '\\无名'
                break  # one album per run

    # write all collected image URLs to the file in one go
    with open('src.txt', 'a') as f:
        for src in src_list:
            f.write(src)
            f.write('\n')

    print("All image URLs have been saved to src.txt")

    if name:  # return the output folder of the album just processed
        return name
    else:
        print("The file is empty; everything has been downloaded, go fetch new data")
        return None
def cleanpage():
    """Remove the first line of pagesize.txt (the album that was just handled)."""
    with open('pagesize.txt', 'r', encoding='gbk') as file:
        lines = file.readlines()

    # drop the first line
    new_content = ''.join(lines[1:])

    # write the modified content back to the file
    with open('pagesize.txt', 'w', encoding='gbk') as file:
        file.write(new_content)
name = get_src()
if name:
    download.download_photos_from_file('src.txt', name)
    cleanAllFile.cleanAllFile('src.txt')
# print('Delete the links that have already been downloaded? y/n')
# judge = input()
# if judge == 'y':
#     # read the file contents
#     with open('pagesize.txt', 'r', encoding='gbk') as file:
#         lines = file.readlines()
#
#     # drop the first line
#     new_content = ''.join(lines[1:])
#
#     # write the modified content back to the file
#     with open('pagesize.txt', 'w', encoding='gbk') as file:
#         file.write(new_content)

while True:
    print('Quit? Enter 1 for yes (anything else continues)')
    judge = input()
    if judge != '1':
        cleanpage()
        name = get_src()
        if name:
            download.download_photos_from_file('src.txt', name)
            cleanAllFile.cleanAllFile('src.txt')
    else:
        cleanpage()
        sys.exit()
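
To make the page-URL construction in get_src concrete: for every page after the first, the script inserts '_<page index>' just before the file extension of the album's first-page URL. The URL below is a made-up placeholder:

# Illustration of the pagination rule used in get_src (placeholder URL)
url = 'https://www.hh12345.cc/example/12345.html'
last_dot = url.rfind('.')
for i in range(3):
    page_url = url if i == 0 else url[:last_dot] + '_' + str(i) + url[last_dot:]
    print(page_url)
# https://www.hh12345.cc/example/12345.html
# https://www.hh12345.cc/example/12345_1.html
# https://www.hh12345.cc/example/12345_2.html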
