Batch-fetching and downloading images from https://www.vilipix.com
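The scripts below need only requests and beautifulsoup4 (pip install requests beautifulsoup4). One caveat before starting: some sites reject clients that do not look like a browser, so if requests.get comes back with an error page, setting a User-Agent header usually helps. A minimal sketch of that workaround (the header value is just an example; none of the code below sets it):

import requests

# Hypothetical header: some sites block clients that do not look like a browser
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://www.vilipix.com/', headers=headers)
print(response.status_code)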
Fetching the site's data
Getting all the <a> tags
import requests
from bs4 import BeautifulSoup

def getImg(url, afile):
    # Request the page, e.g. url = 'https://www.vilipix.com/'
    response = requests.get(url)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect every <a> tag inside the <body>
    body = soup.find('body')
    a_tags = body.find_all('a')
    count = 0
    # Append each <a> tag to the output file, one per line
    with open(afile, 'a', encoding='utf-8') as f:
        for a in a_tags:
            f.write(str(a))
            f.write('\n')
            count += 1
    print(f"Wrote {count} <a> tags")
Data processing
Extracting the ids from the file
import re

def getImgId(input_url, output_url):
    count = 0
    # Read the saved <a> tags and pull each illust id out of href="/illust/<id>"
    with open(input_url, 'r', encoding='utf-8') as input_file, \
         open(output_url, 'a', encoding='utf-8') as output_file:
        for line in input_file:
            data = line.strip()  # drop the newline and surrounding whitespace
            result = re.search(r'href="/illust/(.*?)"', data)
            if result:
                illust_id = result.group(1)
                output_file.write(illust_id + '\n')  # write the extracted id
            else:
                count += 1
    print(f"{count} lines contained no /illust link")
Deduplicating the ids
def delRepeat(file_path, out_file_path):
    # A set keeps only unique entries, so duplicates disappear automatically
    content_set = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            content_set.add(line.strip())  # strip the newline before comparing
    # Write the deduplicated ids to the output file
    with open(out_file_path, 'a', encoding='utf-8') as output_file:
        for content in content_set:
            output_file.write(content + '\n')
    print("Duplicates removed; unique ids written to file")
Joining the ids into illust page links
def linkUrl(input_path, output_path):
    # Turn each bare id into a full illust page URL
    with open(input_path, 'r', encoding='utf-8') as file1, \
         open(output_path, 'w', encoding='utf-8') as file2:
        for line in file1:
            illust_id = line.strip()  # strip the newline
            file2.write("https://www.vilipix.com/illust/" + illust_id + '\n')
    print("Built the illust page links")
Reading the illust page links and fetching their data
import requests
import getimg

def getSrc(readurl):
    # Read the list of illust page URLs
    with open(readurl, 'r', encoding='utf-8') as file:
        urls = file.readlines()
    # Fetch each page and hand the response to getimg.gethtml_img
    for url in urls:
        response = requests.get(url.strip())  # strip stray whitespace from the URL
        getimg.gethtml_img(response)
    print('Collected the <img> tags')
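Fetching many pages in a tight loop can get a client rate-limited or blocked. A hedged variant that pauses between requests (the one-second default is arbitrary):

import time
import requests
import getimg

def getSrc_polite(readurl, delay=1.0):
    # Same flow as getSrc above, but with a pause between requests
    with open(readurl, 'r', encoding='utf-8') as file:
        for url in file:
            response = requests.get(url.strip())
            getimg.gethtml_img(response)
            time.sleep(delay)  # pause so requests are not back-to-back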
Processing the fetched pages to extract the <img> tags
from bs4 import BeautifulSoup

def gethtml_img(htmldata):
    # htmldata is a requests.Response; parse its HTML body
    soup = BeautifulSoup(htmldata.text, 'html.parser')
    # Find every <img> tag on the page
    img_tags = soup.find_all('img')
    # Keep only pages with at least two <img> tags, then drop the
    # first and last one (page chrome rather than artwork)
    if len(img_tags) >= 2:
        result = ""
        for img_tag in img_tags[1:-1]:
            result += str(img_tag) + "\n"  # keep the whole tag so alt/src can be parsed later
        with open('img.txt', 'a', encoding='utf-8') as file:
            file.write(result)
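Dropping the first and last <img> assumes they are page chrome such as a logo or avatar, which breaks if the layout changes. A more robust sketch filters on the src attribute instead (the extension list is an assumption about how the artwork is served, not confirmed site behavior):

def filter_artwork_imgs(img_tags):
    # Hypothetical filter: keep only tags whose src looks like an image file
    return [t for t in img_tags
            if str(t.get('src', '')).lower().endswith(('.jpg', '.jpeg', '.png'))]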
Converting the <img> tags into key-value pairs
from bs4 import BeautifulSoup

def extract_and_save(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    # Parse the saved <img> tags
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img')
    result = ""
    for img_tag in img_tags:
        alt = img_tag.get('alt', 'No Alt Text')  # fall back when alt is missing
        src = img_tag.get('src', 'No Src')       # fall back when src is missing
        # Join name and link with the separator '^/*\^ ' (a raw string avoids the \^ escape warning)
        result += alt + r'^/*\^ ' + src + '\n'
    with open('src.txt', 'w', encoding='utf-8') as output_file:
        output_file.write(result)
    print('img tag info converted')

# Pass in the path of the file holding the saved <img> tags
# extract_and_save('img.txt')
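The separator '^/*\^ ' was presumably chosen because it is unlikely to appear in a title or URL; the download step below splits on the same string. A quick round-trip check (the title and URL are made up):

line = 'some title' + r'^/*\^ ' + 'https://example.com/a.jpg\n'
name, url = line.split(r'^/*\^ ')
print(name)          # -> some title
print(url.strip())   # -> https://example.com/a.jpg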
Downloading the images
import os
import re
import requests

def download(file_path, save_folder):
    print('Downloading...')
    count = 0
    # Create the destination folder if it does not exist
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    def clean_filename(name):
        # Replace characters that Windows filenames do not allow
        return re.sub(r'[<>:"/\\|?*]', ' ', name).strip()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the separator written by extract_and_save
            name, url = line.split(r'^/*\^ ')
            name = name.strip()
            url = url.strip()  # drop the trailing newline
            response = requests.get(url)
            if response.status_code == 200:
                image_name = f'{clean_filename(name)}.jpg'
                image_path = os.path.join(save_folder, image_name)
                # Append a numeric suffix when the name is already taken
                counter = 1
                while os.path.exists(image_path):
                    new_name = f'{clean_filename(name)}_{counter}.jpg'
                    image_path = os.path.join(save_folder, new_name)
                    counter += 1
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                count += 1
            else:
                print(f'Failed to download the image for {name}')
    print(f'All downloads finished: {count} images')

# download('src.txt', r'D:\二次元照片\长腿')
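Two possible refinements: response.content loads the whole image into memory, and some image hosts refuse requests without a Referer header. If downloads fail or files are large, a hedged variant (the header value is a guess, not confirmed behavior of this site) streams the body to disk in chunks:

import requests

def download_one(url, image_path):
    # Hypothetical headers: some image CDNs check Referer before serving files
    headers = {'Referer': 'https://www.vilipix.com/'}
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(image_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)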
Clearing the working files
# Working files produced by the pipeline
file_list = ['atag.txt', 'delrepeat.txt', 'img.txt', 'imgid.txt', 'src.txt', 'url.txt']

def cleanAllFile():
    for file_name in file_list:
        # Opening in 'w' mode already truncates the file to zero length
        with open(file_name, 'w', encoding='utf-8'):
            pass
    print('All working files cleared')

# Guard the call so importing this module does not wipe the files
if __name__ == '__main__':
    cleanAllFile()
Main program
import getimg_atag
import delRepeat
import getimgId
import linkUrl
import getSrc
import download
import img_transform
import cleanAllFile
from urllib.parse import quote

if __name__ == '__main__':
    try:
        while True:
            print("Enter the save location (required)")
            savefilelocal = input()
            if savefilelocal != '':
                break
        print('Enter a keyword (optional)')
        title = input()
        if title:
            # Search page for the keyword (percent-encode non-ASCII characters)
            url = 'https://www.vilipix.com/tags/' + quote(title) + '/illusts'
        else:
            print('1. New works  2. Rankings  3. Features (optional)')
            choose = input()
            if choose == '1':  # input() returns a string, so compare against strings
                url = 'https://www.vilipix.com/new'
            elif choose == '2':
                url = 'https://www.vilipix.com/ranking'
            elif choose == '3':
                url = 'https://www.vilipix.com/p'
            else:
                url = 'https://www.vilipix.com'
        # File holding the scraped <a> tags
        afile = "atag.txt"
        # File holding the deduplicated ids
        atag_delrepeat = "delrepeat.txt"
        # File holding every extracted id
        imgid = "imgid.txt"
        # File holding the ids joined into page links
        imgurl = "url.txt"
        # Download destination
        downlocal = savefilelocal
        # File holding the collected <img> tags
        imglocal = 'img.txt'
        # File holding the "name^/*\^ link" pairs
        namePic = 'src.txt'
        # 1. Scrape all <a> tags
        getimg_atag.getImg(url, afile)
        # 2. Extract the illust ids
        getimgId.getImgId(afile, imgid)
        # 3. Deduplicate
        delRepeat.delRepeat(imgid, atag_delrepeat)
        # 4. Join the ids into page links
        linkUrl.linkUrl(atag_delrepeat, imgurl)
        # 5. Fetch each page and collect the <img> tags
        getSrc.getSrc(imgurl)
        # 6. Convert the <img> tags into name/link pairs
        img_transform.extract_and_save(imglocal)
        # 7. Download the images
        download.download(namePic, downlocal)
        # 8. Clear the working files if nothing went wrong
        cleanAllFile.cleanAllFile()
    except Exception as e:
        print(f"Exception occurred: {e}")