Python爬虫程序——从官网抓取所有奥特曼图片以及对应的介绍信息-CSDN博客

本文链接：https://blog.csdn.net/weixin_60395491/article/details/136329602

爬取图片：

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Image scraper: saves one picture per hero listed on the heroes page into
# ``image_directory``, naming each file after the hero.

url = "https://tsuburaya-prod.com/heroeslist"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Directory the downloaded images are written to (created on demand).
image_directory = "ultraman_images"

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 15

# Characters that cannot appear in a file name on Windows ('/' is also the
# path separator on POSIX); each one is replaced by an underscore.
_FORBIDDEN_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def safe_filename(name):
    """Return *name* made safe to use as a single file-name component.

    Forbidden characters are replaced by ``_`` and surrounding whitespace is
    removed. Names that contain none of those characters are returned
    unchanged, so existing image/profile file pairs keep matching.
    """
    return name.translate(_FORBIDDEN_FILENAME_CHARS).strip()


def download_images():
    """Download the image of every hero on the list page.

    Articles without a name or without a ``data-src`` image are skipped.
    A failure on one image is reported and does not stop the remaining
    downloads.
    """
    # NOTE(review): verify=False turns off TLS certificate validation for
    # every request below; the warning is silenced only to keep the output
    # readable. Prefer fixing the local CA bundle if possible.
    requests.packages.urllib3.disable_warnings(
        requests.packages.urllib3.exceptions.InsecureRequestWarning
    )

    try:
        response = requests.get(url, headers=headers, verify=False,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}:", repr(e))
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    os.makedirs(image_directory, exist_ok=True)

    for article in soup.find_all('article', class_='p-herolist__item'):
        h3_element = article.find('h3', class_='p-herolist__name')
        img_element = article.find('img')

        # The file is named after the hero, so an article without a usable
        # name or without an image URL cannot be saved meaningfully.
        if h3_element is None or img_element is None:
            continue
        if 'data-src' not in img_element.attrs:
            continue
        ultraman_name = h3_element.text.strip()
        file_stem = safe_filename(ultraman_name)
        if not file_stem:
            continue

        # data-src may be relative; resolve it against the list page URL.
        ultraman_image_url = urljoin(url, img_element['data-src'])
        image_path = os.path.join(image_directory, f"{file_stem}.jpg")

        try:
            image_response = requests.get(ultraman_image_url, headers=headers,
                                          verify=False, timeout=REQUEST_TIMEOUT)
            # Do not write an HTML error page to disk as if it were a JPEG.
            image_response.raise_for_status()
            with open(image_path, 'wb') as image_file:
                image_file.write(image_response.content)
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading {ultraman_name}'s image:", repr(e))
            continue

        print(f"Downloaded {ultraman_name}'s image")


if __name__ == "__main__":
    download_images()

爬取文字介绍：

import requests
from bs4 import BeautifulSoup
import os
# Profile scraper: for every hero on the list page, fetch its detail page
# and save the name plus introduction text to ``<name>.txt``.

# Target URL (the hero list page)
url = 'https://tsuburaya-prod.com/heroeslist'

# Directory the profile text files are written to (created on demand).
profiles_directory = "ultraman_profiles"

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 15

# Characters that cannot appear in a file name on Windows ('/' is also the
# path separator on POSIX); each one is replaced by an underscore.
_FORBIDDEN_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def safe_filename(name):
    """Return *name* made safe to use as a single file-name component.

    Forbidden characters are replaced by ``_`` and surrounding whitespace is
    removed. Names that contain none of those characters are returned
    unchanged.
    """
    return name.translate(_FORBIDDEN_FILENAME_CHARS).strip()


def save_profiles():
    """Fetch every hero's detail page and write its introduction to disk.

    List items lacking a name or a detail link, and detail pages lacking the
    profile section, are reported and skipped instead of aborting the run.
    """
    # Imported here so this script stays self-contained.
    from urllib.parse import urljoin

    # Fetch the list page; a network error is reported like a bad status.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print(f'无法获取页面: {url}')
        return
    if response.status_code != 200:
        print(f'无法获取页面: {url}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    os.makedirs(profiles_directory, exist_ok=True)

    # One <article> per hero
    for item in soup.find_all('article', class_='p-herolist__item'):
        h3_element = item.find('h3', class_='p-herolist__name')
        link_element = item.find('a', class_='p-herolist__link')
        if h3_element is None or link_element is None or not link_element.get('href'):
            print('跳过缺少名称或详情页链接的条目')
            continue

        ultraman_name = h3_element.text.strip()
        # The href may be relative; resolve it against the list page URL.
        detail_link = urljoin(url, link_element['href'])

        try:
            detail_response = requests.get(detail_link, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f'无法获取奥特曼详情页: {detail_link}')
            continue
        if detail_response.status_code != 200:
            print(f'无法获取奥特曼详情页: {detail_link}')
            continue

        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
        profile_element = detail_soup.find('div', class_='p-hero__profile')
        if profile_element is None:
            print(f'未找到奥特曼介绍: {detail_link}')
            continue
        introduction = profile_element.text.strip()

        # Save the name and introduction to a text file named after the hero.
        filename = os.path.join(profiles_directory,
                                f"{safe_filename(ultraman_name)}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f'奥特曼名称: {ultraman_name}\n')
            file.write(f'奥特曼介绍: {introduction}\n')

        print(f'已保存奥特曼信息至文件: {filename}')


if __name__ == "__main__":
    save_profiles()

生成 PPT：

import os
from PIL import Image
from pptx import Presentation
from pptx.util import Inches

# Slide builder: creates one slide per hero, pairing ``<name>.jpg`` from the
# image folder with ``<name>.txt`` from the text folder.

# Default folders holding the scraped images and profile texts
image_folder = 'D:\\py\\ultraman_images'
text_folder = 'D:\\py\\ultraman_profiles'

# Index of the "Blank" layout in python-pptx's default template
# (index 5 is "Title Only", which leaves an unused title placeholder).
BLANK_LAYOUT_INDEX = 6


def find_character_files(image_dir, text_dir):
    """Return ``(name, image_path, text_path)`` for each usable character.

    A character is usable when ``<name>.jpg`` exists in *image_dir* and
    ``<name>.txt`` exists in *text_dir*. Images without a matching text file
    are reported and left out. The result is sorted by file name so the
    slide order is the same on every run.
    """
    matches = []
    for image_file in sorted(os.listdir(image_dir)):
        if not image_file.endswith('.jpg'):
            continue
        character_name = os.path.splitext(image_file)[0]
        text_path = os.path.join(text_dir, f'{character_name}.txt')
        if not os.path.isfile(text_path):
            print(f'跳过 {image_file}: 未找到对应的介绍文件 {text_path}')
            continue
        matches.append((character_name,
                        os.path.join(image_dir, image_file),
                        text_path))
    return matches


def build_presentation(image_dir=image_folder, text_dir=text_folder,
                       output_path='奥特曼介绍.pptx'):
    """Build the presentation and save it to *output_path*.

    Each slide shows the character image on the left and its introduction
    text on the right.
    """
    presentation = Presentation()
    blank_layout = presentation.slide_layouts[BLANK_LAYOUT_INDEX]

    for _name, image_path, text_path in find_character_files(image_dir, text_dir):
        with open(text_path, 'r', encoding='utf-8') as file:
            character_intro = file.read()

        slide = presentation.slides.add_slide(blank_layout)

        # Picture on the left; height follows from the fixed width.
        slide.shapes.add_picture(image_path, Inches(1), Inches(1), width=Inches(4))

        # Introduction text on the right.
        textbox = slide.shapes.add_textbox(left=Inches(5), top=Inches(1),
                                           width=Inches(4), height=Inches(4))
        text_frame = textbox.text_frame
        # Wrap long lines inside the box instead of running off the slide.
        text_frame.word_wrap = True
        # Fill the text frame's existing first paragraph rather than appending
        # a second one, which would leave an empty line at the top.
        text_frame.text = character_intro.strip()

    presentation.save(output_path)


if __name__ == "__main__":
    build_presentation()