Scraping the images:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Disable the InsecureRequestWarning that verify=False requests would otherwise raise
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

url = "https://tsuburaya-prod.com/heroeslist"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Fetch the hero list page and parse it
response = requests.get(url, headers=headers, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')

# Each hero is rendered as an <article class="p-herolist__item">
ultramen_articles = soup.find_all('article', class_='p-herolist__item')

# Create a directory to save images if it doesn't exist
image_directory = "ultraman_images"
os.makedirs(image_directory, exist_ok=True)

for ultraman_article in ultramen_articles:
    # The hero name sits in an <h3 class="p-herolist__name">
    h3_element = ultraman_article.find('h3', class_='p-herolist__name')
    if h3_element:
        ultraman_name = h3_element.text.strip()
    else:
        ultraman_name = "N/A"

    # The image URL is lazy-loaded through the data-src attribute
    img_element = ultraman_article.find('img')
    if img_element and 'data-src' in img_element.attrs:
        ultraman_image_url = urljoin(url, img_element['data-src'])
        # Download the image
        try:
            image_data = requests.get(ultraman_image_url, verify=False).content
            image_path = os.path.join(image_directory, f"{ultraman_name}.jpg")
            with open(image_path, 'wb') as image_file:
                image_file.write(image_data)
            print(f"Downloaded {ultraman_name}'s image")
        except Exception as e:
            print(f"Error downloading {ultraman_name}'s image:", repr(e))
            continue
    else:
        ultraman_image_url = "N/A"
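The scraped name is used verbatim as a file name; if it ever contains a character that Windows forbids in file names (\, /, :, *, ?, ", <, >, |), the open() call will fail. A minimal sketch of a sanitizing helper, using the hypothetical name safe_filename (not part of the original script), which could wrap ultraman_name before image_path is built:

import re

def safe_filename(name):
    # Replace characters that are not allowed in Windows file names
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or "unnamed"

# e.g. image_path = os.path.join(image_directory, f"{safe_filename(ultraman_name)}.jpg")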
Scraping the text profiles:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Target URL
url = 'https://tsuburaya-prod.com/heroeslist'

# Send a GET request for the list page
response = requests.get(url)

# Check that the page was fetched successfully
if response.status_code == 200:
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect every Ultraman list item
    ultraman_list_items = soup.find_all('article', class_='p-herolist__item')

    # Create a directory to save profiles if it doesn't exist
    profiles_directory = "ultraman_profiles"
    os.makedirs(profiles_directory, exist_ok=True)

    # Loop over each Ultraman item
    for item in ultraman_list_items:
        # Get the Ultraman name
        h3_element = item.find('h3', class_='p-herolist__name')
        ultraman_name = h3_element.text.strip()

        # Get the detail page link (urljoin resolves relative hrefs against the list URL)
        detail_link = urljoin(url, item.find('a', class_='p-herolist__link')['href'])

        # Fetch the detail page
        detail_response = requests.get(detail_link)

        # Check that the detail page was fetched successfully
        if detail_response.status_code == 200:
            # Parse the detail page HTML
            detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

            # Extract the profile text
            introduction = detail_soup.find('div', class_='p-hero__profile').text.strip()

            # Save the name and profile to a txt file
            filename = os.path.join(profiles_directory, f"{ultraman_name}.txt")
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(f'奥特曼名称: {ultraman_name}\n')
                file.write(f'奥特曼介绍: {introduction}\n')
            print(f'已保存奥特曼信息至文件: {filename}')
        else:
            print(f'无法获取奥特曼详情页: {detail_link}')
else:
    print(f'无法获取页面: {url}')
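This script opens a fresh connection for every detail page and, unlike the image script, sends no User-Agent header. A small sketch of a shared fetch helper that reuses one requests.Session and pauses between requests; the fetch name, the UA string, and the one-second delay are my own choices, not part of the original:

import time
import requests

session = requests.Session()
# Reuse one connection pool and one set of headers for every request
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

def fetch(page_url, delay=1.0):
    # Pause briefly so the site is not hammered, then GET through the shared session
    time.sleep(delay)
    return session.get(page_url)

# e.g. detail_response = fetch(detail_link)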
Generating the PPT:
import os
from pptx import Presentation
from pptx.util import Inches

# Folders produced by the two scraping scripts
image_folder = 'D:\\py\\ultraman_images'
text_folder = 'D:\\py\\ultraman_profiles'

# Create the presentation
presentation = Presentation()

# Collect every jpg file in the image folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

# Build one slide per image file
for image_file in image_files:
    # The character name is the file name without its extension
    character_name = os.path.splitext(image_file)[0]

    # Build the image and text file paths
    image_path = os.path.join(image_folder, image_file)
    text_path = os.path.join(text_folder, f'{character_name}.txt')

    # Skip characters whose profile was never saved
    if not os.path.exists(text_path):
        continue

    # Read the profile text
    with open(text_path, 'r', encoding='utf-8') as file:
        character_intro = file.read()

    # Add a slide using the "Title Only" layout (slide_layouts[6] is the fully blank one)
    slide = presentation.slides.add_slide(presentation.slide_layouts[5])

    # Add the character image
    left = Inches(1)
    top = Inches(1)
    slide.shapes.add_picture(image_path, left, top, width=Inches(4))

    # Add the profile text in a box to the right of the image
    textbox = slide.shapes.add_textbox(left=Inches(5), top=Inches(1), width=Inches(4), height=Inches(4))
    text_frame = textbox.text_frame
    text_frame.word_wrap = True  # wrap long profiles inside the box
    p = text_frame.add_paragraph()
    p.text = character_intro

# Save the presentation
presentation.save('奥特曼介绍.pptx')
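Because layout 5 ("Title Only") ships with a title placeholder, each slide can also carry the character's name as its title. A minimal addition, meant to go inside the loop right after add_slide; it is my own touch, not part of the original script:

# Label the slide with the character name via the layout's title placeholder
slide.shapes.title.text = character_name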