计算机类专业实验报告
姓名: 学号: 班级:
实验题目:网络内容的抓取
- 实验目的及意义
从网站 https://xkcd.com 下载所有的漫画,并保存到本地磁盘上。
- 实验相关知识点及要求
- requests模块的使用
- beautifulsoup模块的使用
- 程序需要分别使用迭代和递归两种不同的方式编写。
- 实验步骤
- 实验各步骤具体内容
实验代码:
import os
import requests
from bs4 import BeautifulSoup
# Output directory for the iterative run; created up front so that the
# image writes below cannot fail on a missing folder.
os.makedirs('wmy1', exist_ok=True)

# Iterative method: visit the comic pages by number, 1 through 2000.
for i in range(1, 2001):
    # Build the page URL for comic number i.
    url = f'https://xkcd.com/{i}/'
    try:
        # Fetch the comic page. The timeout keeps one stalled connection
        # from blocking the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the HTML and look for the image inside the #comic element.
        soup = BeautifulSoup(response.text, 'html.parser')
        comic_element = soup.select('#comic img')
        if not comic_element:
            print(f'在该地址查找漫画图片时发生错误: {url}')
            continue

        # NOTE(review): prefixing 'https:' assumes the src attribute is
        # protocol-relative ('//host/path') - confirm for every comic.
        comic_url = 'https:' + comic_element[0]['src']
        comic_response = requests.get(comic_url, timeout=30)
        comic_response.raise_for_status()
    except requests.RequestException as err:
        # A page or image that answers with an HTTP error status (for
        # example a comic number that does not exist), or a network
        # failure, must not abort the remaining downloads: report it and
        # move on to the next number.
        print(f'Skipping {url}: {err}')
        continue

    # Name the local file after the last path component of the image URL.
    filename = os.path.join('wmy1', os.path.basename(comic_url))
    # Write the raw image bytes to disk.
    with open(filename, 'wb') as file:
        file.write(comic_response.content)
    print(f'Saved comic: {filename}')
"""
# Recursive method.
def download_comic(url):
    # Download the comic image found on the page at `url`, then recurse into
    # the page reached through its rel="next" link until there is no further
    # page to visit.
    #
    # url: address of an xkcd comic page, e.g. 'https://xkcd.com/1/'.
    # Raises requests.HTTPError (via raise_for_status) when the page or the
    # image answers with an error status.
    #
    # Plain comments are used instead of a docstring because this function
    # currently sits inside a triple-quoted string; a docstring delimiter
    # would terminate that string early.
    #
    # NOTE(review): every page adds one stack frame, so Python's recursion
    # limit (1000 by default) bounds how many comics a single call chain can
    # cover - raise it with sys.setrecursionlimit if more are needed.

    # Images are written to 'wmy', but the only directory the script creates
    # at file level is 'wmy1'; make sure the target exists before open().
    os.makedirs('wmy', exist_ok=True)

    # Fetch the page; the timeout keeps a stalled connection from hanging.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Parse the HTML page.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the comic image tag, download it and save it.
    comic_element = soup.select('#comic img')
    if comic_element:
        # NOTE(review): assumes src is protocol-relative ('//host/path').
        comic_url = 'https:' + comic_element[0]['src']
        comic_response = requests.get(comic_url, timeout=30)
        comic_response.raise_for_status()
        # Name the local file after the last path component of the image URL.
        filename = os.path.join('wmy', os.path.basename(comic_url))
        # Write the raw image bytes to disk.
        with open(filename, 'wb') as file:
            file.write(comic_response.content)
        print(f'Saved comic: {filename}')
    else:
        print(f'Comic image not found on the page: {url}')

    # Follow the "next" link, if any, to continue with the following comic.
    next_link = soup.select('a[rel="next"]')
    if not next_link:
        return
    href = next_link[0].get('href', '')
    # A missing, empty or fragment-only href ('#') does not lead to a new
    # page; appending it to the site root would never advance, so the
    # recursion would not terminate. Treat it as the end of the chain.
    if not href or href.startswith('#'):
        return
    download_comic('https://xkcd.com' + href)
"""
# 调用递归函数开始下载漫画
#download_comic('https://xkcd.com/1/')
- 实验结论
递归方法:
迭代方法:
两种方法的下载结果一致,通过本次实验学到了迭代和递归两种实现方法。