"""
Usage:
- Make sure Scrapy and its dependencies are installed.
- Set the target URL, the crawl depth, and the directory where pages are
  saved, then run the script.
- Follow the target site's terms of service and applicable laws, and make
  sure your crawling stays legal and compliant.
"""
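# A minimal way to install the dependency and launch the spider (the file
# name `myspider.py` is an assumption; adjust it to match your script):
#
#   pip install scrapy
#   python myspider.py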
import os
import time
from urllib.parse import urlparse

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['url']  # target URL to crawl
    save_dir = 'path'     # directory where downloaded pages are saved

    custom_settings = {
        'DEPTH_LIMIT': 2,  # maximum crawl depth
        'HTTPCACHE_ENABLED': True,
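        # Optional settings worth considering (suggestions, not part of the
        # original script): 'ROBOTSTXT_OBEY': True honors robots.txt, and
        # 'DOWNLOAD_DELAY': 1 throttles requests to stay polite.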
    }

    def parse(self, response):
        url = response.url
        if url == self.start_urls[0]:
            filename = 'index.html'
            page_save_path = self.save_dir
        else:
            path = urlparse(url).path
            page_directory = path.strip('/').replace('/', os.sep)
            # Pages without a file extension are saved as .html, matching
            # the mapping used in calculate_local_path below.
            if page_directory and not os.path.splitext(page_directory)[1]:
                page_directory += '.html'
            page_save_path = os.path.join(self.save_dir, os.path.dirname(page_directory))
            filename = os.path.basename(page_directory)
            if not filename:
                filename = 'index.html'
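        # Illustrative mapping (example.com is a hypothetical host):
        #   https://example.com/docs/intro     -> <save_dir>/docs/intro.html
        #   https://example.com/docs/news.html -> <save_dir>/docs/news.html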
        # Grab the raw HTML of the page.
        html_content = response.text
        # Rewrite resource links (images, stylesheets, scripts) to local
        # paths and queue a download for any resource not yet on disk.
        # Note: str.replace() is a naive rewrite and can touch unrelated
        # occurrences of the same substring.
        resource_urls = response.css('img::attr(src), link::attr(href), script::attr(src)').getall()
        for resource_url in resource_urls:
            absolute_url = response.urljoin(resource_url)
            local_path = self.calculate_local_path(absolute_url)
            html_content = html_content.replace(resource_url, local_path)
            local_full_path = os.path.join(self.save_dir, local_path)
            if not os.path.exists(local_full_path):
                yield scrapy.Request(absolute_url, callback=self.save_resource,
                                     meta={'filename': local_full_path})
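        # Note: Scrapy's built-in duplicate filter also drops repeated
        # requests for the same URL within a single run, so the
        # os.path.exists check mainly avoids re-downloads across runs.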
        # Rewrite in-page navigation links to their local equivalents.
        # Caveat: the rewritten paths are relative to save_dir, so links on
        # pages saved in subdirectories may not resolve when browsed offline.
        for link in response.css('a::attr(href)').getall():
            absolute_link = response.urljoin(link)
            local_link_path = self.calculate_local_path(absolute_link)
            html_content = html_content.replace(link, local_link_path)
        # Make sure the target directory exists.
        os.makedirs(page_save_path, exist_ok=True)
        full_path = os.path.join(page_save_path, filename)
        # If a file with this name already exists, add a timestamp suffix
        # instead of overwriting it.
        if os.path.exists(full_path):
            base_name, ext = os.path.splitext(filename)
            timestamp = time.strftime("%Y%m%d%H%M%S")
            new_filename = f"{base_name}_{timestamp}{ext}"
            full_path = os.path.join(page_save_path, new_filename)
        # Save the page. response.text is already a complete HTML document,
        # so it is written as-is rather than wrapped in extra markup.
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        # Follow links to the next level of pages; response.follow resolves
        # relative URLs against the page URL, and DEPTH_LIMIT bounds the crawl.
        for next_page in response.css('a::attr(href)').getall():
            yield response.follow(next_page, callback=self.parse)
    def save_resource(self, response):
        # Write a downloaded resource (image, stylesheet, script, ...) to
        # the local path computed when the request was scheduled.
        filename = response.meta['filename']
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as f:
            f.write(response.body)
    def calculate_local_path(self, url):
        # Map an absolute URL to a file path relative to save_dir.
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if not path:  # root URL: fall back to an index page
            path = 'index'
        if not os.path.splitext(path)[1]:  # no file extension: save as HTML
            path += '.html'
        local_path = path.replace('/', os.sep)
        return local_path
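
# A quick sanity check of the mapping (example.com is a hypothetical host;
# paths shown with POSIX separators):
#
#   calculate_local_path('https://example.com/css/site.css') -> 'css/site.css'
#   calculate_local_path('https://example.com/about')        -> 'about.html'
#   calculate_local_path('https://example.com/')              -> 'index.html'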
# Run the spider.
if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()
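
# Sketch of an alternative launch that injects settings at runtime (the
# values are illustrative assumptions, not requirements):
#
#   process = CrawlerProcess(settings={'USER_AGENT': 'my-archiver/0.1'})
#   process.crawl(MySpider)
#   process.start()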