Crawling a Website with Scrapy

How to use:

  1. Make sure Scrapy and its related dependencies are installed.
  2. Adjust the target URL, the crawl depth, and the directory where crawled files are saved, then run the program (a small configuration sketch follows this list).
  3. Follow the target site's terms of use and the applicable laws and regulations so that your crawling stays legal and compliant.
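A minimal pre-flight sketch covering steps 1 and 2: it only confirms that Scrapy is importable and lists, as comments, the three values in MySpider that are normally edited before a run. The URL and directory shown are placeholder assumptions, not values taken from the script. The complete spider script follows.

import scrapy  # fails with ImportError here if Scrapy is not installed (step 1)

print(scrapy.__version__)  # print the installed Scrapy version

# Placeholders for the three values to edit in MySpider below (step 2):
#   start_urls = ['https://example.com/']   # target URL (placeholder)
#   save_dir   = '/tmp/site_mirror'         # local output directory (placeholder)
#   'DEPTH_LIMIT': 2                        # crawl depth inside custom_settings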
import scrapy
from urllib.parse import urlparse, urljoin
import os
from scrapy.crawler import CrawlerProcess
import time

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['url']  # target URL to crawl
    save_dir = 'path'  # directory where crawled pages are saved
    custom_settings = {
        'DEPTH_LIMIT': 2,  # maximum crawl depth
        'HTTPCACHE_ENABLED': True,
    }

    def parse(self, response):
        url = response.url
        base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        if url == self.start_urls[0]:
            filename = 'index.html'
            page_save_path = self.save_dir
        else:
            path = urlparse(url).path
            page_directory = path.strip('/').replace('/', os.sep)
            # Mirror calculate_local_path: extension-less paths get '.html'
            # so that saved pages match the rewritten links
            if page_directory and not os.path.splitext(page_directory)[1]:
                page_directory += '.html'
            page_save_path = os.path.join(self.save_dir, os.path.dirname(page_directory))
            filename = os.path.basename(page_directory)
            if not filename:
                filename = 'index.html'

        # Grab the raw HTML of the page
        html_content = response.text

        # Extract resource links and rewrite them to local paths
        resource_urls = response.css('img::attr(src), link::attr(href), script::attr(src)').extract()
        for resource_url in resource_urls:
            absolute_url = urljoin(base_url, resource_url)
            local_path = self.calculate_local_path(absolute_url)
            html_content = html_content.replace(resource_url, local_path)
            local_full_path = os.path.join(self.save_dir, local_path)
            if not os.path.exists(local_full_path):
                yield scrapy.Request(absolute_url, callback=self.save_resource, meta={'filename': local_full_path})

        # Rewrite in-page navigation links to local paths
        for link in response.css('a::attr(href)').extract():
            absolute_link = urljoin(base_url, link)
            local_link_path = self.calculate_local_path(absolute_link)
            html_content = html_content.replace(link, local_link_path)

        # Make sure the target directory exists
        os.makedirs(page_save_path, exist_ok=True)
        full_path = os.path.join(page_save_path, filename)

        # If the file already exists, append a timestamp suffix to avoid overwriting it
        if os.path.exists(full_path):
            base_name, ext = os.path.splitext(filename)
            timestamp = time.strftime("%Y%m%d%H%M%S")
            new_filename = f"{base_name}_{timestamp}{ext}"
            full_path = os.path.join(page_save_path, new_filename)

        # Save the HTML file (response.text is already a complete document,
        # so write it as-is instead of wrapping it in another <html> shell)
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Extract and follow links to the next level of pages
        for next_page in response.css('a::attr(href)'):
            next_page_url = urljoin(base_url, next_page.extract())
            yield response.follow(next_page_url, self.parse)

    def save_resource(self, response):
        filename = response.meta['filename']
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as f:
            f.write(response.body)

    def calculate_local_path(self, url):
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if not os.path.splitext(path)[1]:  # splitext()[1] is the file extension; empty means none
            path += '.html'
        local_path = path.replace('/', os.sep)
        return local_path

# Run the spider
if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()
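To keep the run in line with step 3 above (respecting the site's rules), CrawlerProcess can also be given a settings dict. The variant below is only a sketch of a more polite runner; the setting values are assumptions to be tuned per site, not part of the original script.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'ROBOTSTXT_OBEY': True,               # honour the site's robots.txt
    'DOWNLOAD_DELAY': 1.0,                # pause between requests (seconds)
    'CONCURRENT_REQUESTS_PER_DOMAIN': 4,  # limit parallel requests per domain
})
process.crawl(MySpider)
process.start()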
