Python spider: crawl a web page and its subpages and save them as html/txt (simple version for personal use)

Kept here for my own reference~

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def save_as_html(filename, html_content):
    # write the raw HTML of a page to a file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html_content)


def save_as_txt(filename, text_content):
    # write the extracted plain text of a page to a file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text_content)


class Spider:
    def __init__(self, website):
        self.cnt = 0
        self.website = website  # expected to look like "https://example.com/"
        self.visited = set()

    def scrap_web(self):
        # output directory: the site name with "https://" and the trailing "/" stripped
        dir_name = self.website[8:-1]
        try:
            os.makedirs(dir_name, exist_ok=True)
        except Exception as e:
            print(e)

        self.fetch_and_parse(self.website, self.website)
        return dir_name

    def fetch_and_parse(self, url, base_url):
        # stop after a fixed number of pages so the crawl stays small
        if self.cnt > 10:
            return

        if url in self.visited:
            return

        self.visited.add(url)
        self.cnt += 1

        # send HTTP request (the timeout keeps a slow server from hanging the crawl)
        headers = {'User-Agent': 'My Custom User-Agent'}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"Failed to fetch {url}, status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        text_only = soup.get_text(separator=" ", strip=True)

        # build a file name under the output directory: strip "https://", replace
        # characters that are not allowed in file names, turn dots into underscores,
        # then save both the raw HTML and the extracted text
        safe_name = re.sub(r'[\\/:*<>|?"]', '_', url[8:].rstrip('/')).replace('.', '_')
        file_name = base_url[8:-1] + '/' + safe_name
        save_as_html(file_name + '.html', response.text)
        save_as_txt(file_name + '.txt', text_only)

        print(f"Fetching {url}")

        # find all <a> links on the page and follow the ones on the same site
        for link in soup.find_all('a', href=True):
            href = link['href']

            if not href or href.startswith('#') or "?share=" in href:
                continue

            # resolve relative links against the base URL first
            if not href.startswith('http'):
                href = urljoin(base_url, href)

            # only follow links that stay under the starting site
            if not href.startswith(base_url):
                continue

            self.fetch_and_parse(href, base_url)
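
A minimal way to run it, sketched under the assumption that the start URL uses "https://" and ends with a "/" (the [8:-1] slicing above depends on that); "https://example.com/" is only a placeholder:

if __name__ == '__main__':
    # placeholder start URL; replace it with the site you actually want to crawl
    spider = Spider('https://example.com/')
    out_dir = spider.scrap_web()
    print(f"Pages saved under: {out_dir}")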