For my own reference~ A minimal recursive spider that fetches every page under a site and saves the page text to files.
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def save_as_html(filename, html_content):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html_content)

def save_as_txt(filename, text_content):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text_content)

class Spider:
    def __init__(self, website):
        self.cnt = 0
        self.website = website
        self.visited = set()

    def scrap_web(self):
        # Output directory: the site URL with the "https://" prefix and trailing "/" stripped
        try:
            os.makedirs(self.website[8:-1], exist_ok=True)
        except Exception as e:
            print(e)
        self.fetch_and_parse(self.website, self.website)
        return self.website[8:-1]

    def fetch_and_parse(self, url, base_url):
        # Limit the total number of pages fetched
        if self.cnt > 10:
            return
        if url in self.visited:
            return
        self.visited.add(url)
        self.cnt += 1
        # Send HTTP request
        headers = {'User-Agent': 'My Custom User-Agent'}
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return
        if response.status_code != 200:
            print(f"Failed to fetch {url}, status code: {response.status_code}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        text_only = soup.get_text(separator=" ", strip=True)
        # Save the extracted text to a file named after the URL,
        # replacing characters that are illegal in file names
        file_name = base_url[8:-1] + '/' + re.sub(r'[\\/:*<>|?"]', '_', url[8:-1]).replace('.', '_')
        file_name_txt = file_name + '.txt'
        save_as_txt(file_name_txt, text_only)
        print(f"Fetching {url}")
        # Find all <a> tags and recurse into links that stay on the same site
        for link in soup.find_all('a', href=True):
            href = link['href']
            if not href or href.startswith('#') or "?share=" in href:
                continue
            # Resolve relative links before checking that they stay under base_url
            if not href.startswith('http'):
                href = urljoin(base_url, href)
            if not href.startswith(base_url):
                continue
            self.fetch_and_parse(href, base_url)
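
A minimal usage sketch. It assumes the target site is served over https and the URL ends with a trailing slash, since the [8:-1] slicing above depends on both; the example.com URL is just a placeholder.

if __name__ == '__main__':
    # Hypothetical target URL; any "https://.../" address fits the slicing convention above
    spider = Spider('https://example.com/')
    out_dir = spider.scrap_web()
    print(f"Saved page text under ./{out_dir}/")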