Python Crawler in Practice: Crawling All Articles of a Blog

import re
import requests
from bs4 import BeautifulSoup
class UrlManage:
    """URL manager: tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        # set of URLs waiting to be crawled
        self.new_urls = set()
        # set of URLs that have already been crawled
        self.old_urls = set()

    def get_url(self):
        """Take one URL out of the manager for crawling."""
        if self.has_new_url():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        return None

    def add_new_url(self, url):
        """Add a single URL, ignoring empty and already-seen ones."""
        if url is None or len(url) == 0:
            return
        if url in self.old_urls or url in self.new_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, *urls):
        """Add several URLs at once."""
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while there are still URLs waiting to be crawled."""
        return len(self.new_urls) > 0
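
# A quick illustration of how UrlManage behaves (my own example, not part of
# the original crawler; the example.com URLs are made up). Duplicates and
# already-crawled URLs are silently dropped, which is what keeps the crawl
# loop below from revisiting pages:
#
#     demo = UrlManage()
#     demo.add_new_urls('http://example.com/1', 'http://example.com/1')
#     demo.get_url()      # -> 'http://example.com/1', now moved into old_urls
#     demo.add_new_url('http://example.com/1')
#     demo.has_new_url()  # -> False: the URL is in old_urls, so it was rejected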
root_url = 'http://www.crazyant.net'  # root URL of the blog to crawl

urls_management = UrlManage()  # URL manager
urls_management.add_new_url(root_url)

file = open('crazy_all_pages.txt', 'w', encoding='utf-8')
while urls_management.has_new_url():
    curr_url = urls_management.get_url()  # next URL to crawl
    try:
        res = requests.get(curr_url, timeout=3)  # timeout: give up after 3 seconds
    except requests.RequestException as e:
        print('error: request failed for', curr_url, e)
        continue
    if res.status_code != 200:
        print('error: status code is not 200 for', curr_url)
        continue

    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.title.string  # page title, used as the article title
    file.write('%s\t%s\n' % (curr_url, title))
    file.flush()
    print('%s\t%s\t%d' % (curr_url, title, len(urls_management.old_urls)))

    # parse the links on this page and feed them back into the URL manager
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href is None:
            continue
        # only follow article pages of the form http://www.crazyant.net/1234.html
        pattern = r'^http://www\.crazyant\.net/\d+\.html$'
        if re.match(pattern, href):
            urls_management.add_new_url(href)

# close the output file
file.close()
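
The regex above only accepts absolute URLs, so any article link written as a relative href (for example /1234.html) would be silently skipped. Below is a minimal sketch of how the link-extraction step could normalize hrefs first, using urljoin from the standard library; the helper name normalize_links is invented here for illustration and is not part of the original script:

from urllib.parse import urljoin

def normalize_links(page_url, soup):
    """Yield an absolute href for every <a> tag on the page."""
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative
        # ones such as '/1234.html' against the page they appeared on
        yield urljoin(page_url, href)

With this helper, the crawl loop could iterate over normalize_links(curr_url, soup) instead of walking soup.find_all('a') directly, then apply the same regex check as before.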