Python Crawler in Practice: Crawling All Articles of a Blog

import re
import requests
from bs4 import BeautifulSoup
class UrlManage:
    """URL manager: tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        # set of URLs waiting to be crawled
        self.new_urls = set()
        # set of URLs that have already been crawled
        self.old_urls = set()

    def get_url(self):
        """Take one URL out of the manager for crawling."""
        if self.has_new_url():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        return None

    def add_new_url(self, url):
        """Add a single URL, ignoring empty and already-seen ones."""
        if url is None or len(url) == 0:
            return
        if url in self.old_urls or url in self.new_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, *urls):
        """Add several URLs at once."""
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while there are still URLs waiting to be crawled."""
        return len(self.new_urls) > 0
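
# A quick illustration of how UrlManage behaves (my own example, not part of
# the original crawler; the example.com URLs are made up). Duplicates and
# already-crawled URLs are silently dropped, which is what keeps the crawl
# loop below from revisiting pages:
#
#     demo = UrlManage()
#     demo.add_new_urls('http://example.com/1', 'http://example.com/1')
#     demo.get_url()      # -> 'http://example.com/1', now moved into old_urls
#     demo.add_new_url('http://example.com/1')
#     demo.has_new_url()  # -> False: the URL is in old_urls, so it was rejected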
root_url = 'http://www.crazyant.net'  # root URL of the blog to crawl

urls_management = UrlManage()  # URL manager
urls_management.add_new_url(root_url)

file = open('crazy_all_pages.txt', 'w', encoding='utf-8')
while urls_management.has_new_url():
    curr_url = urls_management.get_url()  # next URL to crawl
    try:
        res = requests.get(curr_url, timeout=3)  # timeout: give up after 3 seconds
    except requests.RequestException as e:
        print('error: request failed for', curr_url, e)
        continue
    if res.status_code != 200:
        print('error: status code is not 200 for', curr_url)
        continue

    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.title.string  # page title, used as the article title
    file.write('%s\t%s\n' % (curr_url, title))
    file.flush()
    print('%s\t%s\t%d' % (curr_url, title, len(urls_management.old_urls)))

    # parse the links on this page and feed them back into the URL manager
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href is None:
            continue
        # only follow article pages of the form http://www.crazyant.net/1234.html
        pattern = r'^http://www\.crazyant\.net/\d+\.html$'
        if re.match(pattern, href):
            urls_management.add_new_url(href)

# close the output file
file.close()
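
The regex above only accepts absolute URLs, so any article link written as a relative href (for example /1234.html) would be silently skipped. Below is a minimal sketch of how the link-extraction step could normalize hrefs first, using urljoin from the standard library; the helper name normalize_links is invented here for illustration and is not part of the original script:

from urllib.parse import urljoin

def normalize_links(page_url, soup):
    """Yield an absolute href for every <a> tag on the page."""
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative
        # ones such as '/1234.html' against the page they appeared on
        yield urljoin(page_url, href)

With this helper, the crawl loop could iterate over normalize_links(curr_url, soup) instead of walking soup.find_all('a') directly, then apply the same regex check as before.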