Python Crawler in Practice: Crawling All Articles from a Blog

import re
import requests
from bs4 import BeautifulSoup


class UrlManage:
    """URL manager: tracks which pages are pending and which have already been crawled."""
    def __init__(self):
        # URLs waiting to be crawled
        self.new_urls = set()
        # URLs that have already been crawled
        self.old_urls = set()

    def get_url(self):
        """Pop one URL from the manager for crawling and mark it as visited."""
        if self.has_new_url():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        return None

    def add_new_url(self, url):
        """Add a single URL, skipping empty values and anything already seen."""
        if not url:
            return
        if url in self.old_urls or url in self.new_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, *urls):
        """Add several URLs at once."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True if there are still URLs waiting to be crawled."""
        return len(self.new_urls) > 0
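
A quick way to see how the URL manager behaves (a minimal sketch; the article URL below is made up for illustration):

manager = UrlManage()
manager.add_new_url('http://www.crazyant.net/123.html')
manager.add_new_url('http://www.crazyant.net/123.html')  # duplicate, silently ignored
print(manager.has_new_url())  # True
print(manager.get_url())      # http://www.crazyant.net/123.html, now recorded in old_urls
print(manager.has_new_url())  # False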


root_url = 'http://www.crazyant.net'  # root URL of the blog to crawl
urls_management = UrlManage()  # URL manager
urls_management.add_new_url(root_url)

file = open('crazy_all_pages.txt', 'w', encoding='utf-8')  # titles may contain non-ASCII characters
while urls_management.has_new_url():
    curr_url = urls_management.get_url()  # next URL to crawl
    res = requests.get(curr_url, timeout=3)  # timeout: give up after 3 seconds
    if res.status_code != 200:
        print('error, return status_code is not 200', curr_url)
        continue

    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.title.string  # use the page <title> as the article title
    file.write("%s\t%s\n" % (curr_url, title))
    file.flush()
    print("%s\t%s\t%d\n" % (curr_url, title, len(urls_management.old_urls)))

    # Parse the links on the page and feed article URLs back into the URL manager
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href is None:
            continue
        # Only follow article permalinks of the form /<post id>.html (dots escaped so '.' is literal)
        pattern = r'^http://www\.crazyant\.net/\d+\.html$'
        if re.match(pattern, href):
            urls_management.add_new_url(href)

# close the output file
file.close()
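
Note that requests.get raises an exception (for example on a timeout or connection error) instead of returning a status code when the request itself fails, which would abort the whole crawl. A minimal sketch of a more defensive fetch, assuming you simply want to skip pages that fail (the fetch helper is not part of the original script):

def fetch(url, timeout=3):
    """Fetch a page and return its HTML text, or None if the request fails."""
    try:
        res = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print('request failed:', url, e)
        return None
    if res.status_code != 200:
        print('error, return status_code is not 200', url)
        return None
    return res.text

In the main loop, the requests.get call and the status-code check could then be replaced by html = fetch(curr_url), with a continue whenever it returns None.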
