from utils import url_manager
import requests
from bs4 import BeautifulSoup
import re
root_url = "http://www.crazyant.net"
# root_url = 'https://yuguo.us/'  # raises an error
# root_url = 'https://www.sanjieke.cn/discover'  # works without error
urls = url_manager.UrlManager()
urls.add_new_url(root_url)  # seed the queue with the root URL
fout = open("craw_all_pages.txt", "w", encoding="utf-8")
# print('hello')
while urls.has_new_url():
    # Fetch the page
    curr_url = urls.get_url()
    try:
        r = requests.get(curr_url, timeout=3)
    except requests.RequestException as e:
        print("error, request failed:", curr_url, e)
        continue
    if r.status_code != 200:
        print("error, return status_code is not 200", curr_url)
        continue

    # Parse the page content
    soup = BeautifulSoup(r.text, "html.parser")
    title = soup.title.string if soup.title else ""

    # Store the target content
    fout.write("%s\t%s\n" % (curr_url, title))
    fout.flush()
    print("success: %s, %s" % (curr_url, title), len(urls.new_urls))
    # print('one pass per loop iteration')

    # Queue every link that looks like a blog-post URL
    links = soup.find_all("a")
    pattern = r'^http://www\.crazyant\.net/\d+\.html$'
    # pattern = r'^https://www.sanjieke.cn/course/detail/sjk/\d+'
    for link in links:
        href = link.get("href")
        if href is None:
            continue
        if re.match(pattern, href):
            urls.add_new_url(href)

fout.close()
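
The script relies on a small UrlManager class from a local utils package that is not shown here. Based only on the calls the script makes (add_new_url, has_new_url, get_url, and the new_urls set read in the progress print), a minimal sketch could look like the following; the original project's version may differ in detail:

# utils/url_manager.py -- a minimal sketch inferred from the calls above;
# not the original project's implementation.
class UrlManager:
    """Tracks which URLs are waiting to be crawled and which are done."""

    def __init__(self):
        self.new_urls = set()  # URLs queued for crawling
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Skip empty URLs and URLs already queued or already crawled
        if not url or url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def get_url(self):
        # Pop an arbitrary pending URL and mark it as crawled
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self):
        return len(self.new_urls) > 0

Keeping separate new_urls and old_urls sets is what prevents the crawler from fetching the same post twice, even though every crawled page links back to many pages already seen.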