import re

import requests
from bs4 import BeautifulSoup

import url_manager

root_url = "http://www.crazyant.net"

urls = url_manager.UrlManager()  # create the URL manager
urls.add_new_url(root_url)

fout = open("craw_all_pages.txt", "w", encoding="utf-8")
while urls.has_new_url():
    current_url = urls.get_url()
    try:
        # Give up on a page if the server does not respond within 3 seconds,
        # so the crawler does not hang; a timeout raises an exception.
        r = requests.get(current_url, timeout=3)
    except requests.RequestException:
        print("error, request failed:", current_url)
        continue
    if r.status_code != 200:
        print("error, return status_code is not 200:", current_url)
        continue
    soup = BeautifulSoup(r.text, "html.parser")  # parse the page with BeautifulSoup
    title = soup.title.string  # extract the page title
    fout.write("%s\t%s\n" % (current_url, title))
    fout.flush()  # write to the file immediately
    print("success:%s,%s,%d" % (current_url, title, len(urls.new_urls)))

    # Add newly discovered URLs to the URL manager.
    links = soup.find_all("a")
    for link in links:
        href = link.get("href")
        if href is None:  # skip links without an href attribute
            continue
        # Use a regular expression to keep only article URLs;
        # the dots are escaped so "." matches literally.
        pattern = r"^http://www\.crazyant\.net/\d+\.html$"
        if re.match(pattern, href):
            urls.add_new_url(href)

fout.close()  # close the file and release the resource
The url_manager module comes from an earlier post: Python爬虫—requests、url管理器、HTML (八饱粥的博客, CSDN).
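If you don't have that earlier post at hand, here is a minimal sketch of what UrlManager might look like, assuming only the interface the crawler above relies on (add_new_url, has_new_url, get_url, and a new_urls set); the original implementation may differ in detail.

class UrlManager:
    """Minimal URL manager: tracks URLs waiting to be crawled and URLs already seen.
    This is an illustrative sketch, not necessarily the original implementation."""

    def __init__(self):
        self.new_urls = set()  # URLs not yet crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Ignore empty URLs and URLs already present in either set,
        # so each page is crawled at most once.
        if not url:
            return
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def get_url(self):
        # Pop a pending URL and record it as crawled.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self):
        return len(self.new_urls) > 0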