一、URL管理器
注意:数据存储使用python内存时程序运行结束数据不会保留,而保存到redis和MySQL中数据可以长久保存。
代码实现:
class URLManager():
    """Manage the URLs of a crawl: which are pending and which are done.

    NOTE: state lives in in-process sets, so it is lost when the program
    exits (unlike a Redis- or MySQL-backed manager, which persists).
    """

    def __init__(self):
        # URLs waiting to be crawled.
        self.new_urls = set()
        # URLs that have already been crawled.
        self.old_urls = set()

    def add_new_url(self, url):
        """Add a single URL, skipping empty values and already-seen URLs.

        :param url: URL string to enqueue; None/empty is ignored.
        """
        if not url:  # covers both None and the empty string
            return
        if url in self.new_urls or url in self.old_urls:
            return  # de-duplicate against pending and finished URLs
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Add a batch of URLs; delegates per-URL checks to add_new_url.

        :param urls: iterable of URL strings; None/empty is ignored.
        """
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def get_url(self):
        """Take one pending URL, mark it as crawled, and return it.

        :return: a pending URL, or None when none is left.
        """
        if not self.has_new_url():
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self):
        """Return True if at least one URL is still waiting to be crawled.

        :return: bool
        """
        return len(self.new_urls) > 0
if __name__ == "__main__":
    # Quick smoke test of URLManager: add, de-duplicate, pop twice, drain.
    url_manager = URLManager()
    url_manager.add_new_url("url1")
    url_manager.add_new_urls(["url1", "url2"])  # "url1" is de-duplicated
    print(url_manager.new_urls, url_manager.old_urls)
    # Separator line between test stages.
    print("#" * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)
    print("#" * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)
    print("#" * 30)
    # BUG FIX: call the method. The original printed the bound-method
    # object (e.g. "<bound method ...>") instead of False, because the
    # parentheses were missing.
    print(url_manager.has_new_url())
二、网页解析器Beautiful Soup
1. 网页解析器的作用:其实就是根据 HTML 标签名称及其他属性来提取指定标签的内容。