不使用框架，写一个功能相对较全面的爬虫，包含监控网站更新、代理 IP、限流器、提取链接、重试下载等基本功能。
代码如下:
主函数:
def run(self):
while not self.crawler_queue.empty():
url_str = self.crawler_queue.get()
print("url_str is ::::::{}".format(url_str))
# 检测robots.txt文件规则
if self.rp.can_fetch(self.headers["User-Agent"], url_str):
self.throttle.wait_url(url_str)
depth = self.visited[url_str]
if depth < MAX_DEP:
# 下载链接
html_content = self.download(url_str)
# 储存链接
if html_content is not None:
self.save_result(html_content, url_str)
# self.mcache[url_str] = html_content
# save_url(html_content, url_str)
pass
else:
continue
# 筛选出页面所有的链接
url_list = extractor_url_lists(html