A general-purpose crawler with extensible features!
import os
import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib  # MD5 message digests
import queue  # FIFO/LIFO queues
import re  # regular expressions
from urllib import robotparser  # parse the site's robots.txt file
from urllib.parse import urlencode, urljoin, urldefrag, urlparse  # URL handling
from threading import Thread  # multithreading
from datetime import datetime
import time
import mongo_cache  # local MongoDB cache module (not shown here)
import random

MAX_DEP = 2  # maximum crawl depth
def get_robots(url):
    """
    Fetch and parse the site's robots.txt file.
    :param url: seed URL of the site
    :return: an initialised RobotFileParser
    """
    rp = robotparser.RobotFileParser()
    # robots.txt lives at the site root, so join against "/robots.txt"
    # rather than "robots.txt" (which would resolve relative to the page path)
    rp.set_url(urljoin(url, "/robots.txt"))
    rp.read()
    return rp
def save_url(html_content, url_str):
    """
    Save the downloaded content to disk.
    :param html_content: raw page content (bytes)
    :param url_str: URL the content was downloaded from
    :return:
    """
    md5 = hashlib.md5()
    md5.update(html_content)
    # file_path = "./download/" + md5.hexdigest() + ".html"
    if not os.path.exists("./download"):
        os.mkdir("./download")
    file_path = "./download/" + gen_html_name(url_str)
    with open(file_path, "wb") as fp:
        fp.write(html_content)


def gen_html_name(url_str):
    """Derive a file name from the last segment of the URL path."""
    path = urlparse(url_str).path
    path_array = path.split("/")
    return path_array[-1]
def extractor_url_lists(html_content):
    """
    Extract the other links contained in a page.
    :param html_content: decoded HTML text
    :return: list of href values found in <a> tags
    """
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)
class CrawlerCommon(Thread):
    """
    A general-purpose crawler covering the basic crawling features
    and dealing with some common anti-crawler countermeasures.
    """

    # alternative signature idea: __init__(self, {"url": url_str, "regex": link_regex})
    def __init__(self, init_url):
        # Python 2.7: super(CrawlerCommon, self).__init__()
        # The parent constructor must be called explicitly, otherwise
        # crawler.start() cannot launch the thread.
        super().__init__()  # Python 3.x form
        __ua = UserAgent()  # random User-Agent generator
        self.seed_url = init_url  # seed URL to start crawling from
        # queue.LifoQueue is a stack (first in, last out);
        # switching the queue type toggles BFS vs. DFS behaviour
        self.crawler_queue = queue.Queue()
        self.crawler_queue.put(init_url)  # enqueue the seed URL
        self.visited = {init_url: 0}  # the seed URL starts at depth 0
        self.rp = get_robots(init_url)  # robots.txt parser
        self.headers = {"User-Agent": __ua.random}  # pick a random User-Agent
        self.link_regex = '(css|js|c)'  # filter pattern for extracted links
        self.throttle = Throttle(5.0)  # download throttle, 5-second interval
        self.mcache = mongo_cache.MongoCache()  # initialise the MongoDB cache
        # self.random_proxy = RandomProxy()
        # self.random_proxy.crawl_proxies()
        # self.random_proxy.verify_proxies()
    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, data, method, proxies):
        """
        Download with automatic retries (via the retrying decorator).
        :param url_str: URL to download
        :param data: POST payload, if any
        :param method: "GET" or "POST"
        :param proxies: proxies dict passed to requests
        :return: raw response content (bytes)
        """
        if method == "POST":
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        # the assert pinpoints where the failure happened; a non-200 status
        # raises AssertionError, which triggers a retry
        assert result.status_code == 200
        return result.content
    def download(self, url_str, data=None, method="GET", proxies=None):
        """
        The actual download entry point.
        :param url_str: URL to download
        :param data: POST payload, if any
        :param method: "GET" or "POST"
        :param proxies: proxies dict passed to requests (None avoids a mutable default argument)
        :return: raw response content, or None on failure
        """
        print("download url is ::::::", url_str)
        # a proxy could be injected here (see RandomProxy below)
        try:
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:
            print(e)
            print("#" * 50)
            result = None
        return result
    def nomalize(self, url_str):
        """
        Normalise an extracted link into an absolute URL.
        :param url_str: possibly relative link taken from the page
        :return: absolute URL with any fragment removed
        """
        real_url, _ = urldefrag(url_str)
        return urljoin(self.seed_url, real_url)
    def save_result(self, html_content, url_str):
        """Store the page in the cache, overwriting only when the content has changed."""
        if url_str not in self.mcache:
            self.mcache[url_str] = html_content
        else:
            data_from_mongo = self.mcache[url_str]
            # initialise the MD5 hashers
            md5_func_mongo = hashlib.md5()
            md5_func_download = hashlib.md5()
            # compute the digests of the cached record and the freshly downloaded content
            md5_func_download.update(html_content)
            md5_func_mongo.update(data_from_mongo)
            print(type(html_content), type(data_from_mongo))  # debug: show the types being compared
            mongo_md5_str = md5_func_mongo.hexdigest()
            download_md5_str = md5_func_download.hexdigest()
            # update the cache only if the page actually changed
            if download_md5_str != mongo_md5_str:
                self.mcache[url_str] = html_content
    def run(self):
        """
        Main crawl loop.
        :return:
        """
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            print("url_str is ::::::{}".format(url_str))
            # check the robots.txt rules before fetching
            if self.rp.can_fetch(self.headers["User-Agent"], url_str):
                self.throttle.wait_url(url_str)
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    # store the result
                    if html_content is not None:
                        self.save_result(html_content, url_str)
                        # self.mcache[url_str] = html_content
                        # save_url(html_content, url_str)
                    else:
                        continue
                    # extract all links from the page
                    url_list = extractor_url_lists(html_content.decode("utf8"))
                    # keep only the links we want to crawl
                    filter_urls = [link for link in url_list if re.search(self.link_regex, link)]
                    for url in filter_urls:
                        # turn each link into an absolute URL
                        real_url = self.nomalize(url)
                        # skip links we have already seen
                        if real_url not in self.visited:
                            # print("link is ::::::", real_url)
                            self.visited[real_url] = depth + 1
                            self.crawler_queue.put(real_url)
            else:
                print("robots.txt disallows fetching", url_str)
class Throttle(object):
    """
    Download throttle: limits how often the same domain is requested.
    """

    def __init__(self, delay):
        # maps each domain to the time it was last accessed
        self.domains = {}
        # required delay between two requests to the same domain
        self.delay = delay

    def wait_url(self, url_str):
        """
        Sleep if the previous request to this domain was too recent.
        :param url_str: URL about to be downloaded
        :return:
        """
        domain_url = urlparse(url_str).netloc  # domain (netloc) part of the URL
        last_accessed = self.domains.get(domain_url)  # when this domain was last downloaded
        if self.delay > 0 and last_accessed is not None:
            # subtract the time elapsed since the last download from the required delay;
            # if something remains, sleep for it, otherwise download immediately
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                print(sleep_interval)
                time.sleep(sleep_interval)
        self.domains[domain_url] = datetime.now()  # record the current time for this domain
        print(self.domains[domain_url])
class RandomProxy(object):
    """
    Random proxy pool.
    """

    def __init__(self):
        self.proxies = []
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
        }

    def crawl_proxies(self):
        """
        Collect proxies (hard-coded here; a real implementation would scrape a proxy list page).
        :return:
        """
        self.proxies.append("220.180.50.14:53281")
        self.proxies.append("119.101.116.5:9999")

    def verify_proxies(self):
        """
        Check each proxy and drop the ones that do not work.
        :return:
        """
        invalid_ip = []
        for ip_str in self.proxies:
            proxies = {
                "http": ip_str
            }
            try:
                r = requests.get("http://www.baidu.com", proxies=proxies, headers=self.headers, timeout=5)
            except requests.RequestException:
                # an unreachable proxy raises an exception instead of returning a status code
                invalid_ip.append(ip_str)
                continue
            if r.status_code == 200:
                continue
            else:
                invalid_ip.append(ip_str)
        for remove_ip in invalid_ip:
            self.proxies.remove(remove_ip)

    def get_one_proxy(self):
        return random.choice(self.proxies)
urls = ["http://www.runoob.com/css/css-tutorial.html", "http://www.runoob.com/js/js-tutorial.html",
"http://www.runoob.com/cprogramming/c-tutorial.html"]
if __name__ == '__main__':
for url_str in urls:
crawler = CrawlerCommon(url_str)
# 当继承Thread线程时, 当调用start()方法后, 系统会默认调用该类的run()方法
crawler.start()
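
The listing imports a local mongo_cache module that is not shown above. From the way it is used (url_str not in self.mcache, self.mcache[url_str] = html_content, and reading self.mcache[url_str] back as bytes), it behaves like a dict backed by MongoDB. Below is a minimal sketch of what such a module could look like, assuming pymongo and a local MongoDB instance; the database and collection names, and the pickle/zlib encoding, are placeholders chosen for illustration rather than taken from the original project.

# mongo_cache.py -- hypothetical sketch, not the author's original module
import pickle
import zlib
from datetime import datetime

from pymongo import MongoClient


class MongoCache(object):
    """Dict-like cache that stores downloaded pages in a MongoDB collection."""

    def __init__(self, client=None, db_name="cache", collection_name="webpage"):
        # connect to a local MongoDB instance unless a client is supplied
        self.client = client or MongoClient("localhost", 27017)
        self.collection = self.client[db_name][collection_name]

    def __contains__(self, url):
        # supports "url_str not in self.mcache"
        return self.collection.find_one({"_id": url}) is not None

    def __getitem__(self, url):
        # supports "data = self.mcache[url_str]"
        record = self.collection.find_one({"_id": url})
        if record is None:
            raise KeyError(url + " does not exist")
        # decompress and unpickle the stored page content (bytes)
        return pickle.loads(zlib.decompress(record["result"]))

    def __setitem__(self, url, result):
        # supports "self.mcache[url_str] = html_content"
        record = {
            "result": zlib.compress(pickle.dumps(result)),
            "timestamp": datetime.utcnow(),
        }
        self.collection.update_one({"_id": url}, {"$set": record}, upsert=True)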