import json
import requests
from functools import wraps
from lxml import etree
import threading
from time import sleep
from urllib.parse import urlparse
from queue import Queue  # only needed by the commented-out single-machine variant below
import redis
# Open a Redis connection; decode_responses=True makes the client return str
# instead of bytes, so URLs popped from the queue can go straight into requests
rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=10, decode_responses=True)
# The distributed database holds three structures: items, start_urls, dupefilter
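# A hedged sketch of how the three shared structures cooperate (illustrative
# contents, not actual data):
#   sohu:start_urls -> list of URLs waiting to be fetched (producers LPUSH, workers LPOP)
#   sohu:dupefilter -> list of URLs that have already been scheduled once
#   sohu:items      -> list of scraped {"title": ..., "href": ...} records
# Any number of machines pointing at the same Redis instance can share the work.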
# [Downloader module]
# A decorator class that gives the downloader retry-on-failure behaviour
class retry(object):
    def __init__(self, max_retries=3, wait=3, exception=(Exception,)):
        super().__init__()
        self.max_retries = max_retries
        self.wait = wait
        self.exp = exception

    # __call__ serves as the outer function of the closure
    def __call__(self, f):
        # Define an inner function that wraps the decorated callable
        @wraps(f)
        def wrapper(*args, **kwargs):
            # Call the outer function's local variable f as the real worker
            for i in range(self.max_retries):
                try:
                    res = f(*args, **kwargs)
                except self.exp as e:
                    print("Exception:", e)
                    sleep(self.wait)
                    print("retries: %d" % (i + 1))
                    continue
                else:
                    return res
            # All retries exhausted: give up and return None
            return None
        return wrapper
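# A minimal usage sketch of the retry decorator (fetch_api is a hypothetical
# name, for illustration only): retry twice, waiting 1 second, but only on
# network-level errors rather than every Exception.
#
# @retry(max_retries=2, wait=1, exception=(requests.RequestException,))
# def fetch_api(url):
#     return requests.get(url, timeout=5).text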
# Wrap the download step in a function
@retry(4, 2)
def fetch(url):
    print("Requesting:", url)
    # Send the request (timeout added so a hung connection triggers the retry logic)
    res = requests.get(
        url=url,
        headers={"user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'},
        timeout=10,
    )
    # Check whether the download succeeded
    if res.status_code == 200:
        # Return the response body
        return res.text
    else:
        print("Download failed, URL %s is invalid" % url)
        return None
# Wrap the parsing step in a generator function
def parse(html):
    try:
        tree = etree.HTML(html)
        # Grab every <a> tag on the current page
        a_list = tree.xpath("//a")
        # Filter the collected <a> tags
        for a in a_list:
            # Extract the title and href
            title = " ".join(a.xpath(".//text()"))
            href = a.xpath("./@href")[0] if a.xpath("./@href") else ""
            # A URL looks like: scheme://host.domain/path...?param1=v1&param2=v2&...#fragment
            # Split the href into scheme, netloc, path and query components
            parse_url = urlparse(href)
            # print(parse_url)
            # ParseResult(scheme='https', netloc='m.sohu.com', path='/promotion', params='', query='posId=19990501&itemId=1000&link=Z0FitIhGq8uqu4NvHEp0wPQHyqXV4KngbIUGw6MfPkv4cgInluvxUnua1aOnw7r5sX5Ui0Uqsndj7eLeX85VikDuWnJhHZ8cOAFglIMOFgjQnONh2r4LHwUd5eMoor6YBozH45WVFzz1fD2RA61RjZLw8KqrysAm5v7E1dbuCQw%3D', fragment='')
            # Check whether the hostname is acceptable (relative links default to m.sohu.com)
            loc = parse_url.netloc.strip() or "m.sohu.com"
            if loc == "m.sohu.com":
                # Filter by scheme
                tmp = parse_url.scheme.strip()
                if tmp == "http" or tmp == "https":
                    scheme = tmp
                elif tmp == "":
                    scheme = "https"
                else:
                    continue
                # Keep the path
                path = parse_url.path.strip()
                # Keep the query string, if any
                query = "?" + parse_url.query.strip() if parse_url.query.strip() else ""
                # Reassemble the URL
                page_url = scheme + "://" + loc + path + query
                # print(page_url)
                # Yield the normalized URL together with its title as a dict
                yield {"title": title, "href": page_url}
    except Exception as e:
        print("Parse error:", e)
# A multi-threaded scheduler module
class Scheduler(threading.Thread):
    # def __init__(self, start_queue, dupefilter_queue):
    def __init__(self):
        super().__init__()
        # self.start_urls = start_queue
        # self.dupefilter_queue = dupefilter_queue

    def run(self):
        while True:
            # Pop a start URL and dispatch the downloader
            # url = self.start_urls.get()
            # Pop from the Redis list "sohu:start_urls"
            url = rds.lpop("sohu:start_urls")
            # The queue may be momentarily empty; back off and try again
            if url is None:
                sleep(1)
                continue
            # Before dispatching the downloader, check the deduplication queue
            # if url in self.dupefilter_queue:
            # Fetch every previously scheduled URL from "sohu:dupefilter"
            # (note: a Redis set with SADD/SISMEMBER would make this check O(1))
            dupe_urls = rds.lrange("sohu:dupefilter", 0, -1)
            if url in dupe_urls:
                continue
            try:
                html = fetch(url=url)
                # Dispatch succeeded, so record the URL in the dedup queue
                # self.dupefilter_queue.append(url)
                # Push it into the Redis list "sohu:dupefilter"
                rds.lpush("sohu:dupefilter", url)
                # Parse the html and push every extracted href back onto start_urls
                for a in parse(html):
                    # self.start_urls.put(a["href"])  # enqueue the newly found URL
                    rds.lpush("sohu:start_urls", a["href"])
                    # print(a)
                    save(a)
            except Exception as e:
                # Dispatch failed: put the URL back on the start queue
                print("Request error:", e)
                # self.start_urls.put(url)
                rds.lpush("sohu:start_urls", url)
def save(data):
    # Redis lists only store flat strings, so serialize the dict as JSON first
    rds.lpush("sohu:items", json.dumps(data, ensure_ascii=False))
    print("%s stored!" % data)
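# Hedged sketch of a consumer that could run on any machine sharing this Redis
# instance and drain the scraped items (assumes save() stored JSON strings;
# consume is an illustrative helper, not part of the original crawler):
def consume():
    while True:
        payload = rds.rpop("sohu:items")
        if payload is None:
            break
        item = json.loads(payload)
        print(item["title"], item["href"])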
if __name__ == '__main__':
    start_url = "https://m.sohu.com/"
    # Define a queue that holds the start addresses
    # start_queue = Queue()
    # Put the start URL into the scheduling queue
    # start_queue.put(start_url)
    # Put the start URL into the Redis list start_urls
    rds.lpush("sohu:start_urls", start_url)
    # Define a queue used for deduplication
    # dupefilter_queue = []
    # Create 4 scheduler threads to drive the downloader and parser
    # schedulers = [Scheduler(start_queue=start_queue, dupefilter_queue=dupefilter_queue) for i in range(4)]
    schedulers = [Scheduler() for i in range(4)]
    for s in schedulers:
        s.start()