Key ideas behind the multi-threaded spider
1. Write the spider as a class. Its own attribute records whether it is currently working,
   and its methods fetch pages, parse pages, and extract/store the data.
2. Write the worker thread as a class as well; its run() method wires the spider's methods together.
3. Add Redis and persistence:
   Redis keeps the spider from crawling the same URL twice, and the persistence logic also goes into run().
After that, just let it crawl.
(o゜▽゜)o☆[BINGO!]
It really is that simple.
from hashlib import sha1
import pymongo
from enum import Enum, unique
from queue import Queue
from random import random
from threading import Thread, local
from time import sleep
from urllib.parse import urlparse
import redis
import requests
from bs4 import BeautifulSoup
@unique
class SpiderStatus(Enum):
    """Spider status enum; @unique guarantees the values are distinct"""
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    """Decode raw page bytes, trying each charset in turn"""
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
            # logging.error('Decode:', error)
    return page_html
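
# Quick illustration of decode_page (sample bytes made up for this note):
# GBK-encoded Chinese is not valid UTF-8, so the first charset fails and
# the next one in the tuple is tried.
# >>> decode_page('你好'.encode('gbk'), charsets=('utf-8', 'gbk'))
# '你好'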

# A decorator implemented as a class
class Retry(object):

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception,)):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):  # implementing __call__ is what makes an instance usable as a decorator
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper
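
# Hypothetical usage of the Retry decorator (function name and URL are made up):
# each failed attempt sleeps 5-10 seconds, and None is returned if all 3 attempts raise.
@Retry(retry_times=3, wait_secs=5, errors=(requests.RequestException,))
def fetch_title(url):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()                      # turn HTTP errors into exceptions so they get retried
    soup = BeautifulSoup(resp.text, 'lxml')
    return soup.title.string if soup.title else None
# e.g. fetch_title('http://m.sohu.com/')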

class Spider(object):
    """The spider"""

    def __init__(self):
        self.status = SpiderStatus.IDLE

    # Fetch a page
    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8',),
              user_agent=None, proxies=None):
        print('[Fetch]: ' + current_url)
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    # Parse a page
    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            if netloc == domain:
                scheme = parser.scheme or 'http'
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                # Each thread uses its own Redis client from thread-local storage,
                # so the threads do not fight over a single shared connection
                redis_client = thread_local.redis_client
                if not redis_client.sismember('url', full_url):
                    redis_client.rpush('m_souhu', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass

class SpiderThread(Thread):

    def __init__(self, spider):
        super().__init__(daemon=True)  # daemon thread: it is killed automatically when the main program exits
        self.spider = spider

    def run(self):
        redis_client = redis.Redis(host='127.0.0.1', port=6379)
        mongo_client = pymongo.MongoClient('mongodb://10.7.54.12:27017/')
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_souhu')  # take a URL from the task queue
            # keep polling until a URL is available, so an empty queue does not raise
            while not current_url:
                current_url = redis_client.lpop('m_souhu')  # rpush on one end plus lpop on the other makes the list a FIFO queue
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('url', current_url):  # sismember checks whether the set already contains this URL
                redis_client.sadd('url', current_url)  # sadd marks it as visited
                self.spider.status = SpiderStatus.WORKING  # mark the spider as working so is_any_alive() below can see it
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    hasher = hasher_proto.copy()  # copy the shared prototype instead of rebuilding a hasher each time, as discussed before
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()  # hex digest of the URL, used as the document id
                    souhu_data_coll = mongo_client.sohu.weburl
                    if not souhu_data_coll.find_one({'_id': doc_id}):
                        souhu_data_coll.insert_one({
                            '_id': doc_id,
                            # store the scraped page data in MongoDB here
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE

# Check the working status
def is_any_alive(spider_threads):
    """any() is True if at least one element is true: returns True if any spider is still WORKING"""
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


# Making the database clients global would be a bad idea: all the threads would
# fight over them. Each thread should get its own Redis client instead.
# redis_client = redis.Redis(host='127.0.0.1', port='6379')
# mongo_client = pymongo.MongoClient('mongodb://180.76.154.142:27017/')
# souhu_data_coll = mongo_client.sohu.weburl
# So the clients are kept per thread via thread-local storage.
thread_local = local()
hasher_proto = sha1()  # shared SHA-1 prototype; each thread copies it instead of creating a new hasher

def main():
    redis_client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # Seed the task queue in Redis if it does not exist yet
    if not redis_client.exists('m_souhu'):
        redis_client.rpush('m_souhu', 'http://m.sohu.com/')
    # task_queue = Queue()
    # task_queue.put('http://m.sohu.com/')
    spider_threads = [SpiderThread(Spider()) for _ in range(10)]
    # Start the threads
    for spider_thread in spider_threads:
        spider_thread.start()
    # Keep the main thread alive while the queue still has URLs or any spider is still working
    while redis_client.llen('m_souhu') or is_any_alive(spider_threads):
        pass
    print('Over!')


if __name__ == '__main__':
    main()
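
A side note on the hasher_proto.copy() trick used in run(): copying the shared SHA-1 prototype just avoids constructing a new hasher for every URL, and the resulting digest is identical to hashing the URL from scratch. A minimal check (standalone sketch, not part of the spider):

from hashlib import sha1

proto = sha1()                               # shared prototype, never updated directly
hasher = proto.copy()                        # cheap per-URL copy
hasher.update('http://m.sohu.com/'.encode('utf-8'))
print(hasher.hexdigest() ==
      sha1('http://m.sohu.com/'.encode('utf-8')).hexdigest())  # prints True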
urlparse
urlparse splits a URL into its named parts,
which is handy whenever we need to rebuild or complete a URL (as in parse() above):
- scheme: http or https
- netloc: the host part at the start of the URL
- path: the path
- query: the parameters after the ?
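
A quick demonstration (the article URL below is made up for illustration):

from urllib.parse import urlparse

parts = urlparse('http://m.sohu.com/a/123456_789?from=home')
print(parts.scheme)   # 'http'
print(parts.netloc)   # 'm.sohu.com'
print(parts.path)     # '/a/123456_789'
print(parts.query)    # 'from=home'

# A relative link has no scheme or netloc, which is why parse() falls back
# to 'http' and the default domain when rebuilding the full URL.
print(urlparse('/a/123456_789?from=home').netloc)  # ''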
Switching to Redis to decide whether a URL has already been seen:
the idea is to use one key to store all the URLs to crawl (a list used as the task queue) and a second key to store the URLs that have already been handled (a set), so every URL taken from the queue can be checked against the set first.
redis_client = redis.Redis(host='127.0.0.1', port=6379)
# The parse() method of Spider changes accordingly (it now uses the global redis_client)
def parse(self, html_page, *, domain='m.sohu.com'):
    soup = BeautifulSoup(html_page, 'lxml')
    for a_tag in soup.body.select('a[href]'):
        parser = urlparse(a_tag.attrs['href'])
        netloc = parser.netloc or domain
        if netloc == domain:
            scheme = parser.scheme or 'http'
            path = parser.path
            query = '?' + parser.query if parser.query else ''
            full_url = f'{scheme}://{netloc}{path}{query}'
            if not redis_client.sismember('url', full_url):
                redis_client.rpush('m_souhu', full_url)
def run(self):
    while True:
        current_url = redis_client.lpop('m_souhu')
        # keep polling until a URL is available, so an empty queue does not raise
        while not current_url:
            current_url = redis_client.lpop('m_souhu')  # rpush on one end plus lpop on the other makes the list a FIFO queue
        current_url = current_url.decode('utf-8')
        if not redis_client.sismember('url', current_url):  # sismember checks whether the set already contains this URL
            redis_client.sadd('url', current_url)  # sadd marks it as visited
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                hasher = hasher_proto.copy()
                hasher.update(current_url.encode('utf-8'))
                doc_id = hasher.hexdigest()
                if not souhu_data_coll.find_one({'_id': doc_id}):
                    souhu_data_coll.insert_one({
                        '_id': doc_id,
                        # store the scraped page data in MongoDB here
                    })
                self.spider.parse(html_page)
        self.spider.status = SpiderStatus.IDLE
Extension, f'{}':
One new point worth noting: the old %s-style string formatting we used before can be replaced with f-strings (f'{}'), which are both cleaner to read and shorter to write.
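
A small comparison (values made up):

name, count = 'm.sohu.com', 10
print('%s started %s spider threads' % (name, count))   # old %-style formatting
print(f'{name} started {count} spider threads')         # f-string: same output, easier to read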