Concurrent downloading in a crawler
Storing the results in memory
from enum import Enum, unique
from queue import Queue
from random import random
from time import sleep
from threading import Thread, current_thread
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3, wait_secs=1, errors=(Exception, )):
        """
        :param retry_times: number of retries
        :param wait_secs: minimum wait between retries
        :param errors: exception types that trigger a retry
        """
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        """
        :param current_url: the URL to fetch
        :return: the decoded HTML page, or None on failure
        """
        thread_name = current_thread().name
        print(f'[{thread_name} Fetch]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """
        :param html_page: an HTML page
        :return: a list of newly discovered URLs
        """
        soup = BeautifulSoup(html_page, 'lxml')
        url_links = []
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if full_url not in visited_urls:
                    url_links.append(full_url)
        return url_links

    def extract(self, html_page):
        """
        :param html_page: an HTML page
        :return: the data extracted from it
        """
        pass

    def store(self, data_dict):
        """
        :param data_dict: the data to persist
        :return:
        """
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider, tasks_queue):
        """
        :param spider: the spider that does the actual work
        :param tasks_queue: the queue of URLs waiting to be crawled
        """
        super().__init__(name=name, daemon=True)
        self.spider = spider
        self.tasks_queue = tasks_queue

    def run(self):
        while True:
            current_url = self.tasks_queue.get()
            visited_urls.add(current_url)
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                url_links = self.spider.parse(html_page)
                for url_link in url_links:
                    self.tasks_queue.put(url_link)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    """
    :param spider_threads: all spider threads
    :return: True if at least one thread is still working, False once all are done
    """
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


visited_urls = set()


def main():
    task_queue = Queue()
    task_queue.put('http://m.sohu.com/')
    spider_threads = [SpiderThread('t%d' % i, Spider(), task_queue)
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while not task_queue.empty() or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
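
The Retry decorator is easy to exercise on its own. Here is a minimal sketch; the flaky function is hypothetical, and the Retry class from the listing above is assumed to be in scope:

from random import random

@Retry(retry_times=3, wait_secs=1)
def flaky():
    # Hypothetical function that fails about two times out of three.
    if random() < 2 / 3:
        raise ConnectionError('simulated network hiccup')
    return 'ok'

# Prints 'ok' if any of the three attempts succeeds, otherwise None.
print(flaky())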
Storing the results in Redis and MongoDB
from enum import Enum, unique
from hashlib import sha1
from random import random
from time import sleep
from threading import Thread, current_thread
from urllib.parse import urlparse

import pymongo
import requests
import redis
from bs4 import BeautifulSoup


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3, wait_secs=1, errors=(Exception, )):
        """
        :param retry_times: number of retries
        :param wait_secs: minimum wait between retries
        :param errors: exception types that trigger a retry
        """
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        """
        :param current_url: the URL to fetch
        :return: the decoded HTML page, or None on failure
        """
        thread_name = current_thread().name
        print(f'[{thread_name} Fetch]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """
        :param html_page: an HTML page
        :return: None (newly found URLs go straight onto the Redis task list)
        """
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        """
        :param html_page: an HTML page
        :return: the data extracted from it
        """
        pass

    def store(self, data_dict):
        """
        :param data_dict: the data to persist
        :return:
        """
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        """
        :param spider: the spider that does the actual work
        """
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': html_page
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    """
    :param spider_threads: all spider threads
    :return: True if at least one thread is still working, False once all are done
    """
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


# Fill in your own connection details (6379 and 27017 are the default ports).
redis_client = redis.Redis(host='<redis-host>', port=6379,
                           password='<password>')
mongo_client = pymongo.MongoClient(host='<mongo-host>', port=27017)
db = mongo_client.msohu
sohu_data_coll = db.webpages
hasher_proto = sha1()


def main():
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('t%d' % i, Spider()) for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
- The code above has a problem: all the threads share one Redis client and one MongoDB client, so they contend for the same connection objects. To fix this, threading.local is imported so that each thread can hold its own clients.
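
A quick sketch of why threading.local helps, independent of the crawler: every thread that assigns to an attribute of a local() object sees only its own value, so per-thread clients are never shared.

from threading import Thread, local

ctx = local()

def worker(tag):
    ctx.client = tag      # each thread stores its own "client" here
    print(ctx.client)     # always prints this thread's own value, never another's

threads = [Thread(target=worker, args=(f'client-{i}',)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()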
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                # Use this thread's own Redis client, never a shared one.
                redis_client = thread_local.redis_client
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        # Each thread creates its own clients and parks them on thread_local,
        # so no connection object is ever shared between threads. Redis must
        # be the same server as in main() below; fill in your own MongoDB host.
        redis_client = redis.Redis(host='120.77.222.217', port=6379,
                                   password='1qaz2wsx')
        mongo_client = pymongo.MongoClient(host='<mongo-host>', port=27017)
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    sohu_data_coll = mongo_client.msohu.webpages
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


thread_local = local()
hasher_proto = sha1()


def main():
    redis_client = redis.Redis(host='120.77.222.217', port=6379,
                               password='1qaz2wsx')
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
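
Because each page is stored as Binary(zlib.compress(pickle.dumps(html_page))), reading one back just reverses that pipeline. A minimal sketch, with placeholder connection details:

import pickle
import zlib

import pymongo

client = pymongo.MongoClient(host='<mongo-host>', port=27017)
doc = client.msohu.webpages.find_one()  # grab any stored document
if doc:
    html_page = pickle.loads(zlib.decompress(doc['page']))
    print(doc['url'], len(html_page))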
Scraping data from geyanw.com (a Chinese quotations site)
import pickle
import re
import zlib
from enum import Enum, unique
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import pymongo
import requests
import redis
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


def data_classify(soup, url, title):
    # Build a data dict for a page, including the content section when present.
    contents = soup.body.select_one('div[class="content"]')
    if contents:
        title = soup.body.select_one('div[class="title"] h2').text
        data = {
            'url': url,
            'title': title,
            'content': contents.text
        }
    else:
        data = {
            'url': url,
            'title': title
        }
    return data


class Retry(object):

    def __init__(self, retry_times=3, wait_time=3, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_time = wait_time
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep(self.wait_time)
            return None
        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, url, *, charsets=('gb2312', 'utf-8'),
              user_agent=None, proxies=None):
        print(f'[{current_thread().name}]: {url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(url, headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html, *, domain='geyanw.com'):
        """
        Topic slugs used by the site:
        lizhimingyan    - inspirational quotes
        renshenggeyan   - mottos on life
        mingyanjingju   - famous aphorisms
        mingrenmingyan  - quotes from famous people
        dushumingyan    - quotes on reading
        jingdianmingyan - classic quotes
        aiqingmingyan   - quotes on love
        """
        soup = BeautifulSoup(html, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if not redis_client.sismember('visited_urls', full_url) and \
                        not redis_client.sismember('url_set', full_url):
                    redis_client.sadd('url_set', full_url)

    def get_data(self, html, current_url):
        soup = BeautifulSoup(html, 'lxml')
        content = soup.body.select_one('div[class="content"]')
        if content:
            href_re = re.compile('index.html')
            topic = soup.body.find('a', {'href': href_re}).text
            topic_url = soup.body.find('a', {'href': href_re}).attrs['href']
            title = soup.body.select_one('div[class="title"] h2').text
            data = {
                'topic': topic,
                'topic_url': topic_url,
                'title': title,
                'url': current_url,
                'content': Binary(zlib.compress(pickle.dumps(content.text)))
            }
            # The collection name is the topic part of the URL,
            # e.g. '/lizhimingyan/index.html' -> 'lizhimingyan'.
            coll = topic_url.split('/')[1]
            if coll == 'html':
                coll = topic_url.split('/')[2]
            return data, coll
        return None

    def store(self, data):
        if data:
            collection = db[data[1]]
            collection.insert_one(data[0])
            return True
        return False


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            current_url = redis_client.spop('url_set')
            while not current_url:
                current_url = redis_client.spop('url_set')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html = self.spider.fetch(current_url)
                if html:
                    self.spider.parse(html)
                    data = self.spider.get_data(html, current_url)
                    if self.spider.store(data):
                        print('Saved')
                    else:
                        print('Save failed')
            self.spider.status = SpiderStatus.IDLE


redis_client = redis.Redis(host='47.98.172.171', port=11223,
                           password='544619417wxz')
mongo_client = pymongo.MongoClient(host='47.98.172.171', port=27017)
db = mongo_client.geyanw
SEEDURL = 'https://geyanw.com/'


def any_is_working(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


def main():
    """Seed URL: https://geyanw.com/"""
    if not redis_client.exists('url_set'):
        redis_client.sadd('url_set', SEEDURL)
    spider_threads = [SpiderThread(f'thread[{i}]', Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    sleep(20)  # give the worker threads a head start
    while redis_client.exists('url_set') or any_is_working(spider_threads):
        sleep(1)  # keep polling while tasks remain or any thread is busy
    print('OVER!')


if __name__ == '__main__':
    main()
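
The decode_page helper is what lets this spider cope with a site that mixes gb2312 and utf-8 pages: it simply tries each charset in order until one decodes cleanly. A self-contained check of that behavior:

def decode_page(page_bytes, charsets):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html

gb_bytes = '名言警句'.encode('gb2312')
# utf-8 raises UnicodeDecodeError on these bytes, so the gb2312 fallback is used.
print(decode_page(gb_bytes, ('utf-8', 'gb2312')))  # -> 名言警句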