**Multiprocess, multithread, and async crawlers.
Concrete crawling rules, anti-bot strategy, cookie/login handling, and so on are ignored here; the focus is on high performance and high concurrency.**
First pass
Crawl the jandan image pages and store the image links in MongoDB.
#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import json
import functools
import requests
import urllib.request
from urllib.request import FancyURLopener
import urllib.parse
import urllib.error
from lxml import etree
import time
from pymongo import MongoClient
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
import asyncio
import aiohttp
TsHeader = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
PATH = r'..\Spider_PoolExecutor'
CHUNK_SIZE = 1024
P_MAX_WORKERS_PAGE = 3
P_MAX_WORKERS_IMGSAVE = 10
T_MAX_WORKERS_PAGE = 3
T_MAX_WORKERS_IMGSAVE = 10
LOCK_TIME = 0.2
EVENT_TIME = 0.2
SEMA_NUM = 3
class DataBase():
def __init__(self):
client = MongoClient('127.0.0.1', 27017)
self._db = client['jandan']
class JDBase():
def __init__(self):
self._chunk_size = CHUNK_SIZE
try:
db = DataBase()
self._db = db._db
        except Exception:
            print('database connection failed')
            self._db = None
    def insert_header(self, header, img_url, call_url):
        # the Referer should be the page URL without its fragment part
        referer = call_url
        if call_url.find('#') > 0:
            referer = call_url[:call_url.find('#')]
        # derive the Host header from the image URL instead of a fixed slice
        host = urllib.parse.urlsplit(img_url).netloc
header.addheader("Host", host)
header.addheader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
if img_url.find('.gif') > 0:
header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
else:
header.addheader("Accept", "*/*")
header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
header.addheader("Accept-Encoding", "gzip,deflate")
header.addheader("Referer", referer)
header.addheader("Connection", "keep-alive")
header.addheader("Upgrade-Insecure-Requests", "1")
return header
    def GetUrl(self, url, data=None):
f = ""
if data: f = "?" + urllib.parse.urlencode(data)
curl = url + f
try:
req = urllib.request.Request(curl, headers=TsHeader)
response = urllib.request.urlopen(req)
html = response.read()
except Exception as e:
print(e)
print(curl)
return False
return html
    def PostUrl(self, url, data):
data = urllib.parse.urlencode(data).encode('utf-8')
try:
request = urllib.request.Request(url, headers=TsHeader)
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
f = urllib.request.urlopen(request, data)
a = f.read().decode('utf-8')
except Exception as e:
print(e)
print(url)
return False
return a
    def page_list(self):
        # build (page number, page URL) tuples for pages 72 down through 63
        p = ('72', 'http://jandan.net/ooxx/page-')
pagelist = []
for i in range(10):
t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
pagelist.append(t)
return pagelist
    def url_dict_onepage(self, args):
        # args is a (page number, page URL) tuple; collect both the src and
        # org_src image attributes from the page
        num = args[0]
        page = args[1]
pagedict = {}
r = self.GetUrl(page)
html = etree.HTML(r)
pic_lis = html.xpath('//ol//li//img/@src')
git_lis = html.xpath('//ol//li//img/@org_src')
git_lis.extend(pic_lis)
pagedict['page'] = num
pagedict['urls'] = git_lis
return pagedict
def url_list_db(self):
urls = []
try:
db_urls = self._db.jd_url.find({}, {'page': 1, 'urls': 1})
        except Exception:
print('db find error')
return False
for url in db_urls:
urls.extend(url['urls'])
return urls
    def file_download(self, url):
        urlx = 'http:' + url  # the stored links are protocol-relative ('//...')
        imgname = url.split('/')[-1]
        if imgname.split('.')[-1] == 'gif':
            imgPath = os.path.join(PATH, 'gif', imgname)
        else:
            imgPath = os.path.join(PATH, 'pic', imgname)
        if not os.path.lexists(imgPath):
            # urllib.request.urlretrieve(urlx, imgPath)
            opener = FancyURLopener()
            opener.addheaders.clear()
            # use the listing page as the Referer (a filesystem path is not a valid one)
            opener = self.insert_header(opener, urlx, 'http://jandan.net/ooxx')
            # open the URL once, then stream the body in CHUNK_SIZE pieces;
            # open()'s second argument is POST data, not a read size
            response = opener.open(urlx)
            with open(imgPath, 'wb+') as f:
                while True:
                    chunk = response.read(self._chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
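For reference, each document that url_dict_onepage produces (and that the executor classes below store via insert_one) has the following shape; a hedged sketch with illustrative values, not real crawl data:

# shape of one document in the jd_url collection
example_doc = {
    'page': '72',                          # page number, kept as a string
    'urls': ['//wx1.example.com/a.jpg'],   # protocol-relative image links (illustrative)
}
# url_list_db() then flattens the 'urls' arrays of every document into one list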
ProcessPoolExecutor (multiprocessing)
class ProcessPE(JDBase):
def __init__(self):
super(ProcessPE, self).__init__()
self._max_workers_page = P_MAX_WORKERS_PAGE
self._max_workers_imgsave = P_MAX_WORKERS_IMGSAVE
    def page_ProcessPE(self, urls):
with ProcessPoolExecutor(max_workers=self._max_workers_page) as executor:
future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
for future in as_completed(future_to_url):
try:
result = future.result()
self._db.jd_url.insert_one(result)
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[db]{}'.format(result))
    def img_download_ProcessPE(self, urls):
with ProcessPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
future_to_url = {executor.submit(self.file_download, url): url for url in urls}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
result = future.result()
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[save]OK')
def Execute(self):
page_list = self.page_list()
self.page_ProcessPE(page_list)
        urls = self.url_list_db()
        if urls:
            self.img_download_ProcessPE(urls)
        else:
            print('no data in the database')
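A caveat with the process-pool variant: executor.submit(self.url_dict_onepage, ...) has to pickle self, and the pymongo objects held in self._db are generally not picklable, so this class can fail where the thread-pool version below works. A minimal sketch of the usual workaround, assuming a hypothetical module-level worker function so that only plain tuples cross the process boundary:

def url_dict_onepage_worker(args):
    # a module-level function pickles cleanly; only the (num, url) tuple
    # has to be sent to the worker process
    num, page = args
    r = JDBase().GetUrl(page)  # each worker builds its own connections
    if not r:
        return {'page': num, 'urls': []}
    html = etree.HTML(r)
    urls = html.xpath('//ol//li//img/@org_src') + html.xpath('//ol//li//img/@src')
    return {'page': num, 'urls': urls}

The parent then inserts the returned dicts into MongoDB itself, exactly as page_ProcessPE already does with future.result().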
ThreadPoolExecutor (multithreading)
class ThreadPE(JDBase):
def __init__(self):
super(ThreadPE, self).__init__()
self._max_workers_page = T_MAX_WORKERS_PAGE
self._max_workers_imgsave = T_MAX_WORKERS_IMGSAVE
    def page_ThreadPE(self, urls):
with ThreadPoolExecutor(max_workers=self._max_workers_page) as executor:
future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
for future in as_completed(future_to_url):
try:
result = future.result()
self._db.jd_url.insert_one(result)
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[db]{}'.format(result))
    def img_download_ThreadPE(self, urls):
with ThreadPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
future_to_url = {executor.submit(self.file_download, url): url for url in urls}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
result = future.result()
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[save]OK')
def Execute(self):
page_list = self.page_list()
self.page_ThreadPE(page_list)
urls = self.url_list_db()
# print(urls)
if urls:
self.img_download_ThreadPE(urls)
else:
            print('no data in the database')
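Where per-future bookkeeping is not needed, executor.map is a more compact way to express the same fan-out. A sketch of a hypothetical alternative method (results arrive in input order, and a raised exception aborts the loop when its result is consumed, unlike the as_completed version above):

    def img_download_ThreadPE_map(self, urls):
        # map() applies file_download to every url on the pool and yields
        # results in input order; exceptions re-raise at consumption time
        with ThreadPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
            for url, _ in zip(urls, executor.map(self.file_download, urls)):
                print('[save]OK', url)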
asyncio (fetching the image links on each page)
class Async_only(JDBase):
def __init__(self):
super(Async_only, self).__init__()
self._pagedict = {}
    async def url_dict_onepage_Async(self, args):
        print('[A]', args)
        num = args[0]
        page = args[1]
        pagedict = {}
        # Async call.
        # This is only an illustration; the proper approach is an async network
        # library such as aiohttp, not a blocking library like urllib.
        # r = await self.GetUrl(page)
        # html = etree.HTML(r)
        # pic_lis = html.xpath('//ol//li//img/@src')
        # git_lis = html.xpath('//ol//li//img/@org_src')
        # git_lis.extend(pic_lis)
        # pagedict['page'] = num
        # pagedict['urls'] = git_lis
        # print(pagedict)
        # Simulate asynchronous I/O with an awaitable sleep:
        # r = await asyncio.sleep(2)
        # Simulate blocking I/O with time.sleep(); execution then proceeds just
        # like synchronous code (time.sleep() returns None, so it cannot be awaited)
        time.sleep(2)
        pagedict['page'] = num
        pagedict['urls'] = page
        print('[B]', pagedict)
    def page_ELoop(self, urls):
        # get the event loop
        loop = asyncio.get_event_loop()
        tasks = [self.url_dict_onepage_Async(args) for args in urls]
        # run the coroutines; gather() accepts bare coroutines, which
        # asyncio.wait() no longer does on Python 3.11+
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
def Execute(self):
page_list = self.page_list()
self.page_ELoop(page_list)
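The effect is easy to verify with a timing check: with time.sleep the coroutines run one after another, while with asyncio.sleep they overlap and the whole batch takes roughly one sleep interval in total. A self-contained sketch:

import asyncio
import time

async def fake_fetch(num):
    await asyncio.sleep(2)   # non-blocking: yields control to the event loop
    # time.sleep(2)          # blocking: would serialize the coroutines
    return num

async def crawl_all():
    # gather() schedules all three coroutines concurrently
    return await asyncio.gather(*(fake_fetch(i) for i in range(3)))

start = time.time()
loop = asyncio.new_event_loop()
results = loop.run_until_complete(crawl_all())
loop.close()
print(results, 'took {:.1f}s'.format(time.time() - start))  # ~2s total, not ~6s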
asyncio + aiohttp
Replace blocking I/O (HTTP requests, database operations, and so on) with asynchronous equivalents.
class AsyncAiohttp(JDBase):
def __init__(self):
        super(AsyncAiohttp, self).__init__()
self._chunk_size = CHUNK_SIZE
def page_list(self):
p = ('74', 'http://jandan.net/ooxx/page-')
pagelist = []
for i in range(2):
t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
pagelist.append(t)
return pagelist
    # concurrency could be capped here with an asyncio.Semaphore (see the sketch after this class)
    async def url_dict_onepage_async(self, args):
        pagedict = {}
        num = args[0]
        page = args[1]
        async with aiohttp.request('GET', page) as r:
data = await r.read()
data = data.decode()
html = etree.HTML(data)
pic_lis = html.xpath('//ol//li//img/@src')
git_lis = html.xpath('//ol//li//img/@org_src')
git_lis.extend(pic_lis)
pagedict['page'] = num
pagedict['urls'] = git_lis
        self._db.jd_url.insert_one(pagedict)  # TODO: batch into one insert_many(); this pymongo call also blocks the event loop (see the sketch at the end)
async def file_download_async(self, url):
urlx = 'http:' + url
imgname = url.split('/')[-1]
if imgname.split('.')[-1] == 'gif':
imgPath = os.path.join(PATH, 'gif', imgname)
else:
imgPath = os.path.join(PATH, 'pic', imgname)
if not os.path.lexists(imgPath):
# urllib.request.urlretrieve(urlx, imgPath)
            # async with aiohttp.request('GET', urlx) as r:
            async with aiohttp.ClientSession() as session:
                async with session.get(urlx) as r:
                    # the file write itself is still ordinary blocking I/O;
                    # for small chunks that is usually acceptable
                    with open(imgPath, 'wb+') as f:
                        while True:
                            chunk = await r.content.read(self._chunk_size)
                            if not chunk:
                                break
                            f.write(chunk)
    def page_ELoop(self, urls):
        # get the event loop
        loop = asyncio.get_event_loop()
        tasks = [self.url_dict_onepage_async(url) for url in urls]
        # run the coroutines
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    def img_download_ELoop(self, urls):
        # page_ELoop() closed its loop, so create and install a fresh one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [self.file_download_async(url) for url in urls]
        # run the coroutines
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
def Execute(self):
page_list = self.page_list()
self.page_ELoop(page_list)
urls = self.url_list_db()
if urls:
self.img_download_ELoop(urls)
else:
            print('no data in the database')
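The commented-out decorator above hints at capping concurrency, and SEMA_NUM is defined at the top of the file but never used. A sketch of how the cap could look with asyncio.Semaphore, as a hypothetical extra method on AsyncAiohttp (the semaphore is created inside the running coroutine so it binds to the active loop):

    async def crawl_pages_limited(self, urls):
        # at most SEMA_NUM page requests are in flight at any moment
        sema = asyncio.Semaphore(SEMA_NUM)
        async def limited(args):
            async with sema:
                await self.url_dict_onepage_async(args)
        await asyncio.gather(*(limited(args) for args in urls))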
if __name__ == '__main__':
start = time.time()
# AA = ProcessPE()
# AA = ThreadPE()
# AA = Async_only()
AA = AsyncAiohttp()
AA.Execute()
print('[endtime][{}]'.format(time.time() - start))
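One loose end from the TODO above: insert_one is a blocking pymongo call, so it stalls the event loop while it runs. Short of switching to an async driver such as motor, the insert can be pushed onto a thread with run_in_executor; a sketch with a hypothetical helper that url_dict_onepage_async could await instead of calling insert_one directly:

async def insert_page_async(db, pagedict):
    # run the blocking pymongo insert in the default thread pool so the
    # event loop stays free to service other coroutines
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, db.jd_url.insert_one, pagedict)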