1 multiprocessing
Python多进程multiprocessing使用示例
multiprocessing 的作用是让我们能够像管理线程一样管理进程,在多核 CPU 上的利用率比 threading 要好得多。
2 从数据库中读取爬到的代理进行验证
下面的代码参考了qiyeboy/IPProxyPool
# -*- coding: utf-8 -*-
'''
Created on June 14, 2017
Check whether an IP (proxy) is usable.
@author: dzm
'''
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
from eie.service.EieIpService import EieIpService
import multiprocessing
from multiprocessing import Process
from eie.middlewares import udf_config
from gevent import monkey
import gevent
# Patch the standard library before `requests` is imported below.
# NOTE(review): this runs after `multiprocessing` has already been imported;
# gevent recommends patching as early as possible -- confirm this order is intended.
monkey.patch_all()
import os
from eie.middlewares.random_user_agent import RandomUserAgent
import json
import time
import requests
# Shared logger taken from the project's udf_config module.
logger = udf_config.logger
# Module-level EieIpService instance; the class below calls its select() and
# delete() methods.
eieIpService = EieIpService()
class CheckIpProxyService(object):
    '''
    Validate stored proxies by sending a request through each of them to
    httpbin and deleting the ones that fail.

    Relies on the module-level ``eieIpService`` (select/delete), ``logger``,
    ``requests``, ``gevent`` and ``RandomUserAgent`` names.
    '''

    def __init__(self):
        '''
        Use httpbin for the HTTP verification; each request times out
        after 5 seconds.
        '''
        self.http_timeout = 5
        self.target_url = 'http://httpbin.org/get'
        self.target_url_https = 'https://httpbin.org/get'
        self.target_url_ip = 'http://httpbin.org/ip'
        # Filled in by get_my_ip().
        self.my_ip = None
        # Maximum number of processes (not read by any method of this class).
        self.max_check_process = multiprocessing.cpu_count()
        # Maximum number of concurrent greenlets per process (batch size in run()).
        self.max_check_construct_per_process = 30
        # Task queue size (not read by any method of this class).
        self.task_queue_size = 50
        # Wait time when the process limit is reached (not read by any method
        # of this class). The misspelled name is kept for compatibility.
        self.check_wati_time = 1

    def detect_proxy(self, proxy):
        '''
        Check one proxy record and delete it from storage when it is unusable.

        :param proxy: mapping with the keys 'ip', 'port' and 'types'.
        '''
        ip = proxy['ip']
        port = proxy['port']
        proxy_url = "http://%s:%s" % (ip, port)
        # The same proxy endpoint is used for both http and https traffic.
        proxies = {"http": proxy_url, "https": proxy_url}
        usable, _types, _speed = self.check_proxy(proxies, proxy['types'])
        if not usable:
            eieIpService.delete(ip, port)

    def check_proxy(self, proxies, types):
        '''
        Test a proxy against the http target when ``types`` is 'HTTP',
        otherwise against the https target.

        :param proxies: ``requests``-style proxies mapping.
        :param types: protocol label of the proxy record.
        :return: tuple ``(ok, anonymity_type, speed)`` from _checkHttpProxy.
        '''
        return self._checkHttpProxy(proxies, types == 'HTTP')

    @staticmethod
    def _classify_anonymity(content):
        '''
        Derive the anonymity code from a parsed httpbin ``/get`` response.

        :param content: dict with the keys 'headers' and 'origin'.
        :return: 2 when 'origin' contains a comma, 1 when a
            'Proxy-Connection' header is present, 0 otherwise.
        :raises KeyError: when 'headers' or 'origin' is missing.
        '''
        # NOTE(review): presumably 2 = transparent, 1 = anonymous,
        # 0 = high-anonymity, as in qiyeboy/IPProxyPool -- confirm.
        headers = content['headers']
        origin = content['origin']
        if ',' in origin:
            return 2
        if headers.get('Proxy-Connection', None):
            return 1
        return 0

    def _checkHttpProxy(self, proxies, is_http=True):
        '''
        Send one GET request through ``proxies`` and report the outcome.

        :param proxies: ``requests``-style proxies mapping.
        :param is_http: True to hit the http target, False for the https one.
        :return: tuple ``(ok, types, speed)``; ``types`` and ``speed`` stay at
            -1 unless they were computed before a failure occurred.
        '''
        types = -1
        speed = -1
        test_url = self.target_url if is_http else self.target_url_https
        try:
            start = time.time()
            request_headers = RandomUserAgent().get_headers()
            r = requests.get(url=test_url, headers=request_headers,
                             timeout=self.http_timeout, proxies=proxies)
            logger.debug('请求结果为%s' % r)
            if not r.ok:
                return False, types, speed
            speed = round(time.time() - start, 2)
            types = self._classify_anonymity(json.loads(r.text))
            logger.debug('%s 代理有效' % proxies)
            return True, types, speed
        except Exception:
            # Any failure (connection error, timeout, malformed body) simply
            # means the proxy is unusable; this is deliberately best-effort.
            logger.debug('%s 代理无效' % proxies)
            return False, types, speed

    def get_my_ip(self):
        '''
        Detect this machine's own IP address and store it in ``self.my_ip``.

        :raises Exception: when the request to ``self.target_url_ip`` or the
            parsing of its response fails.
        '''
        try:
            request_headers = RandomUserAgent().get_headers()
            r = requests.get(url=self.target_url_ip, headers=request_headers,
                             timeout=self.http_timeout)
            self.my_ip = json.loads(r.text)['origin']
        except Exception as e:
            # Record the underlying cause before replacing it with the
            # generic error that callers see.
            logger.debug('get_my_ip failed: %r' % (e,))
            raise Exception('访问 %s 失败,请检查网络连接' % self.target_url_ip)

    def run(self):
        '''
        Check every stored proxy using gevent greenlets, in batches of at most
        ``max_check_construct_per_process`` concurrent checks.
        @see: http://www.cnblogs.com/tkqasn/p/5705338.html
        '''
        proxy_list = eieIpService.select()
        batch = []
        for proxy in proxy_list:
            batch.append(gevent.spawn(self.detect_proxy, proxy))
            if len(batch) >= self.max_check_construct_per_process:
                # Wait for the whole batch before spawning more greenlets.
                gevent.joinall(batch)
                batch = []
        if batch:
            gevent.joinall(batch)
if __name__ == '__main__':
    # Sample proxy endpoint, only needed by the single-proxy check that is
    # commented out below.
    ip, port = '59.37.17.202', '808'
    proxies = {"http": "http://%s:%s" % (ip, port)}
    c = CheckIpProxyService()
    # c.check_proxy(proxies, 'HTTP')
    # Run the full check in a child process and wait for it to finish.
    p = Process(target=c.run)
    p.start()
    p.join()
3 gevent
待续…