web.py is used to build a small local HTTP service (the proxy query/delete API below).
# web.py URL routing table:
#   GET /        -> class `select` (query proxies, returns JSON)
#   GET /delete  -> class `delete` (delete proxies matching the query params)
urls = (
'/', 'select',
'/delete', 'delete'
)
def start_api_server():
    """Launch the web.py HTTP API listening on 0.0.0.0 at config.API_PORT.

    web.py reads its listen address from sys.argv, so the address is
    appended there before the application is built. Blocks inside
    app.run() until the server is stopped.
    """
    listen_addr = '0.0.0.0:%s' % config.API_PORT
    sys.argv.append(listen_addr)
    api_app = web.application(urls, globals())
    api_app.run()
class select(object):
    """Handler for GET /: query stored proxies and return them as JSON."""

    def GET(self):
        query = web.input()
        # 'count' caps the number of rows returned; the remaining query
        # parameters act as filter conditions for sqlhelper.select.
        rows = sqlhelper.select(query.get('count', None), query)
        return json.dumps(rows)
class delete(object):
    """Handler for GET /delete: delete stored proxies matching the query."""

    # NOTE(review): appears unused inside this class; kept for
    # backward compatibility with any external reader.
    params = {}

    def GET(self):
        conditions = web.input()
        return json.dumps(sqlhelper.delete(conditions))
if __name__ == '__main__':
    # Standalone run: serve the API on a fixed port (8000) instead of
    # the configured config.API_PORT used by start_api_server().
    sys.argv.append('0.0.0.0:8000')
    standalone_app = web.application(urls, globals())
    standalone_app.run()
q1: shared queue of freshly crawled proxy IPs (not yet verified).
q2: shared queue of proxy IPs that passed validation.
myip = getMyIP()
DB_PROXY_NUM = Value('i', 0)  # shared integer passed between processes
q1 = Queue(maxsize=TASK_QUEUE_SIZE)   # crawled, unverified proxies
q2 = Queue()                          # proxies that passed validation
p0 = Process(target=start_api_server)
p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip))
p2 = Process(target=validator, args=(q1, q2, myip))
p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
# Start everything, then wait for all workers (same order as before).
for worker in (p0, p1, p2, p3):
    worker.start()
for worker in (p0, p1, p2, p3):
    worker.join()
gevent.spawn and gevent.joinall are used to create concurrent tasks: check the proxies already in the database, and start new crawl tasks when the count falls below the threshold.
While crawling, responses are parsed according to the rules configured in the parser list.
def parse(self, response, parser):
    """Dispatch *response* to the parsing strategy named by parser['type'].

    :param response: raw HTTP response body to parse
    :param parser: rule dict; 'type' selects 'xpath', 'regular' (regex),
        or 'module' (a custom method looked up via parser['moduleName'])
    :return: whatever the selected parser returns, or None for an
        unknown type
    """
    strategy = parser['type']
    if strategy == 'xpath':
        return self.XpathPraser(response, parser)
    if strategy == 'regular':
        return self.RegularPraser(response, parser)
    if strategy == 'module':
        return getattr(self, parser['moduleName'], None)(response, parser)
    return None
# NOTE(review): fragment of a larger method (it reads self.myip/self.proxies);
# the enclosing `def` is outside this chunk and the original indentation has
# been lost, so the real nesting is presumably deeper than shown — confirm
# against the full file before editing.
# Re-check every proxy currently in the DB, at most
# MAX_CHECK_CONCURRENT_PER_PROCESS greenlets in flight at a time.
for proxy in proxylist:
spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
gevent.joinall(spawns)
spawns= []
# Wait for the final partial batch of checks.
gevent.joinall(spawns)
# Publish the count of surviving proxies to the shared counter.
self.db_proxy_num.value = len(self.proxies)
# NOTE(review): `str` shadows the builtin of the same name in this scope.
str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
if len(self.proxies) < MINNUM:
str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
sys.stdout.write(str + "\r\n")
sys.stdout.flush()
# Spawn one crawl greenlet per parser rule, throttled to
# MAX_DOWNLOAD_CONCURRENT at a time — presumably this whole section
# belongs inside the `len < MINNUM` branch above; confirm indentation.
spawns = []
for p in parserList:
spawns.append(gevent.spawn(self.crawl, p))
if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
gevent.joinall(spawns)
spawns= []
gevent.joinall(spawns)
When a proxy fails its check: if its score is below 1 the proxy is deleted, otherwise the score is decremented by 1; proxies that pass the check are (re)inserted.
def validator(queue1, queue2, myip):
    """Batch proxies from queue1 and fan validation out to worker processes.

    Workers (process_start) push validated proxies onto queue2 and report
    their own pid on a local control queue when finished, so this parent
    loop can reap them. Runs forever.

    :param queue1: queue of crawled, unverified proxies
    :param queue2: queue that receives proxies which pass validation
    :param myip: this machine's public IP, used by the checkers
    """
    batch = []       # proxies accumulated for the next worker process
    workers = {}     # live worker processes, keyed by pid
    done_q = Queue() # control queue: pids of workers that have finished
    while True:
        if not done_q.empty():
            # Reap one finished worker; best-effort, failures are ignored.
            try:
                finished_pid = done_q.get()
                workers.pop(finished_pid)
                zombie = psutil.Process(finished_pid)
                zombie.kill()
                zombie.wait()
            except Exception:
                pass
        try:
            if len(workers) >= config.MAX_CHECK_PROCESS:
                time.sleep(config.CHECK_WATI_TIME)
                continue
            batch.append(queue1.get())
            if len(batch) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS:
                checker = Process(target=process_start,
                                  args=(batch, myip, queue2, done_q))
                checker.start()
                workers[checker.pid] = checker
                batch = []
        except Exception:
            # Interrupted while filling a batch: flush whatever we have.
            if len(batch) > 0:
                checker = Process(target=process_start,
                                  args=(batch, myip, queue2, done_q))
                checker.start()
                workers[checker.pid] = checker
                batch = []
Checks whether a proxy IP can actually proxy traffic. New worker processes are created according to the configured process and coroutine counts; when a check finishes, the worker reports on the control queue cntl_q so the parent is guaranteed to clean it up, and proxies that pass are put onto queue2.
protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)
r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)  # presumably determines the anonymity level and whether the proxy supports encryption (HTTPS) — confirm
class ISqlHelper(object):
    """Abstract interface for proxy-storage backends.

    Subclasses implement CRUD operations against their own store.
    ``params`` documents the filter keys accepted by select/delete
    conditions.
    """

    params = {'ip': None, 'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None}

    def init_db(self):
        """Create the backing tables/connection."""
        # BUG FIX: the original did `raise NotImplemented`, which raises
        # TypeError in Python 3 (NotImplemented is not an exception).
        # NotImplementedError is the correct abstract-method sentinel.
        raise NotImplementedError

    def drop_db(self):
        """Destroy the backing tables/state."""
        raise NotImplementedError

    def insert(self, value=None):
        """Insert one proxy record described by *value*."""
        raise NotImplementedError

    def delete(self, conditions=None):
        """Delete records matching *conditions*."""
        raise NotImplementedError

    def update(self, conditions=None, value=None):
        """Update records matching *conditions* with *value*."""
        raise NotImplementedError

    def select(self, count=None, conditions=None):
        """Return up to *count* records matching *conditions*."""
        raise NotImplementedError