IPProxyPool Proxy Pool Analysis

A small local API service is built with web.py:

import sys
import json
import web

import config
from db.DataStore import sqlhelper  # project-internal modules, per the IPProxyPool layout

# URL routing: '/' queries proxies, '/delete' removes them
urls = (
    '/', 'select',
    '/delete', 'delete'
)

def start_api_server():
    # web.py reads its listen address from sys.argv
    sys.argv.append('0.0.0.0:%s' % config.API_PORT)
    app = web.application(urls, globals())
    app.run()

class select(object):
    def GET(self):
        inputs = web.input()
        # 'count' limits the result size; the other query fields act as filters
        json_result = json.dumps(sqlhelper.select(inputs.get('count', None), inputs))
        return json_result

class delete(object):
    params = {}

    def GET(self):
        inputs = web.input()
        json_result = json.dumps(sqlhelper.delete(inputs))
        return json_result

if __name__ == '__main__':
    sys.argv.append('0.0.0.0:8000')
    app = web.application(urls, globals())
    app.run()
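With the server running, the pool can be queried over plain HTTP: 'count' is read explicitly by the select handler above and the remaining query fields are passed through as filter conditions. A quick smoke test, assuming the default port 8000 from the __main__ block:

import requests

# fetch up to 5 proxies; the exact shape of the JSON depends on sqlhelper.select
r = requests.get('http://127.0.0.1:8000/?count=5')
print(r.json())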
 

q1 is the shared queue of crawled IPs; q2 is the shared queue of IPs that passed validation.
from multiprocessing import Process, Queue, Value

from config import TASK_QUEUE_SIZE
# project helpers assumed in scope: getMyIP, start_api_server,
# startProxyCrawl, validator, store_data

myip = getMyIP()
DB_PROXY_NUM = Value('i', 0)  # shared integer: number of proxies in the DB
q1 = Queue(maxsize=TASK_QUEUE_SIZE)  # crawled proxies waiting for validation
q2 = Queue()                         # validated proxies waiting to be stored
p0 = Process(target=start_api_server)
p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip))
p2 = Process(target=validator, args=(q1, q2, myip))
p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
p0.start()
p1.start()
p2.start()
p3.start()
p0.join()
p1.join()
p2.join()
p3.join()
The crawler uses gevent.spawn and gevent.joinall to run concurrent tasks: it first checks the proxies already in the database and, when the count falls below the threshold, launches new crawl tasks.

When crawling, each response is parsed according to the rules configured in parserList:

def parse(self, response, parser):
    '''
    Dispatch to a parsing strategy based on the rule's type.

    :param response: the HTTP response body
    :param parser: a parserList rule dict with a 'type' key
    :return: the parsed proxies, or None for an unknown type
    '''
    if parser['type'] == 'xpath':
        return self.XpathPraser(response, parser)
    elif parser['type'] == 'regular':
        return self.RegularPraser(response, parser)
    elif parser['type'] == 'module':
        # look up a custom parser method on this class by name
        return getattr(self, parser['moduleName'], None)(response, parser)
    else:
        return None
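The dispatch above implies what a rule looks like. A hypothetical parserList entry is sketched below; the 'urls', 'pattern' and 'position' field names are illustrative assumptions, only 'type' and 'moduleName' are confirmed by the code:

parserList = [
    {
        # pages to fetch, and an XPath selecting each table row of proxies
        'urls': ['http://proxy-site.example/page/%d' % n for n in range(1, 5)],
        'type': 'xpath',
        'pattern': ".//table//tr[position()>1]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]'},
    },
    # a 'module' rule instead names a custom parser method resolved via getattr:
    # {'urls': [...], 'type': 'module', 'moduleName': 'myCustomPraser'},
]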

 
# excerpt from the crawler's main loop; MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS
# and MAX_DOWNLOAD_CONCURRENT come from config
for proxy in proxylist:
    spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
    if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
        gevent.joinall(spawns)
        spawns = []
gevent.joinall(spawns)
self.db_proxy_num.value = len(self.proxies)
msg = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

if len(self.proxies) < MINNUM:
    msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM, start crawling...'
    sys.stdout.write(msg + "\r\n")
    sys.stdout.flush()
    spawns = []
    for p in parserList:
        spawns.append(gevent.spawn(self.crawl, p))
        if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
            gevent.joinall(spawns)
            spawns = []
    gevent.joinall(spawns)
When a proxy fails the check, its score decides its fate: a score below 1 means the proxy is deleted, otherwise the score is decremented by one; a proxy that passes the check is added back to the pool.
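A minimal sketch of that scoring rule, assuming a sqlhelper with the CRUD interface shown later and a 'score' column (both the column name and the helper calls are illustrative, not the project's exact API):

def rescore(sqlhelper, proxy, passed):
    cond = {'ip': proxy['ip'], 'port': proxy['port']}
    if passed:
        sqlhelper.insert(proxy)                                 # keep / re-add it
    elif proxy['score'] < 1:
        sqlhelper.delete(cond)                                  # too unreliable: drop it
    else:
        sqlhelper.update(cond, {'score': proxy['score'] - 1})   # penalize the failure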

 
import time
import psutil
from multiprocessing import Process, Queue

import config

def validator(queue1, queue2, myip):
    tasklist = []
    proc_pool = {}     # all running check processes, keyed by pid
    cntl_q = Queue()   # control queue: finished processes report their pid here
    while True:
        if not cntl_q.empty():
            # reap processes that have reported completion
            try:
                pid = cntl_q.get()
                proc = proc_pool.pop(pid)
                proc_ps = psutil.Process(pid)
                proc_ps.kill()
                proc_ps.wait()
            except Exception as e:
                pass
                # print(e)
                # print(" we are unable to kill pid:%s" % (pid))
        try:
            # proxy_dict = {'source':'crawl','data':proxy}
            if len(proc_pool) >= config.MAX_CHECK_PROCESS:
                time.sleep(config.CHECK_WATI_TIME)
                continue
            proxy = queue1.get()
            tasklist.append(proxy)
            # once a full batch is collected, hand it to a new check process
            if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS:
                p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
                p.start()
                proc_pool[p.pid] = p
                tasklist = []

        except Exception as e:
            # on error, flush whatever partial batch has accumulated
            if len(tasklist) > 0:
                p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
                p.start()
                proc_pool[p.pid] = p
                tasklist = []

The validator checks whether each proxy IP actually works. Based on the configured process and coroutine counts, it spawns new check processes; when a process finishes, its pid is put on the control queue cntl_q so the parent loop can reliably reap it, and proxies that pass the check are pushed into queue2.
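From that description, process_start could look roughly like the following sketch (detect_proxy is a hypothetical per-proxy check standing in for the project's real one):

import os
import gevent
from gevent import monkey
monkey.patch_all()

def process_start(tasklist, myip, queue2, cntl_q):
    # check the whole batch concurrently with coroutines
    spawns = [gevent.spawn(detect_proxy, myip, proxy, queue2) for proxy in tasklist]
    gevent.joinall(spawns)
    # report our pid so the parent loop in validator() can reap this process
    cntl_q.put(os.getpid())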
 
Each candidate is checked by a function looked up dynamically from config:

protocol, types, speed = getattr(sys.modules[__name__], config.CHECK_PROXY['function'])(selfip, proxies)

Inside the check, a request is sent through the candidate proxy:

r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)

The response is then used to judge the proxy's anonymity level and whether it supports encrypted (HTTPS) traffic.
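A minimal sketch of such a check function, assuming a test URL that echoes the caller's IP (httpbin.org here is illustrative, and the numeric codes are assumptions; only the (protocol, types, speed) return shape is taken from the call above):

import time
import requests
import config

def check_anonymity(selfip, proxies):
    start = time.time()
    r = requests.get(url='http://httpbin.org/get', headers=config.get_header(),
                     timeout=config.TIMEOUT, proxies=proxies)
    speed = time.time() - start
    origin = r.json().get('origin', '')
    # if our real IP leaks into the echoed origin, the proxy is transparent;
    # otherwise treat it as high-anonymity in this simplified sketch
    types = 2 if selfip in origin else 0
    # 0 = http, 1 = https, judged by which scheme was tunneled (assumed codes)
    protocol = 1 if proxies.get('https') else 0
    return protocol, types, speed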
All storage backends implement the abstract ISqlHelper interface:

class ISqlHelper(object):
    params = {'ip': None, 'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None}

    def init_db(self):
        raise NotImplementedError

    def drop_db(self):
        raise NotImplementedError

    def insert(self, value=None):
        raise NotImplementedError

    def delete(self, conditions=None):
        raise NotImplementedError

    def update(self, conditions=None, value=None):
        raise NotImplementedError

    def select(self, count=None, conditions=None):
        raise NotImplementedError
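One concrete backend might look like this minimal sketch, assuming SQLAlchemy and an illustrative schema that mirrors the params dict above (the project's actual SqlHelper differs in detail):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Proxy(Base):
    # illustrative schema mirroring ISqlHelper.params
    __tablename__ = 'proxys'
    id = Column(Integer, primary_key=True)
    ip = Column(String(16))
    port = Column(Integer)
    types = Column(Integer)
    protocol = Column(Integer)
    country = Column(String(32))
    area = Column(String(32))

class SqlHelper(ISqlHelper):
    def __init__(self, url='sqlite:///proxy.db'):
        self.engine = create_engine(url)
        self.Session = sessionmaker(bind=self.engine)

    def init_db(self):
        Base.metadata.create_all(self.engine)

    def select(self, count=None, conditions=None):
        # apply only the whitelisted filter fields from params
        session = self.Session()
        query = session.query(Proxy.ip, Proxy.port)
        for key, value in (conditions or {}).items():
            if key in self.params and value is not None:
                query = query.filter(getattr(Proxy, key) == value)
        if count:
            query = query.limit(int(count))
        return [tuple(row) for row in query.all()]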

