Python 简易异步代理池爬虫

该代码实现了一个使用 Python 的异步库 aiohttp 来爬取多个代理IP源,并验证它们对 HTTPS 的支持。通过遍历多个代理IP网站,将IP和端口存储在字典中,然后异步发送请求到目标URL以检查代理的有效性。最终,保存有效的HTTPS代理IP。代码中包含了一些网站的URL和正则表达式来解析IP和端口。
摘要由CSDN通过智能技术生成
每一个代理源单独异步去爬不方便,所以将所有的代理源的url集中起来,添加到任务队列里,爬出的结果再去根据代理源解析ip(大部分都只取前几页,后面的基本失效了)
将target_url中'http://icanhazip.com'改为'https://icanhazip.com'就可以检验IP是否支持https了(成功率大概只有1%-5%,免费的要什么自行车...哈哈哈)
要添加新的代理源也可像我这样去添加(不建议),爬出来的结果建议单独封装去取ip(我这样瞎搞迟早废掉)
import re
import asyncio
import json
import aiohttp
from keywords import Get
from socketbase import Sendmethod

class Proxy:
    """Crawl several free-proxy listing sites concurrently and keep the
    proxies that successfully relay a request to ``self.target_url``.

    Workflow: each source method queues ``(source_name, url)`` pairs on
    ``self.url_list``; ``crawl_ip`` fetches every queued page and parses
    ``ip -> port`` entries into ``self.proixes``; ``check_proxy`` then
    filters the working ones into ``self.good_proixes``.
    """

    def __init__(self):
        # aiohttp over SSL needs the selector event loop on Windows (the
        # default Proactor loop breaks it).  The policy class only exists
        # on Windows, so guard the call to keep the class importable on
        # other platforms (the original crashed with AttributeError there).
        if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'):
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        self.url_list = []        # queued (source_name, url) crawl tasks
        self.proixes = {}         # candidate proxies: ip -> port
        self.good_proixes = {}    # proxies that passed the validity check
        self.index = 0            # shared progress counter for the bar
        # Switch to 'https://icanhazip.com' to validate HTTPS support.
        self.target_url = 'http://icanhazip.com'

    def kuaidaili(self):
        """Queue the first two listing pages of kuaidaili.com."""
        url = 'https://www.kuaidaili.com/free/inha/'
        for i in range(1, 3):
            self.url_list.append(('kuaidaili', url + str(i) + '/'))

    def xiaohuan(self):
        """Queue three listing pages of ip.ihuan.me (opaque page tokens)."""
        url = 'https://ip.ihuan.me/address/5Lit5Zu9.html?page='
        pages = ['b97827cc', '4ce63706', '5crfe930']
        for page in pages:
            self.url_list.append(('xiaohuan', url + page))

    def ip66(self):
        """Queue the front page of 66ip.cn."""
        url = 'http://www.66ip.cn/index.html'
        self.url_list.append(('ip66', url))

    def dieniao(self):
        """Queue the free-proxy page of dieniao.com."""
        url = 'https://www.dieniao.com/FreeProxy.html'
        self.url_list.append(('dieniao', url))

    def kaixin(self):
        """Queue the JSON demo endpoint of proxy11.com."""
        url = 'https://proxy11.com/api/demoweb/proxy.json'
        self.url_list.append(('kaixin', url))

    def yundaili(self):
        """Queue the first three listing pages of ip3366.net."""
        url = 'http://www.ip3366.net/?stype=1&page='
        for i in range(1, 4):
            self.url_list.append(('yundaili', url + str(i)))

    def freedaili(self):
        """Queue the first three China-filtered pages of ip.jiangxianli.com."""
        for i in range(1, 4):
            url = f'https://ip.jiangxianli.com/?page={i}&country=%E4%B8%AD%E5%9B%BD'
            self.url_list.append(('freedaili', url))

    def daili89(self):
        """Queue the first six listing pages of 89ip.cn."""
        for i in range(1, 7):
            url = f'https://www.89ip.cn/index_{i}.html'
            self.url_list.append(('daili89', url))

    def zhandaye(self):
        """Discover and queue the three newest zdaye.com daily-proxy threads.

        Unlike the other sources this needs a synchronous pre-fetch of the
        index page to find the thread URLs before they can be queued.
        """
        main_url = 'https://www.zdaye.com'
        self.header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'
        }
        res = Sendmethod().method('get', main_url + '/dayProxy.html', headers=self.header)
        main_pages = re.findall('<H3 class="thread_title"><a href="(.*?)">', res.text)
        for main_page in main_pages[0:3]:
            url = main_url + main_page
            for x in range(6):
                if x == 0:
                    self.url_list.append(('zhandaye', url))
                else:
                    # Follow-up pages replace the trailing '.html'
                    # with '/<n>.html'.
                    self.url_list.append(('zhandaye', url[:-5] + '/' + str(x) + '.html'))

    async def crawl_ip(self, item):
        """Fetch one queued page and parse its proxies into ``self.proixes``.

        ``item`` is a ``(source_name, url)`` pair; the source name selects
        the site-specific regex rules below.
        """
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(trust_env=True) as conn:
            async with conn.get(item[1], timeout=timeout, ssl=False) as res:
                response = await res.text()
                if item[0] == 'kuaidaili':
                    ips = re.findall('<td data-title="IP">(.*?)</td>', response)
                    ports = re.findall('<td data-title="PORT">(.*?)</td>', response)
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'xiaohuan':
                    content = re.findall('<img src="/flag/CN.svg">(.*?) href="/address/5Lit5Zu9.html">', response)
                    for i in content:
                        ip = re.findall('(.*?)</a></td><td>', i)
                        port = re.findall('</a></td><td>(.*?)</td><td><a', i)
                        self.proixes[ip[0]] = port[0]
                elif item[0] == 'ip66':
                    ips = re.findall('<tr><td>(.*?)</td><td>', response)[1:]
                    ports = re.findall('</td><td>(.*?)</td><td>', response)
                    ports = list(filter(lambda x: x.isdigit(), ports))
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'dieniao':
                    ips = re.findall("<span class='f-address'>(.*?)</span>", response)[1:]
                    ports = re.findall("<span class='f-port'>(.*?)</span>", response)
                    ports = list(filter(lambda x: x.isdigit(), ports))
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'kaixin':
                    ips = Get.get_keyword(json.loads(response), 'ip')
                    ports = Get.get_keyword(json.loads(response), 'port')
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'yundaili':
                    content = re.findall("<tr>(.*?)</tr>", response, re.S)[1:]
                    # BUGFIX: the inner loop used to rebind ``item`` and
                    # clobber the function argument; use a fresh name.
                    for row in content:
                        ip = re.findall("<td>(.*?)</td>", row)[0]
                        port = re.findall("<td>(.*?)</td>", row)[1]
                        self.proixes[ip] = port
                elif item[0] == 'freedaili':
                    content = re.findall('<link rel="dns-prefetch" href="//github.com">(.*?)<link rel="dns-prefetch" href="//buy.jiangxianli.com">', response, re.S)
                    ips = re.findall('<link rel="dns-prefetch" href="//(.*?):', content[0])
                    ports = re.findall(':(.*?)">', content[0])
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'daili89':
                    content = re.findall('<td>(.*?)</td>', response, re.S)
                    # Cells repeat in groups of five: ip, port, then metadata.
                    ips = [content[index].strip('\n').strip('\t') for index in range(0, len(content), 5)]
                    ports = [content[index].strip('\n').strip('\t') for index in range(1, len(content), 5)]
                    for ip, port in zip(ips, ports):
                        self.proixes[ip] = port
                elif item[0] == 'zhandaye':
                    ips = re.findall('<a href="/ip/CheckHttp/(.*?)" title=', response)
                    for i in ips:
                        self.proixes[i.split(':')[0]] = i.split(':')[1]
                self.index += 1
                self.progress_bar('爬取IP进度', len(self.url_list))

    async def check_proxy(self, proxy):
        """Try ``self.target_url`` through ``proxy`` = (ip, port); keep the
        proxy in ``self.good_proixes`` on any 2xx response with a body."""
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(trust_env=True) as conn:
            async with conn.get(self.target_url, timeout=timeout, proxy=f'http://{proxy[0]}:{proxy[1]}', ssl=False) as res:
                self.index += 1
                response = await res.text()
                if str(res.status)[0] == '2' and response:
                    self.good_proixes[proxy[0]] = proxy[1]
                    self.progress_bar('成功率', len(self.proixes))

    def progress_bar(self, msg, length):
        """Render a crude in-place progress bar from ``self.index/length``."""
        if length == 0:
            # Nothing queued: avoid ZeroDivisionError, print nothing.
            return
        print('\r', f"{msg}: {int((self.index/length)*100)}%: ", "▋" * int((self.index / length) * 100), end="")

    async def main(self):
        """Queue every source, crawl all pages, then validate candidates."""
        self.kuaidaili()
        self.xiaohuan()
        self.ip66()
        self.dieniao()
        self.kaixin()
        self.yundaili()
        self.freedaili()
        self.daili89()
        self.zhandaye()
        # Crawl the proxy listings concurrently; per-task failures are
        # swallowed by return_exceptions so one dead source can't abort all.
        tasks = [asyncio.ensure_future(self.crawl_ip(item)) for item in self.url_list]
        await asyncio.gather(*tasks, return_exceptions=True)
        self.index = 0  # reset the counter for the validation progress bar
        tasks = [asyncio.ensure_future(self.check_proxy(proxy)) for proxy in self.proixes.items()]
        await asyncio.gather(*tasks, return_exceptions=True)

    def entrance(self):
        """Synchronous entry point: run the whole pipeline to completion."""
        # asyncio.run replaces the deprecated
        # get_event_loop()/run_until_complete pattern.
        asyncio.run(self.main())


if __name__ == '__main__':
    # Entry point: crawl every proxy source, validate all candidates
    # against target_url, then dump the proxies that passed the check.
    a=Proxy()
    a.entrance()
    print(a.good_proixes)
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值