Tool ------ Fetching the Latest Proxy IPs (66ip.cn)

Scrape proxy IPs from the 66ip free proxy site (www.66ip.cn); a minimal sketch of the XPath extraction step follows the stack list below.

Stack:
- Python 3
- requests
- XPath (via lxml)
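
Before the full scraper, here is a minimal offline sketch of the extraction step. The sample HTML below is an assumption that mirrors the table layout the XPath expressions target; the live page markup may differ.

from lxml import etree

# Hypothetical snippet mimicking 66ip.cn's listing table:
# a header row followed by one <tr> per proxy.
sample = """
<div id="main">
  <div><div>
    <table>
      <tr><th>ip</th><th>port</th></tr>
      <tr><td>1.2.3.4</td><td>8080</td></tr>
      <tr><td>5.6.7.8</td><td>3128</td></tr>
    </table>
  </div></div>
</div>
"""

tree = etree.HTML(sample)
# Skip the header row (position() > 1); grab the first two columns.
ips = tree.xpath('//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
ports = tree.xpath('//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')
print(list(zip(ips, ports)))  # [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]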


import userAgent  # local helper module that supplies a User-Agent headers dict
import requests
from lxml import etree
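
# NOTE: `userAgent` above is a local helper module, not a PyPI package.
# A minimal stand-in (an assumption; the original module is not shown):
#
#   class User_Agent:
#       @staticmethod
#       def get_user_agent(kind):
#           # Return a headers dict carrying a desktop User-Agent string.
#           return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}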

class Proxies:
    """
    Scrape free proxies from 66ip.cn.

    Usage:
        proDicList = Proxies.get_proxies(4)
    """
    url = 'http://www.66ip.cn/'  # nationwide listing
    # url = 'http://www.66ip.cn/areaindex_1/'  # per-region listing
    proxies = []  # validated proxy dicts, shared across calls
    geshu = 0     # target number of proxies ("geshu" = "count")


    @staticmethod
    def get_one_proxy():
        """Return a single validated proxy dict."""
        return Proxies.get_proxies(1)[0]

    @staticmethod
    def get_proxies(count):
        """
        Return a list of `count` validated proxy dicts.
        """
        Proxies.geshu = count
        Proxies.proxies = []  # reset so repeated calls don't reuse old results
        i = 1  # incremented before use, so crawling starts from page 2
        # Keep crawling page by page until at least `count` proxies have been
        # validated, then return the first `count` of them.
        while True:
            i += 1
            url_new = Proxies.url + str(i) + '.html'
            content = Proxies.get_content(url_new)
            Proxies.get_info(content)  # parse and validate one page of proxies

            if len(Proxies.proxies) >= count:
                return Proxies.proxies[:count]

    @staticmethod
    def get_content(url):
        """
        Fetch a page and return its HTML text.
        """
        headers = userAgent.User_Agent.get_user_agent('pc')  # desktop User-Agent headers
        response = requests.get(url=url, headers=headers)
        return response.text

    @staticmethod
    def get_info(content):
        """Extract (ip, port) pairs from one listing page and validate each one."""
        tree = etree.HTML(content)  # parse once, query twice
        datas_ips = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
        datas_ports = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')

        for ip, port in zip(datas_ips, datas_ports):
            Proxies.verif_ip(ip, port)  # keep only proxies that pass validation
            if len(Proxies.proxies) >= Proxies.geshu:
                break

    @staticmethod
    def verif_ip(ip, port):
        """Validate a proxy by making a real request through it; keep it if it works."""
        user_agent = userAgent.User_Agent.get_user_agent('pc')

        # The very short timeout filters out slow proxies.
        # Try the address as an HTTP proxy first, then as an HTTPS proxy.
        # The proxy-dict key must match the scheme of the requested URL,
        # otherwise requests bypasses the proxy entirely.
        candidates = [
            ('http://www.baidu.com', {'http': 'http://' + ip + ':' + port}),
            ('https://www.baidu.com', {'https': 'https://' + ip + ':' + port}),
        ]
        for url, proxies in candidates:
            try:
                res = requests.get(url=url, proxies=proxies,
                                   headers=user_agent, timeout=0.1)
                if res.status_code == 200:
                    Proxies.proxies.append(proxies)  # valid: store on the class
                    return True
            except requests.RequestException:
                continue
        return False
if __name__ == '__main__':
    print(Proxies.get_one_proxy())   # a single validated proxy dict
    print(Proxies.get_proxies(10))   # a list of ten validated proxy dicts
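
Each returned proxy dict plugs straight into the `proxies` parameter of requests. A minimal usage sketch (httpbin.org is just an example echo target, not part of the original):

import requests

proxy = Proxies.get_one_proxy()  # e.g. {'http': 'http://1.2.3.4:8080'}
# Route a request through the scraped proxy and echo the visible origin IP.
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
print(resp.text)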