Scrape proxy IPs from the free proxy site 66ip.cn (66代理网)
Tech stack:
- python3
- requests
- xpath (via lxml)
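
The listing imports a local userAgent module that is not shown here. Below is a minimal hypothetical sketch of what that helper might look like: only the interface, User_Agent.get_user_agent('pc') returning a headers dict, is inferred from the calls in the listing, and the User-Agent strings are placeholder examples, not values from the original project.

    # userAgent.py -- hypothetical stand-in for the helper module imported below.
    import random

    class User_Agent:
        PC_AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
            '(KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        ]

        @staticmethod
        def get_user_agent(kind='pc'):
            # The returned dict is passed straight to requests.get(headers=...).
            return {'User-Agent': random.choice(User_Agent.PC_AGENTS)}

The scraper itself: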
import userAgent
import requests
from lxml import etree
class Proxies:
    """
    Scrape free proxies from 66ip.cn.

    Usage: proDicList = Proxies.get_proxies(4)
    """
    url = 'http://www.66ip.cn/'  # nationwide listing
    # url = 'http://www.66ip.cn/areaindex_1/'  # alternative: per-region listing
    proxies = []  # verified proxies, accumulated as a class attribute
    geshu = 0  # target number of proxies ("geshu" means "count")
    @staticmethod
    def get_one_proxy(count=1):
        proxy = Proxies.get_proxies(count)
        return proxy[0]
    @staticmethod
    def get_proxies(count):
        """
        Return a list of `count` verified proxy dicts.
        """
        Proxies.geshu = count
        i = 1  # incremented before use, so scraping starts at page 2
        while True:  # keep fetching pages until enough proxies are verified
            i += 1
            url_new = Proxies.url + str(i) + '.html'
            content = Proxies.get_content(url_new)
            Proxies.get_info(content)  # parse and verify one page of proxies
            if len(Proxies.proxies) >= count:
                return Proxies.proxies[:count]
    @staticmethod
    def get_content(url):
        """
        Fetch the page body for the given URL.
        """
        headers = userAgent.User_Agent.get_user_agent('pc')
        # timeout is an added safeguard so a dead page cannot hang the crawl
        response = requests.get(url=url, headers=headers, timeout=5)
        return response.text
    @staticmethod
    def get_info(content):
        """
        Parse IPs and ports out of the listing table, then verify each pair.
        """
        tree = etree.HTML(content)
        # Column 1 is the IP, column 2 the port; position()>1 skips the header row.
        datas_ips = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
        datas_ports = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')
        for ip, port in zip(datas_ips, datas_ports):
            Proxies.verif_ip(ip, port)  # keep only proxies that pass verification
            if len(Proxies.proxies) >= Proxies.geshu:
                break
    @staticmethod
    def verif_ip(ip, port):
        """
        Check that a proxy actually works by fetching a known page through it.
        """
        headers = userAgent.User_Agent.get_user_agent('pc')
        # Try the proxy over http first, then https. The very short timeout
        # deliberately filters out slow proxies.
        for scheme in ('http', 'https'):
            # The test URL must match the proxy scheme, otherwise requests
            # ignores the proxy entry and the check proves nothing.
            test_url = f'{scheme}://www.baidu.com'
            proxies = {scheme: f'{scheme}://{ip}:{port}'}
            try:
                res = requests.get(url=test_url, proxies=proxies,
                                   headers=headers, timeout=0.1)
                if res.status_code == 200:
                    Proxies.proxies.append(proxies)  # valid: save on the class attribute
                    return True
            except requests.RequestException:
                continue
        return False
if __name__ == '__main__':
    print(Proxies.get_one_proxy())  # a single proxy dict
    print(Proxies.get_proxies(10))  # a list of ten proxy dicts
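
Each entry in Proxies.proxies is already shaped as a requests-style proxy mapping, so it can be passed straight to requests.get. A minimal usage sketch; http://httpbin.org/ip is only an example echo endpoint used here for illustration, not something the scraper itself relies on:

    # Route a request through one scraped proxy. httpbin.org/ip echoes the
    # IP the request arrived from, which should be the proxy's address.
    proxy = Proxies.get_one_proxy()  # e.g. {'http': 'http://1.2.3.4:8080'}
    res = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(res.text)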