python爬取免费代理ip脚本（含代码），搭建自己的代理池

Gugumeng

已于 2024-05-28 16:37:09 修改

阅读量201

点赞数 3

分类专栏：python、代理、爬虫　文章标签：python、tcp/ip、开发语言

于 2024-05-28 16:35:37 首次发布

本文链接：https://blog.csdn.net/qq_43654631/article/details/139270917

版权

python 同时被 3 个专栏收录

1 篇文章 0 订阅

订阅专栏

代理

1 篇文章 0 订阅

订阅专栏

爬虫

1 篇文章 0 订阅

订阅专栏

python爬取代理ip脚本，搭建自己的代理池

建议：爬取后尽量搭建自己的代理池，并每天检测一次代理是否仍然可用；对外开放接口前，也需要先检测代理是否可用。

import queue
import time
from threading import Thread
from lxml import etree
import re
import requests

# Keep the console free of the security warnings caused by verify=False requests
requests.packages.urllib3.disable_warnings()


def req_get_html(url, headers=None, retry_times=3, timeout=10):
    """
    Download a page with GET, retrying when the request itself fails.

    TLS certificate verification is disabled (``verify=False``). When the
    response declares a charset in its Content-Type header, that charset is
    applied to the response so that ``.text`` is decoded with it.

    :param url: address of the page to download
    :param headers: optional request headers passed to ``requests.get``
    :param retry_times: maximum number of download attempts
    :param timeout: seconds to wait on each attempt before giving up
    :return: the response object on success, or the empty string ``""``
             when every attempt failed (callers compare against ``""``)
    """
    response = None
    for _ in range(retry_times):
        try:
            response = requests.get(
                url, headers=headers, verify=False, timeout=timeout)
            break
        except Exception as e:
            # Best effort: report the failure and move on to the next attempt.
            print(e)

    if response is None:
        return ""

    # Charset detection happens outside the retry loop: a missing or odd
    # header is not a reason to download the page again.
    try:
        charset = get_encode(response.headers)
    except KeyError:
        # No Content-Type header at all; keep the default decoding.
        charset = ""
    # Accept either the charset text or a regex match object from get_encode.
    if hasattr(charset, "group"):
        charset = charset.group(1)
    if isinstance(charset, str) and charset.strip():
        response.encoding = charset.strip()

    return response


def get_encode(headers):
    """
    Extract the charset declared in the Content-Type response header.

    :param headers: mapping of response headers (anything offering ``.get``)
    :return: the charset name (for example ``"utf-8"``), or ``""`` when the
             header is missing or declares no charset
    """
    content_type = headers.get("Content-Type")
    if not content_type:
        return ""
    # Stop at whitespace, ';' or a quote so that trailing parameters and
    # surrounding quotes are not mistaken for part of the charset name.
    found = re.search(
        r"charset\s*=\s*[\"']?([^\s;\"']+)", content_type, re.IGNORECASE)
    return found.group(1) if found else ""


def analyze_response(res):
    """
    Parse a proxy-list page and pull out the text cells of every table row.

    :param res: HTML source of the page
    :return: list of rows, each row being the list of text nodes found in the
             row's ``td`` / ``div`` children; rows with no text and rows whose
             first cell is the literal ``"ip"`` are left out
    """
    # One alternative per page layout used by the supported sites.
    row_selector = (
        "//div[@id='list']//tr"
        "|//div[@class='fly-panel']//tr"
        "|//div[@class='layui-form']//tr"
        "|//div[@align='center']//tr"
        "|//div[@class='top']//tr"
        "|//div[@class='container']//tr"
        "|//div[@class='list']/div[@class='tr ip_tr']"
        "|//tbody/tr"
    )
    document = etree.HTML(res)
    rows_of_cells = (
        row.xpath('./td/text()|./div/text()')
        for row in document.xpath(row_selector)
    )
    return [cells for cells in rows_of_cells if cells and cells[0] != "ip"]


def rm_character(str_wait):
    """
    Remove every newline and tab character from a scraped value.

    :param str_wait: text to clean
    :return: the text with all newline and tab characters deleted
    """
    return re.compile("\\n|\\t").sub("", str_wait)


def _probe_through_proxy(target_url, proxies, timeout):
    """Fetch target_url via proxies; return (status_code, seconds) or None on any error."""
    started = time.time()
    try:
        response = requests.get(
            url=target_url, proxies=proxies, timeout=timeout)
    except Exception:
        # Scraped ip/port values may be malformed and the request can fail in
        # many ways; every failure simply means "not usable".
        return None
    return response.status_code, time.time() - started


def try_response_speed(name, url, ip, port, retry_time=3, timeout=3):
    """
    Test a proxy and, when it works, put its record on the global ``que``.

    The proxy is first tried against http://www.baidu.com, up to
    ``retry_time`` times. Only when that request answers with status 200 is
    the proxy additionally tried once against https://www.baidu.com, and the
    resulting record queued.

    :param name: label of the source site; the part before the first ``-``
                 is stored as ``source``
    :param url: page the proxy was scraped from, stored as ``sourceUrl``
    :param ip: proxy host
    :param port: proxy port
    :param retry_time: maximum number of attempts for the HTTP test
    :param timeout: per-request timeout in seconds
    :return: None; the result is delivered through the module-level ``que``
    """
    # The proxy itself is addressed over plain HTTP for both target schemes;
    # an "https://" proxy URL would request a TLS connection to the proxy.
    proxy_url = "http://{}:{}".format(ip, port)
    proxy = {"http": proxy_url, "https": proxy_url}

    http_result = None
    for _ in range(retry_time):
        http_result = _probe_through_proxy(
            "http://www.baidu.com", proxy, timeout)
        if http_result is not None:
            break
    if http_result is None:
        return

    response_status, speed = http_result
    if response_status != 200:
        return

    https_result = _probe_through_proxy(
        "https://www.baidu.com", proxy, timeout)
    https_flag = https_result is not None and https_result[0] == 200

    que.put({
        "source": str(name).split('-')[0],
        "ip": ip,
        "port": port,
        "status": response_status,
        "sourceUrl": url,
        # Duration of the successful HTTP request, in milliseconds.
        "sendTime": int(speed * 1000),
        "http": True,
        "https": https_flag
    })

def post_art_ip(s):
    """
    POST the collected proxy records to a backend as a JSON body.

    The response body is printed on success; on failure the exception is
    printed and the function returns normally.

    :param s: JSON-serialisable payload to send
    :return: None
    """
    headers = {
        "Content-Type": 'application/json;charset=UTF-8'
    }
    try:
        # Placeholder endpoint: replace it with the address of your own backend.
        content = requests.post(
            'https://xxxx', json=s, headers=headers, timeout=10)
        print(content.text)
    except Exception as e:
        # Best effort: report the actual error instead of aborting the run.
        print(e)

if __name__ == '__main__':
    # Result queue shared with the try_response_speed workers, which look it
    # up as the module-level name ``que``.
    que = queue.Queue()

    # Source label -> page listing free proxies. The text before the first
    # "-" in a label is what ends up in each record's "source" field.
    ip_free_dic = {
        "快代理_高匿": "https://www.kuaidaili.com/free/inha/1/",
        "快代理_普通": "https://www.kuaidaili.com/free/intr/1/",
        "89免费代理-1": "https://www.89ip.cn/index_1.html",
        "89免费代理-2": "https://www.89ip.cn/index_2.html",
        "89免费代理-3": "https://www.89ip.cn/index_3.html",
        "89免费代理-4": "https://www.89ip.cn/index_4.html",
        "89免费代理-5": "https://www.89ip.cn/index_5.html",
        "89免费代理-6": "https://www.89ip.cn/index_6.html",
        "89免费代理-7": "https://www.89ip.cn/index_7.html",
        "89免费代理-8": "https://www.89ip.cn/index_8.html",
        "89免费代理-9": "https://www.89ip.cn/index_9.html",
        "89免费代理-10": "https://www.89ip.cn/index_10.html",
        "高可用全球免费代理ip库": "https://ip.jiangxianli.com/",
        "66代理-1": "http://www.66ip.cn/1.html",
        "66代理-2": "http://www.66ip.cn/2.html",
        "66代理-3": "http://www.66ip.cn/3.html",
        "站大爷": "https://www.zdaye.com/daxue_ip.html",
        "站大爷-1": "https://www.zdaye.com/free/1/?https=1&post=%E6%94%AF%E6%8C%81",
        "站大爷-2": "https://www.zdaye.com/free/2/?https=1&post=%E6%94%AF%E6%8C%81",
        "站大爷-3": "https://www.zdaye.com/free/1/?cunhuo=7&px=3",
        "站大爷-4": "https://www.zdaye.com/free/2/?cunhuo=7&px=3",
        "蜜蜂代理-1": "https://www.beesproxy.com/free/page/1",
        "蜜蜂代理-2": "https://www.beesproxy.com/free/page/2",
        "蜜蜂代理-3": "https://www.beesproxy.com/free/page/3",
        "蜜蜂代理-4": "https://www.beesproxy.com/free/page/4",
        "seofangfa": "https://proxy.seofangfa.com/"
    }

    headers = {
        "user-agent": "PostmanRuntime-ApipostRuntime/1.1.0"
    }

    workers = []
    for k, v in ip_free_dic.items():
        page = req_get_html(url=v, headers=headers)
        if page == "":
            # Download failed on every attempt; skip this source.
            continue
        for row in analyze_response(page.text):
            cells = [rm_character(cell) for cell in row]
            if len(cells) < 2:
                # A row without both an ip and a port cell cannot be tested.
                continue
            ip, port = cells[0], cells[1]
            worker = Thread(
                target=try_response_speed, args=(k, v, ip, port))
            worker.start()
            workers.append(worker)

    # Wait for every test to finish; draining the queue any earlier would
    # miss the results that are still being measured.
    for worker in workers:
        worker.join()

    my_list = []
    while not que.empty():
        my_list.append(que.get())

    # To feed your own proxy pool, send the results to your backend here:
    # post_art_ip(my_list)
    print('开始写入。。。')
    print('数量：' + str(len(my_list)))
    with open('output.txt', 'w', encoding='utf-8') as file:
        # Records are written as their str() form, comma-separated in brackets.
        file.write('[' + ','.join(str(item) for item in my_list) + ']')
    print('写入完成。。。')