Python 爬取公开 IP 代理

import queue
import time
from threading import Thread
from lxml import etree
import re
import requests
# Keep the console free of the security warnings caused by verify=False
requests.packages.urllib3.disable_warnings()


def req_get_html(url, headers=None, retry_times=3, timeout=10):
    """
    Fetch a page with a GET request, retrying when the request fails.

    :param url: address to request
    :param headers: optional request headers handed to requests.get
    :param retry_times: maximum number of attempts
    :param timeout: seconds to wait on each attempt before it counts as failed
    :return: the response object, or "" when every attempt failed
    """
    response = None
    for _ in range(retry_times):
        try:
            # Certificate checks are skipped on purpose (verify=False); the
            # timeout keeps one stalled host from blocking the whole run.
            response = requests.get(
                url, headers=headers, verify=False, timeout=timeout)
            break
        except requests.RequestException as e:
            print(e)

    if response is None:
        # Callers compare the result against "" to detect failure.
        return ""

    # Resolve the encoding outside the retry loop: a header problem must not
    # trigger another download of a page that was already fetched.
    try:
        encode = get_encode(response.headers)
    except (KeyError, TypeError) as e:
        print(e)
        encode = ""
    # Only override the encoding with a real, non-empty charset name;
    # otherwise keep whatever requests worked out by itself.
    if isinstance(encode, str) and encode:
        response.encoding = encode

    return response


def get_encode(headers):
    """
    Extract the page encoding advertised in the Content-Type header.

    :param headers: mapping of response headers; may be empty or lack a
        Content-Type entry
    :return: the charset name as a string (e.g. "utf-8"), or "" when the
        header declares none
    """
    if not headers:
        return ""
    # .get() rather than indexing: a missing header must not raise KeyError.
    content_type = headers.get("Content-Type")
    if not content_type:
        return ""
    # Capture just the charset token; stop at quotes, ';' or whitespace so
    # trailing parameters do not end up inside the encoding name.
    match = re.search(
        r"charset\s*=\s*[\"']?([^\"';\s]+)", content_type, re.IGNORECASE)
    return match.group(1) if match else ""


def analyze_response(res):
    """
    Parse the page markup and collect the rows that carry proxy data.

    :param res: HTML source text of a proxy listing page
    :return: list of rows, each row being the list of text nodes found in its
        td/div children; rows without text and rows whose first text is "ip"
        are left out
    """
    # One selector per page layout; joined with "|" so a single query
    # returns the row nodes of whichever layout is present.
    row_selectors = (
        "//div[@id='list']//tr",
        "//div[@class='fly-panel']//tr",
        "//div[@class='layui-form']//tr",
        "//div[@align='center']//tr",
        "//div[@class='top']//tr",
        "//div[@class='container']//tr",
        "//div[@class='list']/div[@class='tr ip_tr']",
    )
    document = etree.HTML(res)
    rows = document.xpath("|".join(row_selectors))

    collected = []
    for row in rows:
        cells = row.xpath('./td/text()|./div/text()')
        print(cells)
        # Skip rows with no text as well as the header row.
        if not cells or cells[0] == "ip":
            continue
        collected.append(cells)

    return collected


def rm_character(str_wait):
    """
    Strip newline and tab characters out of a scraped value.

    :param str_wait: text to clean
    :return: the text with every newline and tab removed
    """
    return re.compile("\\n|\\t").sub("", str_wait)


def try_response_speed(ip, port, retry_time=3, timeout=3, result_queue=None):
    """
    Measure how quickly a proxy answers and queue it when it works.

    A GET request for a fixed test URL is sent through the proxy, with
    retries on failure. When the final status is 200, a dict with the keys
    "ip", "port", "status" and "speed" is put on the result queue.

    :param ip: proxy host
    :param port: proxy port
    :param retry_time: maximum number of attempts
    :param timeout: timeout in seconds for each attempt
    :param result_queue: queue that receives working proxies; when omitted
        the module-level ``que`` is used
    :return: None
    """
    try_url = "http://www.baidu.com"
    # Built once: the mapping is the same for every attempt.
    # NOTE(review): the "https" entry uses an https:// proxy scheme, which
    # assumes the proxy itself speaks TLS - confirm for the scraped proxies.
    proxy = {
        "http": "http://{}:{}".format(ip, port),
        "https": "https://{}:{}".format(ip, port)
    }
    response_status = 500
    # Fallback value when no attempt succeeds: the full time budget.
    speed = retry_time * timeout

    for _ in range(retry_time):
        try:
            # perf_counter is monotonic, so the elapsed time cannot be
            # distorted by system clock adjustments.
            time_start = time.perf_counter()
            response = requests.get(
                url=try_url, proxies=proxy, timeout=timeout)
            speed = time.perf_counter() - time_start
        except Exception:
            # Best effort: scraped ip/port values may be garbage and dead
            # proxies are expected, so any failure just uses up one attempt.
            continue
        response_status = response.status_code
        break

    if response_status != 200:
        return

    ip_msg = {
        "ip": ip,
        "port": port,
        "status": response_status,
        "speed": speed
    }
    target_queue = que if result_queue is None else result_queue
    target_queue.put(ip_msg)


if __name__ == '__main__':
    # Module-level queue: try_response_speed puts every working proxy here.
    que = queue.Queue()

    # Free proxy listing pages to scrape, keyed by site name.
    ip_free_dic = {
        "快代理_高匿": "https://free.kuaidaili.com/free/inha",
        "快代理_普通": "https://free.kuaidaili.com/free/intr",
        "89免费代理": "https://www.89ip.cn/index_1.html",
        "高可用全球免费代理ip库": "https://ip.jiangxianli.com/",
        "66代理": "http://www.66ip.cn/2.html",
        "站大爷": "https://www.zdaye.com/daxue_ip.html",
        "蜜蜂代理": "https://www.beesproxy.com/free",
    }

    workers = []
    for page_url in ip_free_dic.values():
        res = req_get_html(url=page_url)
        # req_get_html returns "" when the page could not be fetched.
        if res == "":
            continue
        ip_list = analyze_response(res.text)
        cleaned_rows = [[rm_character(j) for j in i] for i in ip_list]
        for msg in cleaned_rows:
            # A usable row needs at least an ip cell and a port cell.
            if len(msg) < 2:
                continue
            test_speed = Thread(
                target=try_response_speed, args=(msg[0], msg[1]))
            test_speed.start()
            workers.append(test_speed)

    # Wait for every speed test to finish; draining the queue before that
    # would miss the results of threads that are still running.
    for worker in workers:
        worker.join()

    while not que.empty():
        print(que.get())
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值