Python获取代理池和提取可用IP

前言

最近在学习编写Python爬虫,发现很多网站设置了IP限制,请求过于频繁会被封禁IP,于是想到了使用代理池技术。

正文

请求代理池

这里我选用了一个国外的免费代理池,由于网页已经帮我们整理好了格式,所以不需要利用re模块去查找ip和端口了。
我采用requests库请求,并把未筛选的代理ip存为一个txt文档:

	# Download the raw (unfiltered) proxy list and store it in 1.txt.
	url = ""  # URL of the proxy-list page or API endpoint
	headers = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55"
	}
	# headers must be passed by keyword: the second positional argument of
	# requests.get is ``params``, which would put the User-Agent into the query
	# string instead of the request headers.
	resp = requests.get(url, headers=headers, timeout=10)
	# Fail loudly instead of saving an HTTP error page as if it were a proxy list.
	resp.raise_for_status()
	with open("1.txt", "w") as f1:  # save the unfiltered proxy list
	    f1.write(resp.text)

筛选代理ip

先编写筛选单个ip的函数,再使用多线程筛选多个ip:

def check_ip(url, line, timeout=5):
    """Check whether a single proxy can fetch *url* and record it if so.

    Args:
        url: Address requested through the proxy to probe it.
        line: Proxy entry in ``ip:port`` form; surrounding whitespace is
            ignored.
        timeout: Seconds to wait for the proxy before treating it as dead.

    Returns:
        None. A proxy that answers with HTTP 200 is appended to the
        module-level ``proxy_list`` as ``"ip:port"``.
    """
    entry = line.strip()
    proxy = entry.split(":")
    if len(proxy) < 2 or not proxy[0] or not proxy[1]:
        # Blank or malformed lines used to raise IndexError on proxy[1].
        print("[-]当前代理:" + entry + "不可用!")
        return
    ip = proxy[0]
    port = proxy[1]
    proxy_url = "http://" + ip + ":" + port
    # requests selects a proxy by the scheme of the target URL, so both schemes
    # are mapped; otherwise an https target would bypass the proxy entirely.
    check_proxy = {
        "http": proxy_url,
        "https": proxy_url,
    }
    try:
        # The mapping must be passed as ``proxies=``; given positionally it is
        # treated as query-string params and the proxy is never used.
        proxy_resp = requests.get(url, proxies=check_proxy, timeout=timeout)
    except Exception as e:  # worker boundary: any failure means "unusable"
        print(e)
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
        return
    if proxy_resp.status_code == 200:
        print("[+]当前代理:" + ip + ":" + port + "可用!")
        proxy_list.append(ip + ":" + port)
    else:
        print("[-]当前代理:" + ip + ":" + port + "不可用!")

    # Check all candidate proxies concurrently with a bounded worker pool.
    from concurrent.futures import ThreadPoolExecutor

    with open("1.txt", "r") as f2:
        candidates = [raw.strip() for raw in f2]
    # Skip blank lines and comment lines before handing entries to check_ip.
    candidates = [c for c in candidates if c and not c.startswith("#")]

    # Starting a thread and joining it immediately runs the checks one by one;
    # the pool overlaps the network waits while capping the number of sockets.
    with ThreadPoolExecutor(max_workers=32) as pool:
        futures = [pool.submit(check_ip, url, c) for c in candidates]
    for future in futures:
        error = future.exception()
        if error is not None:
            # Surface unexpected worker failures instead of dropping them.
            print(error)

保存可用的代理ip

# Persist every proxy that passed the check, one "ip:port" entry per line.
with open("2.txt", "w") as f3:
    f3.write("".join(entry + '\n' for entry in proxy_list))

把代理ip转发到本地(可选)

这里大家可写可不写,我纯粹是为了巩固所学知识(doge)。每次运行这段代码都会报“数组下标越界”(即Python的 IndexError: list index out of range)的错误,还请各位大佬多多指导!

def _connect_to_proxy(prip, prpo, max_retries):
    """Return a socket connected to (prip, prpo), or None after max_retries failures."""
    for _ in range(max_retries):
        print("[!]Now proxy ip:" + prip + ":" + str(prpo))
        target_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        target_socket.settimeout(3)
        try:
            target_socket.connect((prip, prpo))
        except socket.error:
            # Close the failed socket so retries do not leak descriptors.
            target_socket.close()
            print("[-]RE_Connect...")
            continue
        return target_socket
    return None


def _forward_one(client, prip, prpo, max_retries):
    """Relay one request/response exchange; return an error string on fatal failure, else None."""
    try:
        data = client.recv(1024)
    except socket.error as e:
        print("[-]Local receiving client : " + str(e))
        return "[-]Local receiving client : " + str(e)
    if not data:
        # The client connected and sent nothing; just wait for the next one.
        return None
    print('[*' + time.asctime() + ']: Accept data...')

    target_socket = _connect_to_proxy(prip, prpo, max_retries)
    if target_socket is None:
        message = "[-]Connect to the proxy server : gave up after " + str(max_retries) + " attempts"
        print(message)
        return message

    try:
        try:
            # sendall keeps writing until the whole payload is sent.
            target_socket.sendall(data)
        except socket.error as e:
            print("[-]Sent to the proxy server : " + str(e))
            return "[-]Sent to the proxy server : " + str(e)

        timeouts = 0
        while True:
            try:
                data_1 = target_socket.recv(1024)
            except socket.timeout as e:
                print(prip + ":" + str(prpo))
                print("[-]Back to the client : " + str(e))
                timeouts += 1
                if timeouts >= max_retries:
                    # A silent proxy must not block the listener forever.
                    break
                continue
            except socket.error as e:
                print("[-]Back to the client : " + str(e))
                break
            if not data_1:
                break
            timeouts = 0
            print('[*' + time.asctime() + ']: Send data...')
            try:
                client.sendall(data_1)
            except socket.error as e:
                print("[-]Back to the client : " + str(e))
                break
        return None
    finally:
        target_socket.close()


def portforward(prip, prpo, max_retries=3):
    """Expose one upstream proxy on the local port 127.0.0.1:5320.

    For every accepted local connection the first chunk (up to 1024 bytes)
    is read, sent to the proxy at ``prip:prpo``, and the proxy's reply is
    streamed back to the client. Both sockets are closed after each exchange.

    Args:
        prip: Host name or IP address of the upstream proxy.
        prpo: Port of the upstream proxy; an int or a numeric string.
        max_retries: Connection attempts to the proxy, and consecutive read
            timeouts tolerated, before giving up.

    Returns:
        An error message string when the forwarder has to stop (invalid
        port, local bind/accept failure, proxy unreachable, send failure).
        Otherwise the function keeps serving and does not return.
    """
    try:
        # Callers that parse "ip:port" text pass the port as a string, which
        # socket.connect rejects; normalise and validate it once up front.
        prpo = int(prpo)
        if not 0 < prpo < 65536:
            raise ValueError("port out of range: " + str(prpo))
    except (TypeError, ValueError) as e:
        print("[-]Invalid proxy port : " + str(e))
        return "[-]Invalid proxy port : " + str(e)

    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Allow quick re-binding when the forwarder is restarted for another proxy.
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.bind(('127.0.0.1', 5320))  # forward the proxy to local port 5320
        server.listen(10)
    except socket.error as e:
        server.close()
        print("[-]The local service : " + str(e))
        return "[-]The local service : " + str(e)

    try:
        while True:
            try:
                # Wait for the next local client.
                client, addr = server.accept()
            except socket.error as e:
                print("[-]Local receiving client : " + str(e))
                return "[-]Local receiving client : " + str(e)
            print('[*]accept %s connect' % (addr,))
            try:
                error = _forward_one(client, prip, prpo, max_retries)
            finally:
                client.close()
            if error is not None:
                return error
    finally:
        server.close()
    
    # Serve the working proxies one after another on the local port.
    with open("2.txt", "r") as f4:
        for pl2 in f4:
            pl2 = pl2.strip()
            # Blank lines have no ":" and used to raise IndexError on pl2[1].
            if not pl2 or pl2.startswith('#'):
                continue
            print(pl2)
            fields = pl2.split(":")
            if len(fields) < 2 or not fields[1].isdigit():
                print("[-]Skip malformed proxy entry: " + pl2)
                continue
            proxy_ip = fields[0]
            # socket.connect needs an integer port, not the text read from the file.
            proxy_port = int(fields[1])
            # Every forwarder binds the same local port, so they run strictly
            # one at a time: the next proxy is tried only after this one returns.
            t2 = threading.Thread(target=portforward, args=(proxy_ip, proxy_port))
            t2.start()
            t2.join()

最后贴出完整代码:

#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 HackerTerry, Inc. All Rights Reserved 
#
# @Time    : 2022/2/18 15:04
# @Author  : Terry Zhang
# @Email   : goudan1974@163.com
# @Blog    : https://www.terry906.top
# @File    : 检测代理ip.py
# @Software: PyCharm

import socket
import requests
import threading
import time

# Proxies that passed check_ip, stored as "ip:port" strings.
proxy_list = []
# Human-readable local timestamp, captured once when the module is loaded and
# reused in the forwarder's log messages.
localtime = time.asctime()
def check_ip(url, line, timeout=5):
    """Check whether a single proxy can fetch *url* and record it if so.

    Args:
        url: Address requested through the proxy to probe it.
        line: Proxy entry in ``ip:port`` form; surrounding whitespace is
            ignored.
        timeout: Seconds to wait for the proxy before treating it as dead.

    Returns:
        None. A proxy that answers with HTTP 200 is appended to the
        module-level ``proxy_list`` as ``"ip:port"``.
    """
    entry = line.strip()
    proxy = entry.split(":")
    if len(proxy) < 2 or not proxy[0] or not proxy[1]:
        # Blank or malformed lines used to raise IndexError on proxy[1].
        print("[-]当前代理:" + entry + "不可用!")
        return
    ip = proxy[0]
    port = proxy[1]
    proxy_url = "http://" + ip + ":" + port
    # requests selects a proxy by the scheme of the target URL, so both schemes
    # are mapped; otherwise an https target would bypass the proxy entirely.
    check_proxy = {
        "http": proxy_url,
        "https": proxy_url,
    }
    try:
        # The mapping must be passed as ``proxies=``; given positionally it is
        # treated as query-string params and the proxy is never used.
        proxy_resp = requests.get(url, proxies=check_proxy, timeout=timeout)
    except Exception as e:  # worker boundary: any failure means "unusable"
        print(e)
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
        return
    if proxy_resp.status_code == 200:
        print("[+]当前代理:" + ip + ":" + port + "可用!")
        proxy_list.append(ip + ":" + port)
    else:
        print("[-]当前代理:" + ip + ":" + port + "不可用!")


def _connect_to_proxy(prip, prpo, max_retries):
    """Return a socket connected to (prip, prpo), or None after max_retries failures."""
    for _ in range(max_retries):
        print("[!]Now proxy ip:" + prip + ":" + str(prpo))
        target_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        target_socket.settimeout(3)
        try:
            target_socket.connect((prip, prpo))
        except socket.error:
            # Close the failed socket so retries do not leak descriptors.
            target_socket.close()
            print("[-]RE_Connect...")
            continue
        return target_socket
    return None


def _forward_one(client, prip, prpo, max_retries):
    """Relay one request/response exchange; return an error string on fatal failure, else None."""
    try:
        data = client.recv(1024)
    except socket.error as e:
        print("[-]Local receiving client : " + str(e))
        return "[-]Local receiving client : " + str(e)
    if not data:
        # The client connected and sent nothing; just wait for the next one.
        return None
    print('[*' + time.asctime() + ']: Accept data...')

    target_socket = _connect_to_proxy(prip, prpo, max_retries)
    if target_socket is None:
        message = "[-]Connect to the proxy server : gave up after " + str(max_retries) + " attempts"
        print(message)
        return message

    try:
        try:
            # sendall keeps writing until the whole payload is sent.
            target_socket.sendall(data)
        except socket.error as e:
            print("[-]Sent to the proxy server : " + str(e))
            return "[-]Sent to the proxy server : " + str(e)

        timeouts = 0
        while True:
            try:
                data_1 = target_socket.recv(1024)
            except socket.timeout as e:
                print(prip + ":" + str(prpo))
                print("[-]Back to the client : " + str(e))
                timeouts += 1
                if timeouts >= max_retries:
                    # A silent proxy must not block the listener forever.
                    break
                continue
            except socket.error as e:
                print("[-]Back to the client : " + str(e))
                break
            if not data_1:
                break
            timeouts = 0
            print('[*' + time.asctime() + ']: Send data...')
            try:
                client.sendall(data_1)
            except socket.error as e:
                print("[-]Back to the client : " + str(e))
                break
        return None
    finally:
        target_socket.close()


def portforward(prip, prpo, max_retries=3):
    """Expose one upstream proxy on the local port 127.0.0.1:5320.

    For every accepted local connection the first chunk (up to 1024 bytes)
    is read, sent to the proxy at ``prip:prpo``, and the proxy's reply is
    streamed back to the client. Both sockets are closed after each exchange.

    Args:
        prip: Host name or IP address of the upstream proxy.
        prpo: Port of the upstream proxy; an int or a numeric string.
        max_retries: Connection attempts to the proxy, and consecutive read
            timeouts tolerated, before giving up.

    Returns:
        An error message string when the forwarder has to stop (invalid
        port, local bind/accept failure, proxy unreachable, send failure).
        Otherwise the function keeps serving and does not return.
    """
    try:
        # Callers that parse "ip:port" text pass the port as a string, which
        # socket.connect rejects; normalise and validate it once up front.
        prpo = int(prpo)
        if not 0 < prpo < 65536:
            raise ValueError("port out of range: " + str(prpo))
    except (TypeError, ValueError) as e:
        print("[-]Invalid proxy port : " + str(e))
        return "[-]Invalid proxy port : " + str(e)

    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Allow quick re-binding when the forwarder is restarted for another proxy.
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.bind(('127.0.0.1', 5320))
        server.listen(10)
    except socket.error as e:
        server.close()
        print("[-]The local service : " + str(e))
        return "[-]The local service : " + str(e)

    try:
        while True:
            try:
                # Wait for the next local client.
                client, addr = server.accept()
            except socket.error as e:
                print("[-]Local receiving client : " + str(e))
                return "[-]Local receiving client : " + str(e)
            print('[*]accept %s connect' % (addr,))
            try:
                error = _forward_one(client, prip, prpo, max_retries)
            finally:
                client.close()
            if error is not None:
                return error
    finally:
        server.close()

if __name__ == '__main__':
    # Script entry point: download the raw proxy list, keep the proxies that
    # respond, save them, then forward them to the local port one at a time.
    from concurrent.futures import ThreadPoolExecutor

    url = ""  # URL of the proxy-list page or API endpoint
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55"
    }
    # headers must be passed by keyword; the second positional argument of
    # requests.get is ``params``.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of saving an HTTP error page as a proxy list.
    resp.raise_for_status()
    # Write to the same file that is read back below (the list used to be
    # written to "1" but read from "1.txt").
    with open("1.txt", "w", encoding="utf-8") as f1:  # save the unfiltered proxy list
        f1.write(resp.text)

    with open("1.txt", "r", encoding="utf-8") as f2:
        candidates = [raw.strip() for raw in f2]
    # Skip blank lines and comment lines before handing entries to check_ip.
    candidates = [c for c in candidates if c and not c.startswith("#")]

    # Starting a thread and joining it immediately runs the checks one by one;
    # the pool overlaps the network waits while capping the number of sockets.
    with ThreadPoolExecutor(max_workers=32) as pool:
        futures = [pool.submit(check_ip, url, c) for c in candidates]
    for future in futures:
        error = future.exception()
        if error is not None:
            # Surface unexpected worker failures instead of dropping them.
            print(error)

    with open("2.txt", "w") as f3:  # write the usable proxies to a new file
        f3.writelines(pl1 + '\n' for pl1 in proxy_list)

    with open("2.txt", "r") as f4:
        for pl2 in f4:
            pl2 = pl2.strip()
            # Blank lines have no ":" and used to raise IndexError on pl2[1].
            if not pl2 or pl2.startswith('#'):
                continue
            print(pl2)
            fields = pl2.split(":")
            if len(fields) < 2 or not fields[1].isdigit():
                print("[-]Skip malformed proxy entry: " + pl2)
                continue
            proxy_ip = fields[0]
            # socket.connect needs an integer port, not the text read from the file.
            proxy_port = int(fields[1])
            # Every forwarder binds the same local port, so they run strictly
            # one at a time: the next proxy is tried only after this one returns.
            t2 = threading.Thread(target=portforward, args=(proxy_ip, proxy_port))
            t2.start()
            t2.join()

总结

编写这个脚本,是对我学习知识的一个小总结。然而,这还不是终点。随着学习的深入,以后我会尝试使用面向对象的编程知识,写出更厉害的python工具和脚本!

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值