代理ip的测试原理很简单,就是使用代理ip去访问测试网站,如果访问成功,那就可以判断该ip是有效的ip。
下面是我前后折腾了一周左右的代码:
# check_proxy.py
import random
import time
import sys
import requests
import threading # 线程
import time
import queue
import time
try:
import src.static_info
except:
import static_info
exitFlag = 0  # NOTE(review): appears unused anywhere in this file — likely leftover; verify before removing
num_key = 'num_fad23fafd'  # dict key under which a proxy's failed-check retry count is stored
# checkIp
# checkIp: multi-threaded proxy-ip checker.  Worker threads pull candidate
# proxies off a bounded queue, probe a random test URL through each proxy,
# and append the verdict to per-crawler "valid"/"invalid" output files.
class checkIp:
    def __init__(self, thread_num=1, file_names=None, ip_max=100):
        """Set up the checker.

        thread_num -- number of worker threads started by start_check()
        file_names -- crawler ids to pre-register output-file locks for
        ip_max     -- capacity of the pending-ip queue
        """
        self.ip_max = ip_max
        self.threads = []                    # started worker Thread objects
        self.file_lock = {}                  # file id -> threading.Lock
        self.ip_queue = queue.Queue(ip_max)  # pending ip_info dicts
        self.wait_num = 0                    # number of checks in flight
        self.queue_lock = threading.Lock()   # guards ip_queue and wait_num
        self.exit_flag = False               # set True to stop the workers
        self.thread_num = thread_num
        self.static_info = static_info.static_info()
        # bug fix: the default used to be a shared mutable list ([])
        for n in (file_names if file_names is not None else []):
            self.add_file_lock(n)

    def add_file_lock(self, file_name):
        """Register a write lock for *file_name* if not already present."""
        if file_name not in self.file_lock:
            self.file_lock[file_name] = threading.Lock()

    def start_check(self):
        """Spawn the worker threads that consume the ip queue."""
        for i in range(self.thread_num):
            t = threading.Thread(target=self.process_data,
                                 args=('thead_' + str(i),))
            t.start()
            self.threads.append(t)

    def write_t(self, info, is_valid):
        """Append *info* as one CSV line to its valid/invalid output file.

        info['file'] is [crawler id, valid-file handle, invalid-file handle];
        a per-file lock serializes writes from concurrent workers.
        """
        fields = [str(info['ip']), str(info['port']), str(info['type'])]
        # any extra metadata columns are kept in dict insertion order
        fields.extend(str(v) for k, v in info.items()
                      if k not in ('file', 'ip', 'port', 'type'))
        # join() replaces the old quadratic '+=' build with trailing-strip
        line = ', '.join(fields) + '\n'
        name = info['file'][0]
        if name not in self.file_lock:
            self.add_file_lock(name)
        with self.file_lock[name]:
            out = info['file'][1] if is_valid else info['file'][2]
            out.write(line)
            out.flush()

    def gettimediff(self, start, end):
        """Return end - start formatted as 'HH:MM:SS' (datetime inputs)."""
        seconds = (end - start).seconds
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        return "%02d:%02d:%02d" % (h, m, s)

    def process_data(self, thread_name):
        """Worker loop: pop ip_info dicts off the queue and check them."""
        while not self.exit_flag:
            self.queue_lock.acquire()
            if not self.ip_queue.empty():
                ip_info = self.ip_queue.get()
                self.wait_num += 1       # one more check in flight
                self.queue_lock.release()
                self.checkip(ip_info, thread_name)
                time.sleep(0.2)          # throttle outgoing requests
            else:
                self.queue_lock.release()
                time.sleep(1)            # queue empty -- idle wait
        print('[' + thread_name + ']: exit.')

    def checkip(self, ip_info, thread_name):
        """Probe one proxy; requeue it on failure, up to 5 attempts total."""
        headers = self.static_info.get_random_headers()
        ip = ip_info['ip'] + ":" + ip_info['port']
        # robustness: ensure the retry counter exists even when the caller
        # queued the dict without going through add_check_ip()
        ip_info.setdefault(num_key, 0)
        if 'type' not in ip_info:
            ip_info['type'] = 'http'
        targeturl = self.static_info.get_random_url(ip_info['type'])
        if targeturl is None:            # bug fix: was '== None'
            print('[checkip] unknow type!')
            return False
        proxies = {"http": "http://" + ip,
                   "https": "https://" + ip}
        try:
            status = requests.get(url=targeturl[0], proxies=proxies,
                                  headers=headers, timeout=3).status_code
            is_avail = status == 200
            if is_avail:
                targeturl[1] += 1        # count a success for this test url
        except requests.RequestException:
            # bug fix: was a bare 'except:' that also swallowed
            # KeyboardInterrupt / SystemExit
            is_avail = False
        if not is_avail and ip_info[num_key] < 5:
            ip_info[num_key] += 1
            self.add_check_ip(ip_info)   # give the proxy another chance
        else:
            self.write_t(ip_info, is_avail)
            if is_avail:
                print('[' + thread_name + '-' + ip_info['file'][0] + '] get:',
                      str(ip_info['ip']) + ':' + str(ip_info['port']))
            else:
                print('[' + thread_name + '-' + ip_info['file'][0] +
                      '] abandon[' + str(ip_info[num_key]) + ']: ',
                      str(ip_info['ip']) + ':' + str(ip_info['port']))
        self.queue_lock.acquire()
        self.wait_num -= 1               # this check is done (either path)
        self.queue_lock.release()

    def add_check_ip(self, info_ip):
        """Queue *info_ip* for checking, blocking while the queue is full."""
        self.queue_lock.acquire()
        while self.ip_queue.qsize() == self.ip_queue.maxsize:
            # drop the lock while waiting so workers can drain the queue
            self.queue_lock.release()
            time.sleep(1)
            self.queue_lock.acquire()
        if num_key not in info_ip:
            info_ip[num_key] = 0         # initialize the retry counter
        self.ip_queue.put(info_ip)
        self.queue_lock.release()

    def quit_check(self, method=''):
        """Stop the workers.

        method 'w' -- wait until the queue drains and all checks finish
        method 'r' -- refuse to quit (return False) while work remains
        otherwise  -- signal exit immediately and join the threads
        """
        if method == 'w':
            while not self.ip_queue.empty() or self.wait_num > 0:
                time.sleep(1)
        elif method == 'r' and not self.ip_queue.empty():
            return False
        self.exit_flag = True
        for t in self.threads:
            t.join()
        return True
# ----------------------- test -----------------------
# Smoke test: start 3 workers, feed one proxy, wait, then shut down.
if __name__ == "__main__":
    check = checkIp(3, ['abc'])
    check.start_check()
    # bug fix: the output handles were opened but never closed; the 'with'
    # closes them only after quit_check() has joined the worker threads.
    with open('abc_t.txt', 'a') as file_t, open('abc_c.txt', 'a') as file_c:
        # 'file' entry: [crawler id, handle for valid ips, handle for invalid ips]
        ip_info = {'file': ['abc', file_t, file_c],
                   'ip': '123.13.123.111', 'port': '1097', 'type': 'http'}
        check.add_check_ip(ip_info)
        time.sleep(10)
        check.quit_check()
    print('end')
# static_info.py
import random
# static_info: static test data for the proxy checker -- a pool of browser
# User-Agent strings and test URLs grouped by proxy scheme.
class static_info:
    # Desktop browser User-Agent strings; one is picked at random per request.
    # bug fix: the first two entries were missing a separating comma, so
    # implicit string concatenation fused them into one invalid UA string.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Test targets keyed by proxy scheme; each entry is [url, success_count]
    # (the counter is incremented by the checker on a successful probe).
    urls = {'http': [['http://tggg.37.com/', 1],
                     ['https://app.tanwan.com', 1],
                     ['http://chongqing.bianminwang.com/', 1],
                     ['http://tc.519397.com/flash/wxot/index.html', 1],
                     ['http://bj.wfc188.com/', 1],
                     ['http://jfk2.sanfsw.net/', 1],
                     ['http://qiaomuyi.cn/', 1],
                     ['http://time.yixingjiaoyu.cn', 1],
                     ['http://m.lemonyule.com/', 1],
                     ['http://www.265.com/', 1],
                     ['http://www.0460.com/', 1],
                     ['http://www.sohu.com/', 1],
                     ['http://www.9991.com/', 1],
                     ['http://m.2499kp.com/', 1]],
            'https': [['https://www.baidu.com/', 1],
                      ['https://www.ett.cc/', 1],
                      ['https://www.so.com/', 1],
                      ['https://www.hao123.com/', 1],
                      ['https://www.2345.com/', 1],
                      ['https://www.sogou.com', 1],
                      ['https://www.soso.com', 1],
                      ['https://123.sogou.com/', 1],
                      ['https://www.37.com/', 1],
                      ['https://mos.m.taobao.com', 1],
                      ['https://cq.58.com', 1]
                      ]}

    def __init__(self):
        pass

    def get_random_headers(self):
        """Return a request-headers dict with a randomly chosen User-Agent."""
        UserAgent = random.choice(self.user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers

    def get_random_url(self, htype):
        """Return a random [url, success_count] entry for *htype*
        ('http' or 'https'), or None when the type is unknown."""
        if htype not in self.urls:
            print('get test url unknow type!')
            return None
        return random.choice(self.urls[htype])
代理隔一段时间就会失效,所以对自己的代理ip进行定期的检测是有必要的,下面就是我的检测代码。
# test_proxy.py
import time
import os
import sys
import shutil
try:
from src import tools, check_proxy, static_info
except:
import tools
import check_proxy
import static_info
ip_file = ''  # NOTE(review): shadowed by the local 'ip_file' in check_ip_file and apparently never read -- confirm before removing
def check_ip_file(file_name):
    """Re-validate every proxy listed in *file_name*.

    The file holds comma-separated lines: ip, port, type, [extra columns...].
    An optional first line containing only a number (a count header) is
    skipped.  Valid proxies are written back over *file_name*; failed ones
    go to a '<prefix>_c.txt' sidecar that is left on disk.
    """
    # unique work-file prefix: input base name + timestamp
    ip_file = os.path.splitext(os.path.basename(file_name))[0]
    ip_file += str(time.time()).replace('.', '_')
    check = check_proxy.checkIp(3, [ip_file])
    check.start_check()
    # bug fix: the output handles were closed manually, leaking on any
    # exception; 'with' closes them after quit_check() joins the workers
    with open(ip_file + '_t.txt', 'w') as ft, \
            open(ip_file + '_c.txt', 'w') as fc:
        with open(file_name, 'r') as fr:
            lines = fr.readlines()
        # bug fix: guard against an empty input file (lines[0] IndexError)
        start = 1 if lines and lines[0].strip().isnumeric() else 0
        key_cols = (0, 1, 2)  # ip, port, type column indices
        for raw in lines[start:]:
            data = raw.strip().split(',')
            info = {'file': [ip_file, ft, fc],
                    'ip': data[key_cols[0]].strip(),
                    'port': data[key_cols[1]].strip(),
                    'type': data[key_cols[2]].strip()}
            # preserve any extra columns under positional keys
            for i, val in enumerate(data):
                if i not in key_cols:
                    info['data' + str(i)] = val
            check.add_check_ip(info)
        check.quit_check('w')  # block until every queued ip is decided
    shutil.move(ip_file + '_t.txt', file_name)
# Script entry point: re-check the proxy file named on the command line,
# falling back to the default 'abc_t.txt' when no argument is given.
if __name__ == '__main__':
    file_name = sys.argv[1] if len(sys.argv) >= 2 else 'abc_t.txt'
    check_ip_file(file_name)
代码没有仔细整理,但功能测试是正确的,先做个笔记,大家也可以参考一下。完整代码可以在这里下载!