代理ip的测试原理很简单,就是使用代理ip去访问测试网站,如果访问成功,那就可以判断该ip是有效的ip。
下面是我前后折腾了一周左右的代码:
# check_proxy.py
import random
import time
import sys
import requests
import threading # 线程
import time
import queue
import time
try:
import src.static_info
except:
import static_info
exitFlag = 0  # NOTE(review): appears unused anywhere in this file — likely leftover; verify before removing
num_key = 'num_fad23fafd'  # dict key under which a proxy's failed-check retry count is stored
# checkIp
# checkIp: multi-threaded proxy-ip checker.  Worker threads pull candidate
# proxies off a bounded queue, probe a random test URL through each proxy,
# and append the verdict to per-crawler "valid"/"invalid" output files.
class checkIp:
    def __init__(self, thread_num=1, file_names=None, ip_max=100):
        """Set up the checker.

        thread_num -- number of worker threads started by start_check()
        file_names -- crawler ids to pre-register output-file locks for
        ip_max     -- capacity of the pending-ip queue
        """
        self.ip_max = ip_max
        self.threads = []                    # started worker Thread objects
        self.file_lock = {}                  # file id -> threading.Lock
        self.ip_queue = queue.Queue(ip_max)  # pending ip_info dicts
        self.wait_num = 0                    # number of checks in flight
        self.queue_lock = threading.Lock()   # guards ip_queue and wait_num
        self.exit_flag = False               # set True to stop the workers
        self.thread_num = thread_num
        self.static_info = static_info.static_info()
        # bug fix: the default used to be a shared mutable list ([])
        for n in (file_names if file_names is not None else []):
            self.add_file_lock(n)

    def add_file_lock(self, file_name):
        """Register a write lock for *file_name* if not already present."""
        if file_name not in self.file_lock:
            self.file_lock[file_name] = threading.Lock()

    def start_check(self):
        """Spawn the worker threads that consume the ip queue."""
        for i in range(self.thread_num):
            t = threading.Thread(target=self.process_data,
                                 args=('thead_' + str(i),))
            t.start()
            self.threads.append(t)

    def write_t(self, info, is_valid):
        """Append *info* as one CSV line to its valid/invalid output file.

        info['file'] is [crawler id, valid-file handle, invalid-file handle];
        a per-file lock serializes writes from concurrent workers.
        """
        fields = [str(info['ip']), str(info['port']), str(info['type'])]
        # any extra metadata columns are kept in dict insertion order
        fields.extend(str(v) for k, v in info.items()
                      if k not in ('file', 'ip', 'port', 'type'))
        # join() replaces the old quadratic '+=' build with trailing-strip
        line = ', '.join(fields) + '\n'
        name = info['file'][0]
        if name not in self.file_lock:
            self.add_file_lock(name)
        with self.file_lock[name]:
            out = info['file'][1] if is_valid else info['file'][2]
            out.write(line)
            out.flush()

    def gettimediff(self, start, end):
        """Return end - start formatted as 'HH:MM:SS' (datetime inputs)."""
        seconds = (end - start).seconds
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        return "%02d:%02d:%02d" % (h, m, s)

    def process_data(self, thread_name):
        """Worker loop: pop ip_info dicts off the queue and check them."""
        while not self.exit_flag:
            self.queue_lock.acquire()
            if not self.ip_queue.empty():
                ip_info = self.ip_queue.get()
                self.wait_num += 1       # one more check in flight
                self.queue_lock.release()
                self.checkip(ip_info, thread_name)
                time.sleep(0.2)          # throttle outgoing requests
            else:
                self.queue_lock.release()
                time.sleep(1)            # queue empty -- idle wait
        print('[' + thread_name + ']: exit.')

    def checkip(self, ip_info, thread_name):
        """Probe one proxy; requeue it on failure, up to 5 attempts total."""
        headers = self.static_info.get_random_headers()
        ip = ip_info['ip'] + ":" + ip_info['port']
        # robustness: ensure the retry counter exists even when the caller
        # queued the dict without going through add_check_ip()
        ip_info.setdefault(num_key, 0)
        if 'type' not in ip_info:
            ip_info['type'] = 'http'
        targeturl = self.static_info.get_random_url(ip_info['type'])
        if targeturl is None:            # bug fix: was '== None'
            print('[checkip] unknow type!')
            return False
        proxies = {"http": "http://" + ip,
                   "https": "https://" + ip}
        try:
            status = requests.get(url=targeturl[0], proxies=proxies,
                                  headers=headers, timeout=3).status_code
            is_avail = status == 200
            if is_avail:
                targeturl[1] += 1        # count a success for this test url
        except requests.RequestException:
            # bug fix: was a bare 'except:' that also swallowed
            # KeyboardInterrupt / SystemExit
            is_avail = False
        if not is_avail and ip_info[num_key] < 5:
            ip_info[num_key] += 1
            self.add_check_ip(ip_info)   # give the proxy another chance
        else:
            self.write_t(ip_info, is_avail)
            if is_avail:
                print('[' + thread_name + '-' + ip_info['file'][0] + '] get:',
                      str(ip_info['ip']) + ':' + str(ip_info['port']))
            else:
                print('[' + thread_name + '-' + ip_info['file'][0] +
                      '] abandon[' + str(ip_info[num_key]) + ']: ',
                      str(ip_info['ip']) + ':' + str(ip_info['port']))
        self.queue_lock.acquire()
        self.wait_num -= 1               # this check is done (either path)
        self.queue_lock.release()

    def add_check_ip(self, info_ip):
        """Queue *info_ip* for checking, blocking while the queue is full."""
        self.queue_lock.acquire()
        while self.ip_queue.qsize() == self.ip_queue.maxsize:
            # drop the lock while waiting so workers can drain the queue
            self.queue_lock.release()
            time.sleep(1)
            self.queue_lock.acquire()
        if num_key not in info_ip:
            info_ip[num_key] = 0         # initialize the retry counter
        self.ip_queue.put(info_ip)
        self.queue_lock.release()

    def quit_check(self, method=''):
        """Stop the workers.

        method 'w' -- wait until the queue drains and all checks finish
        method 'r' -- refuse to quit (return False) while work remains
        otherwise  -- signal exit immediately and join the threads
        """
        if method == 'w':
            while not self.ip_queue.empty() or self.wait_num > 0:
                time.sleep(1)
        elif method == 'r' and not self.ip_queue.empty():
            return False
        self.exit_flag = True
        for t in self.threads:
            t.join()
        return True
# ----------------------- test -----------------------
# Smoke test: start 3 workers, feed one proxy, wait, then shut down.
if __name__ == "__main__":
    check = checkIp(3, ['abc'])
    check.start_check()
    # bug fix: the output handles were opened but never closed; the 'with'
    # closes them only after quit_check() has joined the worker threads.
    with open('abc_t.txt', 'a') as file_t, open('abc_c.txt', 'a') as file_c:
        # 'file' entry: [crawler id, handle for valid ips, handle for invalid ips]
        ip_info = {'file': ['abc', file_t, file_c],
                   'ip': '123.13.123.111', 'port': '1097', 'type': 'http'}
        check.add_check_ip(ip_info)
        time.sleep(10)
        check.quit_check()
    print('end')
# static_info.py
import random
# static_info: static test data for the proxy checker -- a pool of browser
# User-Agent strings and test URLs grouped by proxy scheme.
class static_info:
    # Desktop browser User-Agent strings; one is picked at random per request.
    # bug fix: the first two entries were missing a separating comma, so
    # implicit string concatenation fused them into one invalid UA string.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Test targets keyed by proxy scheme; each entry is [url, success_count]
    # (the counter is incremented by the checker on a successful probe).
    urls = {'http': [['http://tggg.37.com/', 1],
                     ['https://app.tanwan.com', 1],
                     ['http://chongqing.bianminwang.com/', 1],
                     ['http://tc.519397.com/flash/wxot/index.html', 1],
                     ['http://bj.wfc188.com/', 1],
                     ['http://jfk2.sanfsw.net/', 1],
                     ['http://qiaomuyi.cn/', 1],
                     ['http://time.yixingjiaoyu.cn', 1],
                     ['http://m.lemonyule.com/', 1],
                     ['http://www.265.com/', 1],
                     ['http://www.0460.com/', 1],
                     ['http://www.sohu.com/', 1],
                     ['http://www.9991.com/', 1],
                     ['http://m.2499kp.com/', 1]],
            'https': [['https://www.baidu.com/', 1],
                      ['https://www.ett.cc/', 1],
                      ['https://www.so.com/', 1],
                      ['https://www.hao123.com/', 1],
                      ['https://www.2345.com/', 1],
                      ['https://www.sogou.com', 1],
                      ['https://www.soso.com', 1],
                      ['https://123.sogou.com/', 1],
                      ['https://www.37.com/', 1],
                      ['https://mos.m.taobao.com', 1],
                      ['https://cq.58.com', 1]
                      ]}

    def __init__(self):
        pass

    def get_random_headers(self):
        """Return a request-headers dict with a randomly chosen User-Agent."""
        UserAgent = random.choice(self.user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers

    def get_random_url(self, htype):
        """Return a random [url, success_count] entry for *htype*
        ('http' or 'https'), or None when the type is unknown."""
        if htype not in self.urls:
            print('get test url unknow type!')
            return None
        return random.choice(self.urls[htype])
代理隔一段时间就会失效,所以对自己的代理ip进行定期的检测是有必要的,下面就是我的检测代码。
# test_proxy.py
import time
import os
import sys
import shutil
try:
from src import tools, check_proxy, static_info
except:
import tools
import check_proxy
import static_info
ip_file = ''  # NOTE(review): shadowed by the local 'ip_file' in check_ip_file and apparently never read -- confirm before removing
def check_ip_file(file_name):
    """Re-validate every proxy listed in *file_name*.

    The file holds comma-separated lines: ip, port, type, [extra columns...].
    An optional first line containing only a number (a count header) is
    skipped.  Valid proxies are written back over *file_name*; failed ones
    go to a '<prefix>_c.txt' sidecar that is left on disk.
    """
    # unique work-file prefix: input base name + timestamp
    ip_file = os.path.splitext(os.path.basename(file_name))[0]
    ip_file += str(time.time()).replace('.', '_')
    check = check_proxy.checkIp(3, [ip_file])
    check.start_check()
    # bug fix: the output handles were closed manually, leaking on any
    # exception; 'with' closes them after quit_check() joins the workers
    with open(ip_file + '_t.txt', 'w') as ft, \
            open(ip_file + '_c.txt', 'w') as fc:
        with open(file_name, 'r') as fr:
            lines = fr.readlines()
        # bug fix: guard against an empty input file (lines[0] IndexError)
        start = 1 if lines and lines[0].strip().isnumeric() else 0
        key_cols = (0, 1, 2)  # ip, port, type column indices
        for raw in lines[start:]:
            data = raw.strip().split(',')
            info = {'file': [ip_file, ft, fc],
                    'ip': data[key_cols[0]].strip(),
                    'port': data[key_cols[1]].strip(),
                    'type': data[key_cols[2]].strip()}
            # preserve any extra columns under positional keys
            for i, val in enumerate(data):
                if i not in key_cols:
                    info['data' + str(i)] = val
            check.add_check_ip(info)
        check.quit_check('w')  # block until every queued ip is decided
    shutil.move(ip_file + '_t.txt', file_name)
# Script entry point: re-check the proxy file named on the command line,
# falling back to the default 'abc_t.txt' when no argument is given.
if __name__ == '__main__':
    file_name = sys.argv[1] if len(sys.argv) >= 2 else 'abc_t.txt'
    check_ip_file(file_name)
代码没有仔细整理,但功能测试是正确的,先做个笔记,大家也可以参考一下。完整代码可以在这里下载!