前提:代理池已经将 IP 存入 Redis 数据库中。问题:如何提高所使用的代理池中 IP 的质量?
思路:通过一个访问脚本,从代理池中筛选出能够高质量访问目标网站的 IP 代理:
# -*- coding: utf-8 -*-
# Proxy-validation script (p1): picks proxies out of a Redis sorted set,
# probes them against a target URL, and prunes the dead ones.
import threading,time,random
import requests
import redis
import re
from fake_useragent import UserAgent
# NOTE(review): this rebinds the name `redis` from the imported module to a
# client instance, shadowing the import — later code relies on the instance.
# Connection details are placeholders; `port` should normally be an int
# (e.g. 6379) — an empty-string port will fail at connect time. TODO confirm.
redis = redis.Redis(host='', port='', password='',db=0)
# Earlier variant of the probe URL, kept for reference.
#https://www.huishoubao.com/evInfo/20059519191?selecteds=%5B%22133%22,%22879%22,%2212%22,%2277%22,%2273%22,%2271%22,%2263%22,%229%22%5D
# Target page on huishoubao.com used to probe whether a proxy works.
TEST_URL = 'https://www.huishoubao.com/evInfo/20059519191?selecteds=%5B%22174%22%2C%2212%22%2C%2242%22%2C%2237%22%2C%2217%22%2C%2283%22%2C%2277%22%2C%2273%22%2C%2271%22%2C%2263%22%5D'
def test():
    """Return the first working proxy found in the Redis sorted set 'proxies'.

    Repeatedly (up to 1000 attempts) picks a random proxy from the set and
    probes TEST_URL through it with a 3-second timeout. A proxy that answers
    HTTP 200 is returned as a ``"host:port"`` string; any proxy that fails
    (non-200 or connection error) is removed from the set. Returns ``None``
    implicitly if no working proxy is found.
    """
    # Hoisted out of the loop: building the fake_useragent pool is expensive
    # (it may fetch UA data) and one instance can serve every iteration.
    ua_pool = UserAgent()
    for _ in range(1000):
        # zrevrange returns the members as a list of bytes; decode them
        # directly instead of regex-parsing the repr() of the whole list.
        # (Assumes the client was created without decode_responses — as the
        # module-level connection above is. TODO confirm.)
        candidates = [member.decode('utf-8', errors='replace')
                      for member in redis.zrevrange('proxies', 0, -1)]
        print('当前排列可用IP数目:', len(candidates))
        if not candidates:
            continue
        proxy = random.choice(candidates).replace(' ', '')
        # Distinct name for the requests proxy mapping — the original reused
        # the list's name, which obscured the two roles.
        proxy_map = {"http": "http://%(proxy)s/" % {'proxy': proxy},
                     "https": "http://%(proxy)s/" % {'proxy': proxy}}
        headers = {'User-Agent': ua_pool.random}
        try:
            response = requests.get(url=TEST_URL, headers=headers,
                                    proxies=proxy_map, timeout=3)
            if response.status_code == 200:
                print('-------------------------可用有效ip:', proxy)
                return proxy
            # Non-200 answer: evict the proxy from the pool.
            redis.zrem('proxies', proxy)
        except Exception:
            # Timeout / connection failure: evict the proxy from the pool.
            redis.zrem('proxies', proxy)
接着,在抓取目标页面时使用上一步获取到的高质量 IP 代理:
# -*- coding: utf-8 -*-
# Consumer script (p2): obtains a validated proxy from p1.test() and uses
# it to fetch the target page from multiple threads.
from p1 import test
import requests
import threading
from fake_useragent import UserAgent
# Same probe/target URL as in p1.
TEST_URL = 'https://www.huishoubao.com/evInfo/20059519191?selecteds=%5B%22174%22%2C%2212%22%2C%2242%22%2C%2237%22%2C%2217%22%2C%2283%22%2C%2277%22%2C%2273%22%2C%2271%22%2C%2263%22%5D'
def t_s():
    """Fetch TEST_URL three times, each time through a freshly validated
    proxy obtained from p1.test().

    The original body copy-pasted the same fetch logic three times; it is
    factored into _fetch_once. The print labels — including the duplicated
    'p3---------' on the third attempt — are preserved exactly.
    """
    for label in ('p2---------', 'p3---------', 'p3---------'):
        _fetch_once(label)


def _fetch_once(label):
    """Get one validated proxy and request TEST_URL through it once.

    Prints the proxy (prefixed by *label*), the chosen User-Agent, and
    'success<body length>' on HTTP 200. Any network error is printed and
    swallowed — best-effort, matching the original behavior.
    """
    proxy = test()  # may be None if the proxy pool is exhausted
    print(label, proxy)
    proxies = {"http": "http://%(proxy)s/" % {'proxy': proxy},
               "https": "http://%(proxy)s/" % {'proxy': proxy}}
    ua = UserAgent().random
    print(ua)
    try:
        headers = {'User-Agent': ua}
        response = requests.get(url=TEST_URL, headers=headers,
                                proxies=proxies, timeout=3)
        if response.status_code == 200:
            print('success' + str(len(response.text)))
    except Exception as gl:
        print(gl)
def threads_get():
    """Spawn ten daemon-less worker threads, each running t_s() once.

    NOTE: the threads are started but never joined — the function returns
    immediately while the workers continue in the background, exactly as
    the original did.
    """
    workers = [threading.Thread(target=t_s) for _ in range(10)]
    for worker in workers:
        worker.start()


if __name__ == '__main__':
    threads_get()