import random
import requests
import re
def rand_userAgent():
    """Pick one request-header dict at random from a built-in User-Agent pool.

    Returns:
        dict: A single-entry mapping ``{'User-Agent': <string>}`` suitable for
        passing as ``headers=`` to ``requests``.
    """
    # Pool of browser User-Agent strings (the original note says the list was
    # put together with a regex). Only the bare strings are stored here; the
    # header dict is built for whichever entry gets picked.
    ua_strings = (
        'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
        'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Mozilla/4.0(compatible;MSIE6.0;WindowsNT5.1)',
        'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TheWorld)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Trident/4.0;SE2.XMetaSr1.0;SE2.XMetaSr1.0;.NETCLR2.0.50727;SE2.XMetaSr1.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;AvantBrowser)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)',
    )
    # With k left at its default, random.choices yields a one-element list.
    picked = random.choices(ua_strings)[0]
    return {'User-Agent': picked}
# Request a URL and return its text; keep re-requesting until it succeeds.
def get_one_page(url, max_retries=None):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Every attempt is sent with the header dict returned by
    ``rand_userAgent()``. A response whose status code is not 200 triggers
    another attempt. Retries run in a loop rather than by recursion, so a
    server that keeps failing can no longer exhaust the interpreter's
    recursion limit.

    Args:
        url: Address to request.
        max_retries: Maximum number of additional attempts after a failed
            one. ``None`` (the default) retries without limit.

    Returns:
        str: ``response.text`` of the first response with status code 200.

    Raises:
        RuntimeError: Only when ``max_retries`` is given and every attempt
            returned a non-200 status.

    Exceptions raised by ``requests.get`` itself are not caught here and
    propagate to the caller.
    """
    failures = 0
    while True:
        print("发起请求")
        response = requests.get(url, headers=rand_userAgent())
        response.encoding = "utf-8"
        if response.status_code == 200:
            print("响应成功")
            return response.text
        print("响应失败")
        failures += 1
        # Unlimited retries unless the caller opted into a bound.
        if max_retries is not None and failures > max_retries:
            raise RuntimeError(
                "giving up on %s after %d failed attempts" % (url, failures)
            )
# Download the proxy listing page when the module is executed; the HTML is
# kept in ``txt`` for the regex extraction performed later in this file.
txt = get_one_page('http://www.xicidaili.com')
def ip_pool(IP_list, checker=None):
    """Drop unusable HTTPS proxies from ``IP_list`` and return the list.

    Each entry is indexed as ``entry[0]`` (ip), ``entry[1]`` (port) and
    ``entry[2]`` (protocol). Only entries whose protocol equals ``'HTTPS'``
    are probed; all other entries are kept without being tested.

    The original removed items from the list while iterating over it, which
    makes the loop skip the element that follows every removal. The survivors
    are now collected first and written back with a slice assignment, so
    every entry is examined and the caller's list is still updated in place.

    Args:
        IP_list: Mutable list of ``(ip, port, protocol)`` sequences.
        checker: Callable ``checker(ip, port)`` returning a truthy value for a
            working proxy. Defaults to ``test_ip``.

    Returns:
        list: The same ``IP_list`` object, filtered.
    """
    if checker is None:
        checker = test_ip
    kept = []
    for entry in IP_list:
        if entry[2] == 'HTTPS' and not checker(entry[0], entry[1]):
            # Proxy failed the probe: report it and leave it out.
            print(entry, "不可用")
            continue
        kept.append(entry)
    # Slice assignment keeps the identity of the caller's list.
    IP_list[:] = kept
    return IP_list
# Probe a single proxy address.
def test_ip(ip, port, test_url='https://www.baidu.com', timeout=1):
    """Check whether the proxy at ``ip:port`` can fetch ``test_url``.

    Args:
        ip: Proxy host, as a string.
        port: Proxy port, as a string (it is concatenated to ``ip``).
        test_url: URL requested through the proxy.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        int: ``1`` if the request completes with status code 200, ``0`` for
        any other status or if the request raises.
    """
    server = ip + ":" + port
    proxies = {'http': 'http://' + server, 'https': 'https://' + server}
    try:
        r = requests.get(test_url, proxies=proxies, timeout=timeout)
    except Exception:
        # Best-effort probe: any request failure means "unusable". Catching
        # Exception instead of a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long scan can still be interrupted.
        return 0
    return 1 if r.status_code == 200 else 0
# Build the extraction pattern from the markup fragments that surround the
# three wanted table cells. Each "([\s\S]*?)" group lazily captures one cell.
# Raw strings are used so that "\s" and "\S" reach the regex engine without
# relying on Python passing unknown string escapes through (which emits an
# invalid-escape warning); the resulting pattern text is unchanged.
str1 = r'<td class="country"><img[\s\S]+?<td>'
# group 1 follows: IP address
str2 = r'</td>[\s\S]*?<td>'
# group 2 follows: port number
str3 = r'</td>[\s\S]*?<td>[\s\S]*?<td>'
# group 3 follows: protocol type
str4 = r'</td>'
regex = re.compile(
    str1 + r"([\s\S]*?)" + str2 + r"([\s\S]*?)" + str3 + r"([\s\S]*?)" + str4
)
# findall returns one (ip, port, protocol) tuple per match in the page text.
IP_List = regex.findall(string=txt)
# Filter the scraped entries and print the result.
a = ip_pool(IP_List)
print(a)
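# Source article title: "Python之代理IP地址池(一)" (Python proxy IP address pool, part 1)
# Page footer: latest recommended article published 2022-01-29 17:29:01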