import queue
import time
from threading import Thread
from lxml import etree
import re
import requests
# 控制台不输出verify=False导致的安全警告
requests.packages.urllib3.disable_warnings()
def req_get_html(url, headers=None, retry_times=3, timeout=10):
    """
    Fetch a page with an HTTP GET, retrying when the request fails.

    Certificate verification is switched off (``verify=False``) for the
    request.

    :param url: address to request
    :param headers: optional request headers handed to ``requests.get``
    :param retry_times: maximum number of attempts
    :param timeout: seconds to wait on each attempt; without a timeout a
        stalled server would block the caller indefinitely
    :return: the response object, or ``""`` when every attempt failed
    """
    attempts_left = retry_times
    while attempts_left > 0:
        try:
            # Only the network call is retried; header parsing happens below.
            res = requests.get(
                url, headers=headers, verify=False, timeout=timeout)
        except Exception as e:
            print(e)
            attempts_left -= 1
            continue

        try:
            encode = get_encode(res.headers)
        except KeyError:
            # A response without a Content-Type header is still a valid
            # response and must not cause the page to be downloaded again.
            encode = ""
        if hasattr(encode, "group"):
            # Tolerate a regex match object being handed back instead of
            # the charset text itself.
            encode = encode.group(1)

        if encode:
            res.encoding = encode
        else:
            # No charset declared: let requests guess from the body rather
            # than decode with its header-based default.
            res.encoding = res.apparent_encoding
        return res
    return ""
def get_encode(headers):
    """
    Extract the charset declared in the ``Content-Type`` response header.

    :param headers: mapping of response headers (any object with ``.get``,
        such as a plain dict)
    :return: the charset name as a string, or ``""`` when the header is
        missing or declares no charset
    """
    if not headers:
        return ""
    content_type = headers.get("Content-Type") or ""
    # Stop at whitespace, ';' or a quote so that trailing parameters and
    # optional quoting do not end up inside the charset name.
    found = re.search(
        r"""charset\s*=\s*["']?([^\s;"']+)""", content_type, re.IGNORECASE)
    return found.group(1) if found else ""
def analyze_response(res):
    """
    Parse a page's HTML and collect the text of each proxy table row.

    :param res: HTML source text of a proxy listing page
    :return: list of rows, each row being the list of text nodes found in
        that row's ``td`` / ``div`` cells; empty when nothing could be parsed
    """
    # An empty or blank document cannot be parsed into a tree.
    if not res or not res.strip():
        return []
    tree = etree.HTML(res)
    if tree is None:
        return []

    # One alternative per supported page layout.
    row_xpath = "|".join([
        "//div[@id='list']//tr",
        "//div[@class='fly-panel']//tr",
        "//div[@class='layui-form']//tr",
        "//div[@align='center']//tr",
        "//div[@class='top']//tr",
        "//div[@class='container']//tr",
        "//div[@class='list']/div[@class='tr ip_tr']",
    ])

    ip_msg_lis = []
    for row in tree.xpath(row_xpath):
        cells = row.xpath('./td/text()|./div/text()')
        print(cells)
        # Skip rows without cell text and rows whose first cell is "ip".
        if cells and cells[0] != "ip":
            ip_msg_lis.append(cells)
    return ip_msg_lis
def rm_character(str_wait):
    """
    Remove newline and tab characters from a scraped value.

    :param str_wait: text to clean
    :return: the text with every newline and tab removed
    """
    return re.compile("\\n|\\t").sub("", str_wait)
def try_response_speed(ip, port, retry_time=3, timeout=3):
    """
    Time a request made through the given proxy.

    Sends a GET to http://www.baidu.com through the proxy. When the response
    status is 200, a dict with the keys ip, port, status and speed is put on
    the module-level queue ``que``; nothing is returned.

    :param ip: proxy host
    :param port: proxy port
    :param retry_time: number of attempts allowed when the request raises
    :param timeout: timeout in seconds passed to each request
    :return: None
    """
    status_code = 500
    # Value reported when no attempt completes.
    elapsed = retry_time * timeout
    attempts_left = retry_time

    while attempts_left > 0:
        try:
            proxies = dict(
                http="http://{}:{}".format(ip, port),
                https="https://{}:{}".format(ip, port),
            )
            started = time.time()
            reply = requests.get(
                url="http://www.baidu.com", proxies=proxies, timeout=timeout)
            elapsed = time.time() - started
            status_code = reply.status_code
        except Exception:
            attempts_left -= 1
        else:
            # A response arrived (whatever its status); stop retrying.
            break

    if status_code == 200:
        que.put({
            "ip": ip,
            "port": port,
            "status": status_code,
            "speed": elapsed,
        })
if __name__ == '__main__':
    # Module-level queue: try_response_speed puts working proxies on `que`.
    que = queue.Queue()
    # Display name -> listing page URL for each free proxy source.
    ip_free_dic = {
        "快代理_高匿": "https://free.kuaidaili.com/free/inha",
        "快代理_普通": "https://free.kuaidaili.com/free/intr",
        "89免费代理": "https://www.89ip.cn/index_1.html",
        "高可用全球免费代理ip库": "https://ip.jiangxianli.com/",
        "66代理": "http://www.66ip.cn/2.html",
        "站大爷": "https://www.zdaye.com/daxue_ip.html",
        "蜜蜂代理": "https://www.beesproxy.com/free",
    }

    workers = []
    for page_url in ip_free_dic.values():
        res = req_get_html(url=page_url)
        if res == "":
            # Every attempt to fetch this source failed.
            continue
        for row in analyze_response(res.text):
            msg = [rm_character(j) for j in row]
            if len(msg) < 2:
                # Not enough cells to hold both an ip and a port.
                continue
            ip, port = msg[0], msg[1]
            test_speed = Thread(target=try_response_speed, args=(ip, port))
            test_speed.start()
            workers.append(test_speed)

    # Wait for every speed test to finish; draining the queue earlier would
    # miss results that the threads have not produced yet.
    for worker in workers:
        worker.join()

    while not que.empty():
        print(que.get())
# python 爬取公开IP代理
# 最新推荐文章于 2022-10-05 19:21:19 发布