# 基础爬虫,爬取代理池并进行测试 — basic crawler: scrape free proxy pools and test the proxies.
import requests
import re
from requests import adapters
# Browser-like User-Agent header so the target listing sites do not
# reject the scraper's requests outright.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
}
def ip_66(data_text1):
    """Extract (ip, port) tuples from a 66ip.cn listing page.

    Stores the result in the module-level IPS_66 and returns it.

    :param data_text1: HTML text of the 66ip.cn page.
    :return: list of (ip, port) string tuples; empty list if no rows match.
    """
    global IPS_66
    # Fix: escape the dots and use a raw string. The original pattern's bare
    # `.` matched ANY character, so non-IP garbage could slip through, and the
    # non-raw '\d' escapes raise SyntaxWarning on Python 3.12+.
    IPZ_66 = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>', re.S)
    IPS_66 = re.findall(IPZ_66, data_text1)
    return IPS_66
def port_p(data_text):
    """Extract all port numbers from a kuaidaili listing page.

    Stores the result in the module-level DKS and returns it.

    :param data_text: HTML text of the kuaidaili page.
    :return: list of port strings; empty list if nothing matches.
    """
    global DKS
    # Fix: raw string for the regex — the non-raw '\d' is an invalid string
    # escape and raises SyntaxWarning on Python 3.12+.
    DKZ = re.compile(r'<td data-title="PORT">(\d+)</td>', re.S)
    DKS = re.findall(DKZ, data_text)
    return DKS
def ip_ip(data_text):
    """Extract all IP addresses from a kuaidaili listing page.

    Stores the result in the module-level IPS and returns it.

    :param data_text: HTML text of the kuaidaili page.
    :return: list of dotted-quad IP strings; empty list if nothing matches.
    """
    global IPS
    # Fix: raw string for the regex — the non-raw '\d' is an invalid string
    # escape and raises SyntaxWarning on Python 3.12+.
    IPZ = re.compile(r'<td data-title="IP">(\d+\.\d+\.\d+\.\d+)</td>', re.S)
    IPS = re.findall(IPZ, data_text)
    return IPS
def pinjie():
    """Join the scraped IP and port lists into "ip:port" strings.

    Reads the module-level IPS and DKS (filled by ip_ip / port_p), writes
    the joined list to disk via dl_write, and returns it.

    Fix: use zip() instead of indexing 0..len(IPS) — the original raised
    IndexError whenever DKS was shorter than IPS; zip stops at the shorter
    list. Output format is unchanged: ['ip:port', ...].

    :return: list of "ip:port" strings.
    """
    ip_z = [f"{ip}:{port}" for ip, port in zip(IPS, DKS)]
    dl_write(ip_z)
    return ip_z
# def merge_list(list1, list2):
#     Python has a built-in "zipper" function: zip(list1, list2) pairs the
#     two lists element-wise, e.g. print(list(zip(list1, list2))).
def dl_write(ip_z):
    """Write each "ip:port" entry on its own line to ipcs.txt, then kick
    off validation of the saved proxies via test_corr().

    :param ip_z: iterable of "ip:port" strings.
    """
    with open('ipcs.txt', 'w', encoding='utf-8') as out:
        out.writelines(f"{addr}\n" for addr in ip_z)
    test_corr()
def test_corr():
    """Probe every proxy saved in ipcs.txt and collect the working ones.

    Each line of ipcs.txt is expected to hold one "ip:port" entry. A proxy
    is considered valid if an HTTP GET through it returns status 200.
    Working entries are appended to `corr` (newline included, matching the
    file format text_qualified writes), persisted via text_qualified, and
    returned.

    Fixes:
    - The original built the proxy URL from the raw file line, which still
      carried its trailing newline — producing a malformed proxy URL.
      Strip the line first.
    - Bare `except:` also swallowed KeyboardInterrupt/SystemExit; narrowed
      to requests.RequestException (timeouts, connection errors, etc.).
    - Blank lines are skipped instead of being probed.

    :return: list of working proxy lines.
    """
    corr = []
    with open('ipcs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            ip = line.strip()  # drop the trailing newline before building the URL
            if not ip:
                continue
            try:
                # Cap connection retries so dead proxies fail fast.
                adapters.DEFAULT_RETRIES = 3
                proxy = f"http://{ip}"
                res = requests.get(url="http://icanhazip.com/", timeout=2, proxies={"http": proxy})
                if res.status_code == 200:
                    print("代理IP:" + ip + "有效")
                    corr.append(line)  # keep the original newline-terminated form
                else:
                    print("代理IP:" + ip + "无效")
            except requests.RequestException:
                print("错误")
    text_qualified(corr)
    return corr
def text_qualified(corr):
    """Persist the validated proxy entries to ip.txt.

    Entries are written exactly as given (they already carry their own
    trailing newlines from the ipcs.txt read).

    :param corr: iterable of proxy strings to save.
    """
    with open('ip.txt', 'w', encoding='utf-8') as out:
        out.writelines(corr)
def ipyc():
    """Re-scrape the proxy listing sites through an already-validated proxy.

    Reads saved proxies from ip.txt; for the first proxy whose kuaidaili
    fetch returns 200, re-runs the extraction pipeline (ports, IPs, join,
    66ip parse) and stops.

    NOTE(review): the line read from the file keeps its trailing newline,
    so the proxy URL below may be malformed — confirm against how ip.txt
    is written. data_1's status code is never checked before its text is
    parsed — confirm this is intended.
    """
    with open('ip.txt','r',encoding='utf-8') as f:
        for i in f:
            proxy = f"http://{i}"
            # Fetch both listing pages through the same proxy.
            data = requests.get('https://free.kuaidaili.com/free/inha/',headers= header,proxies={"http": proxy})
            data_1=requests.get('http://www.66ip.cn/',headers= header,proxies={"http": proxy})
            data_text1=data_1.text
            data_text = data.text
            if data.status_code == 200:
                port_p(data_text)    # extract ports from the kuaidaili page
                ip_ip(data_text)     # extract IPs from the kuaidaili page
                pinjie()             # join to ip:port, save, and re-validate
                ip_66(data_text1)    # extract (ip, port) pairs from the 66ip page
                break                # first working proxy is enough
# Script entry point: start the re-scrape cycle using previously saved proxies.
# NOTE(review): runs on import too — consider an `if __name__ == "__main__":` guard.
ipyc()