# 验证爬下来的代理ip是否可用,去掉不可用的代理
# (Validate the scraped proxy IPs and drop the unusable ones.)
import pickle
import os
import urllib.request
import urllib.parse
import re
import random
def loadAgentList():
    """Load the cached proxy list from ``agent.pkl``.

    Returns:
        list: the pickled proxy list, or an empty list when the cache
        file does not exist yet.
    """
    # NOTE(review): pickle.load on an attacker-controlled file can execute
    # arbitrary code — fine for a local cache, never for downloaded data.
    if not os.path.exists("agent.pkl"):
        return []
    with open("agent.pkl", "rb") as f:
        return pickle.load(f)
# Module-level snapshot of the cached proxies, taken at import time.
# NOTE(review): ``ipAgents`` is never read anywhere in this file
# (``__main__`` calls loadAgentList() again) — candidate for removal.
ipAgents=loadAgentList()
def writeAgentList(agentList):
    """Persist *agentList* to ``agent.pkl``, replacing any previous cache.

    The old file is removed first so the cache always holds exactly one
    pickled object (otherwise repeated dumps would require repeated loads).
    """
    if os.path.exists("agent.pkl"):
        os.remove("agent.pkl")
    # BUG FIX: the original wrote to "agent.pkl." (trailing dot), so the
    # readers — which open "agent.pkl" — never saw the updated list.
    with open("agent.pkl", "wb") as f:
        pickle.dump(agentList, f)
def loadAgentList():
    """Return the proxy list cached in ``agent.pkl`` (empty list if absent).

    NOTE(review): this duplicates — and silently shadows — the
    identically-named function defined earlier in this file.
    """
    cache = "agent.pkl"
    if not os.path.exists(cache):
        return []
    with open(cache, "rb") as handle:
        return pickle.load(handle)
def getMyIp():
    """Return this host's externally-visible IPv4 address, or None.

    Fetches http://httpbin.org/ip through the currently configured proxy
    (via openUrl). A None return is the caller's signal that the proxy
    is dead, so all request failures are deliberately mapped to None.
    """
    url = "http://httpbin.org/ip"
    try:
        html = openUrl(url).decode("utf-8")
    except Exception:
        # Narrowed from a bare ``except:`` — still best-effort, but no
        # longer swallows KeyboardInterrupt/SystemExit.
        return None
    # Match a dotted-quad IPv4 address anywhere in the response body.
    pattern = re.compile(r'((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)')
    groupIp = pattern.search(html)
    if groupIp:
        return groupIp.group()
    return None
# NOTE: the returned object is raw bytes — callers must decode it.
def openUrl(url, proxy_ip=None):
    """Fetch *url* through an HTTP proxy and return the raw response bytes.

    Args:
        url: the URL to request.
        proxy_ip: proxy address ("host:port"). Defaults to the module-level
            global ``ip`` (set by the loop in ``__main__``) so existing
            one-argument callers keep working.

    Returns:
        bytes: the undecoded response body.
    """
    if proxy_ip is None:
        proxy_ip = ip  # backward-compat: read the global the loop sets
    proxy_support = urllib.request.ProxyHandler({"http": proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    # Present a browser User-Agent so simple bot filters don't reject us.
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")]
    response = opener.open(url)
    html = response.read()
    return html
if __name__ == "__main__":
    ipArray = loadAgentList()
    ip = ""
    # Walk the list from the end with POSITIVE indices so that pop(idx)
    # never shifts an index we have yet to visit.  The original negative-
    # index walk (ipArray[-iter] / pop(-iter)) skipped the element just
    # before each removed one, and could raise IndexError once the list
    # shrank below its original length.  (Also: ``iter`` shadowed the
    # builtin.)
    for idx in range(len(ipArray) - 1, -1, -1):
        ip = ipArray[idx]  # module global read by openUrl()
        print(ip)
        myIp = getMyIp()
        print(myIp)
        if not myIp:
            # Proxy unusable — drop it from the list.
            ipArray.pop(idx)
        else:
            print("success")
        # Persist after every check so progress survives an interruption.
        writeAgentList(ipArray)
    print("处理完成!")