# Part 1: scrape free proxies (爬取代理)
from lxml import etree
import time
import requests
def main():
    """Scrape free proxies from kuaidaili.com and save them to ./ip_list.txt.

    Walks result pages 1..29, extracts "host:port" pairs from the proxy
    table, prints each one and appends it to the output file.
    """
    ip_list = []
    header = {"User-Agent": "jafaf"}  # invariant across pages; hoisted out of the loop
    # `with` guarantees the file is closed even if a request raises.
    with open("./ip_list.txt", "w") as f:
        for page in range(1, 30):
            url = "https://www.kuaidaili.com/free/inha/" + str(page)
            res = requests.get(url=url, headers=header)
            print(res.status_code)
            tree = etree.HTML(res.text)
            # One <tr> per proxy under div#list; the first row is the table
            # header (<th> only, no <td> text), so it is skipped via [1:]
            # instead of the original manual counter hack.
            tr_list = tree.xpath('//div[@id="list"]//tr')
            for tr in tr_list[1:]:
                ip = tr.xpath('./td/text()')  # td cell texts: [host, port, ...]
                ip_port = ip[0] + ":" + ip[1]
                print(ip_port)
                ip_list.append(ip_port)
                f.write(ip_port + '\n')  # newline so the file holds one proxy per line
            time.sleep(0.5)  # throttle so we don't hammer the server
# Script entry point. Original read `if__name__` (missing space), which is a
# SyntaxError — the scraper could never run as written.
if __name__ == "__main__":
    main()
# Part 2: test proxy availability (测试可用性)
import requests
import multiprocessing
# Proxies loaded from ./ip_list.txt by main(); one "host:port" string each.
prox_list = []
def main():
    """Load proxies from ./ip_list.txt and dispatch availability checks."""
    # `with` closes the file deterministically (the original leaked the handle).
    with open("./ip_list.txt", "r") as f:
        for line in f:
            prox_list.append(line.strip('\n'))  # drop the trailing newline
    print(prox_list)
    check_ip(prox_list)
def check(url, header, proxies, timeout):
    """Probe `url` through `proxies`; print the proxy if it answers HTTP 200.

    Fixes vs. the original: the request was issued twice back-to-back (the
    second copy printed an undefined name `ip`, a NameError silently masked
    by a bare `except:`), and the first copy ignored the `timeout` parameter
    by hardcoding 0.5. Now a single request honors `timeout`, and only
    request-level failures are swallowed.

    Args:
        url: target URL to fetch through the proxy.
        header: HTTP headers dict for the request.
        proxies: requests-style scheme->proxy-url mapping.
        timeout: per-request timeout in seconds.
    """
    print(proxies['http'])
    try:
        res = requests.get(url=url, headers=header, proxies=proxies, timeout=timeout)
        if res.status_code == 200:
            print("可用的ip: " + proxies['http'])
    except requests.exceptions.RequestException:
        pass  # proxy unusable — best-effort scan, skip silently
def check_ip(ip):
    """Spawn one process per proxy to test reachability against Baidu.

    Args:
        ip: iterable of "host:port" proxy strings.
    """
    url = "https://www.baidu.com/"
    header = {"User-Agent": "haha"}
    workers = []
    for ip_prox in ip:
        # requests expects a dict mapping scheme -> proxy URL.
        proxies = {
            "http": "http://" + ip_prox,
            "https": "http://" + ip_prox,
        }
        worker = multiprocessing.Process(target=check, args=(url, header, proxies, 0.5))
        worker.start()
        workers.append(worker)
    # Fix: wait for all children so the parent doesn't exit (and drop output)
    # before the checks finish — the original never joined its processes.
    for worker in workers:
        worker.join()
# Script entry point. Original read `if__name__` (missing space), which is a
# SyntaxError — the tester could never run as written.
if __name__ == "__main__":
    main()
# 运气不好，爬取的免费代理没一个能用 — bad luck: none of the scraped free proxies actually worked.