Python爬取代理ip构建代理ip池
本来想写西刺代理的爬虫,结果西刺直接关站了= =。另外找了一个代理ip网站练手,只爬取了前两页,避免疯狂爬取再导致一个网站关站。另外可能由于网络的原因,访问网站验证ip可用性时存在问题(也可能是代码的问题),目前尚未解决,只好先把ip爬取下来写进csv,之后解决验证问题后再写验证后的代理ip池。
import requests
import re
from bs4 import BeautifulSoup
import time
import csv
def getHTML(url):
    """Fetch *url* and return the decoded page text.

    Sends a desktop-browser User-Agent so the proxy site does not reject
    the request. On any request failure returns the literal string
    '发生异常' (kept for backward compatibility: callers always receive
    a str).
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    try:
        html = requests.get(url, headers=header, timeout=30)
        html.raise_for_status()
        # Let requests guess the real encoding from the body instead of
        # trusting the (often missing/wrong) Content-Type header.
        html.encoding = html.apparent_encoding
        return html.text
    except requests.RequestException:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; network/HTTP errors still fall through
        # to the legacy sentinel string.
        return '发生异常'
def getIP(ulist, html):
    """Parse the proxy table in *html* and append one row per proxy to *ulist*.

    Each appended row is ``[ip, protocol, anonymity, location, alive_time]``
    where every field is the cell's ``.string`` (may be None for empty
    cells — downstream writers must tolerate that). Returns *ulist*.
    """
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all('tr'):
        tdlist = tag.find_all('td')
        # The header row contains only <th> cells, so its td list is empty.
        if tdlist:
            ip = tdlist[0].string
            http = tdlist[1].string
            name = tdlist[2].string
            add = tdlist[3].string
            # Renamed from ``time`` — the original shadowed the imported
            # time module inside this function.
            alive = tdlist[5].string
            ulist.append([ip, http, name, add, alive])
    return ulist
def proofIP(iplist):
    """Return the rows of *iplist* whose proxy successfully reaches Baidu.

    Each row's first element is used as the HTTPS proxy address
    (presumably 'host:port' as scraped by getIP — TODO confirm; a
    scheme-less proxy string is treated as http by requests). Rows whose
    proxy fails or times out are silently skipped.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    newiplist = []
    for ip in iplist:
        proxies = {
            'https': ip[0]
        }
        try:
            # Short timeout: a usable proxy must answer quickly.
            html = requests.get('https://www.baidu.com/', headers=header, proxies=proxies, timeout=2)
            print('正在验证{},状态码为{}'.format(ip, html.status_code))
            if html.status_code == 200:
                print('此ip可用')
                newiplist.append(ip)
        except requests.RequestException:
            # Narrowed from a bare ``except:``; a dead proxy is simply skipped.
            continue
    return newiplist
def info(ulist):
    """Write the scraped proxy rows to ``ip.csv`` with a Chinese header row.

    Uses csv.writer (the ``csv`` import was previously unused) so fields
    containing commas are quoted correctly. ``None`` fields — possible
    because bs4's ``.string`` returns None for empty cells — are written
    as empty strings instead of crashing ``str.join`` as before.
    """
    # newline='' stops the csv module doubling line endings on Windows;
    # utf-8-sig adds a BOM so Excel opens the file with the right encoding.
    with open('ip.csv', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ip', '代理协议', '代理匿名度', '代理位置', '存活时间'])
        for row in ulist:
            writer.writerow(['' if field is None else field for field in row])
def intoFile(ulist):
    """Write each proxy row to ``ip.txt``, one comma-joined line per row.

    Returns the literal string 'end' (kept for backward compatibility).
    """
    with open('ip.txt', 'w', encoding='utf-8') as f:
        for row in ulist:
            # The original passed the row list straight to writelines(),
            # concatenating every field of every row with no separators or
            # newlines; join the fields and terminate each row instead.
            f.write(','.join('' if field is None else str(field) for field in row) + '\n')
    return 'end'
def main():
    """Crawl the first two pages of xiladaili HTTPS proxies into ip.csv."""
    starturl = 'http://www.xiladaili.com/https/'
    # Only two pages, to avoid hammering (and possibly killing) the site.
    depth = 2
    iplist = []
    for i in range(depth):
        url = starturl + str(i + 1)  # pages are 1-indexed: /https/1, /https/2
        html = getHTML(url)
        iplist = getIP(iplist, html)
    info(iplist)
    # Proxy validation is disabled until the connectivity issue is resolved:
    # ip = proofIP(iplist)
    # print(ip)
    # intoFile(ip)


if __name__ == '__main__':
    # Guard added so importing this module no longer triggers the crawl.
    main()
在验证ip时可能会报 "Max retries exceeded with url" 错误;目前网络状况不佳,尚无法验证是网络问题还是代码问题,解决后会补充解决方法。