前言
在日常爬虫中,很多网站都有反爬虫机制:一种是需要模拟登录才能访问,另一种是同一个 IP 无法多次访问。因此我们需要维护自己的代理 IP 池,并自动清除失效 IP、补充新的代理 IP。
一、代理IP的获取
# Request headers for kuaidaili.com. NOTE: Accept-Encoding is limited to
# 'gzip' because the scraping code below only knows how to decompress gzip;
# advertising 'br'/'deflate' could make the server send an encoding we
# cannot decode.
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Host': 'www.kuaidaili.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
# Free-proxy listing pages: 'inha' = high-anonymity, 'intr' = transparent.
urls = ['https://www.kuaidaili.com/free/inha/','https://www.kuaidaili.com/free/intr/']
def getProxyIp(pages=9):
    """Scrape candidate proxies from the kuaidaili free-proxy pages.

    Parameters:
        pages: number of listing pages to fetch per URL (default 9,
               matching the original hard-coded range(1, 10)).

    Returns:
        list[str]: entries of the form "ip port" (space separated),
        unvalidated — pass the result to validateIp().
    """
    print('开始获取代理IP')
    proxy = []
    for each_url in urls:
        for page in range(1, pages + 1):
            url = each_url + str(page) + '/'
            response = urllib.request.urlopen(urllib.request.Request(url, headers=header))
            html = response.read()
            # Only gunzip when the server actually gzip-encoded the body;
            # blindly calling gzip.decompress() crashes on identity responses.
            if response.headers.get('Content-Encoding') == 'gzip':
                html = gzip.decompress(html)
            html = html.decode("utf-8")
            soup = BeautifulSoup(html, 'lxml')
            rows = soup.findAll('tr')
            # rows[0] is the table header row — skip it.
            for row in rows[1:]:
                tds = row.findAll("td")
                # Column 0 = IP, column 1 = port.
                proxy.append(tds[0].contents[0] + " " + tds[1].contents[0])
            time.sleep(3)  # be polite: throttle requests to avoid a ban
    return proxy
二、代理IP的验证
def validateIp(proxy):
    """Check which "ip port" entries actually work as HTTP proxies.

    Each entry is tried against a what-is-my-ip endpoint with a 3-second
    timeout; working entries are printed and collected.

    Parameters:
        proxy: iterable of "ip port" strings (as produced by getProxyIp).

    Returns:
        list[str]: the subset of entries that responded successfully.
        (Original version only printed them; the return value is a
        backward-compatible addition.)

    Side effects:
        Sets the global default socket timeout to 3 seconds.
    """
    print('开始验证代理IP,以下IP可用')
    url = "http://ip.chinaz.com/getip.aspx"
    socket.setdefaulttimeout(3)
    valid = []
    for entry in proxy:
        try:
            parts = entry.strip().split(" ")
            proxy_host = "http://" + parts[0] + ":" + parts[1]
            proxy_handler = urllib.request.ProxyHandler({"http": proxy_host})
            urllib.request.build_opener(proxy_handler).open(url).read()
        except Exception:
            # Dead/slow proxy or malformed entry — skip it. (Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
            continue
        print(entry)
        valid.append(entry)
    return valid
三、代理IP的使用
import urllib.request
import random

# Demo: fetch a what-is-my-ip page through one of the proxies to confirm
# the outgoing IP is the proxy's, not ours.
url = 'http://www.whatismyip.com.tw'
iplist = ['60.191.164.83:3128', '211.108.62.56:80', '218.5.238.169:8080', '210.136.19.243:8080']
# Pick a proxy at random (the original imported `random` and built `iplist`
# but then hard-coded the first proxy, defeating the rotation).
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# Install globally so every subsequent urlopen() goes through the proxy.
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)