在跟进一些简单的爬虫,发现IP被限制的死死的,所以就需要构建一个IP池并从中筛选出可用的IP
不多说直接上代码,用的是西刺代理,有效率感人
from lxml import etree
import requests, bs4,random
def get_proxy():
    """Scrape the Xici (西刺) free-proxy listing and return candidate proxies.

    Fetches https://www.xicidaili.com/wt/ and extracts the IP and port
    columns from each table row of class "odd".

    Returns:
        list[str]: proxies formatted as "host:port".
    """
    proxy_list = []
    # A browser-like User-Agent is sent; default requests UA tends to be blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    }
    # timeout added so a dead site cannot hang the whole script.
    res = requests.get("https://www.xicidaili.com/wt/", headers=headers, timeout=10)
    # Parse the proxy table: column 1 is the IP, column 2 the port.
    bs_res = bs4.BeautifulSoup(res.text, 'html.parser')
    for row in bs_res.find_all('tr', class_="odd"):
        cells = row.find_all('td')
        # Guard against malformed rows (ads / layout changes) so one bad
        # row cannot raise IndexError and abort the scrape.
        if len(cells) > 2:
            proxy_list.append(cells[1].text + ':' + cells[2].text)
    return proxy_list
# ---- Verification pass: try 100 random proxies against an IP-echo page ----
# A proxy "works" when the echo page reports the proxy's own IP back to us.
x = 0  # count of successful verifications
y = 0  # count of failures (wrong IP reported, timeout, or any request error)
proxyList = get_proxy()
# Request parameters are loop-invariant; build them once outside the loop.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
}
url = 'http://ip.tool.chinaz.com/'
for _ in range(100):
    try:
        IP = random.choice(proxyList)
        print(IP.split(":")[0])
        proxies = {
            "http": "http://" + IP
        }
        # 2-second timeout quickly weeds out dead proxies.
        wb_data = requests.get(url=url, headers=headers, timeout=2, proxies=proxies)
        content = etree.HTML(wb_data.text)
        # The echo page shows the caller's apparent IP in this node.
        IP_test = content.xpath('//*[@id="rightinfo"]/dl/dd[1]/text()')[0]
        if IP.split(":")[0] == IP_test:
            x = x + 1
            print(IP + "成功" + str(x) + "次")
        else:
            y = y + 1
            print("失败" + str(y) + "次")
    except (requests.RequestException, IndexError, ValueError):
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit): request failure or timeout,
        # empty proxy list / missing XPath node, or an unparsable page
        # all count as one failure.
        y = y + 1
        print("失败" + str(y) + "次")