说明
- 要写一个爬虫,但总是被封禁 IP;公司又不愿意花钱购买代理 IP,只能找一些免费的 IP,而免费的 IP 还需要先测试是否可用
- 需要注意的是,整个项目只是一个 Demo,并不是特别完善,逻辑还需要进一步完善
完善逻辑
- 获取有效代理的IP,放入到缓存中
- 使用 IP 的时候再去测试是否能通
- 如果不能通,则从缓存中剔除
依赖
代码
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import telnetlib
class Driver:
    """Builds Chrome WebDriver instances configured to run without a window."""

    def __init__(self):
        """Prepare the Chrome options shared by every driver this object creates."""
        opts = Options()
        # Headless mode with GPU acceleration disabled.
        for flag in ('--headless', '--disable-gpu'):
            opts.add_argument(flag)
        self.chrome_options = opts

    def getDriver(self):
        """Start and return a new Chrome WebDriver using the stored options."""
        browser = webdriver.Chrome(options=self.chrome_options)
        return browser
class Company:
    """Scrapes the free proxy table on goubanjia.com and probes each entry."""

    def __init__(self, driver):
        """Keep the WebDriver used to load and query the proxy list page.

        Args:
            driver: A Selenium WebDriver instance.
        """
        self.driver = driver

    def getCompanyDetailUrl(self):
        """Load the proxy list and report whether each proxy accepts a TCP connection.

        For every table row the text of the ``.ip`` element is split on ":"
        into host and port, the parsed pair is printed, and a connection is
        attempted with a 3 second timeout. A "valid"/"invalid" message is
        printed per row. Nothing is returned.

        NOTE: a successful connect only shows that the port is open; it does
        not verify that the endpoint actually works as a proxy.
        NOTE: ``telnetlib`` is deprecated since Python 3.11 and removed in 3.13.
        """
        # Imported locally so this block does not depend on a file-level import.
        # find_element(By...) works on Selenium 3 and 4, whereas the
        # find_element(s)_by_* helpers were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        first_url = "http://www.goubanjia.com/"
        self.driver.get(first_url)
        rows = self.driver.find_elements(
            By.XPATH,
            '//section[@id="services"]//div[@class="container"]//div[@class="row"]//div[@class="container-fluid"]//div[@class="row-fluid"]//div[@class="span12"]//table//tbody//tr')
        for row in rows:
            cell_text = row.find_element(By.CSS_SELECTOR, '.ip').get_attribute("innerText")
            # Named "address" rather than "list" to avoid shadowing the builtin.
            address = cell_text.split(":", -1)
            print(address)
            try:
                connection = telnetlib.Telnet(address[0], address[1], timeout=3)
            except Exception:
                # Exception instead of a bare except, so Ctrl-C / SystemExit
                # still stop the scan. A malformed "host:port" string
                # (IndexError) is also reported as an invalid proxy.
                print("代理ip无效!")
            else:
                # Close the probe socket; otherwise one connection leaks per
                # reachable proxy.
                connection.close()
                print("代理ip有效!")
if __name__ == '__main__':
    # Script entry point: start a headless browser, run the proxy check, and
    # always shut the browser down afterwards.
    driver = Driver()
    browser = driver.getDriver()
    try:
        company = Company(browser)
        company.getCompanyDetailUrl()
    finally:
        # Without quit() the headless Chrome / chromedriver processes keep
        # running after the script ends or fails.
        browser.quit()
另外一个页面(kuaidaili.com)的示例代码:
class Driver:
    """Factory for headless Chrome WebDriver sessions."""

    def __init__(self):
        """Create the Chrome options (headless, GPU disabled) used by getDriver."""
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')

    def getDriver(self):
        """Return a newly started Chrome WebDriver configured with the stored options."""
        configured = self.chrome_options
        return webdriver.Chrome(options=configured)
class Company:
    """Scrapes the free proxy table on kuaidaili.com and probes each entry."""

    def __init__(self, driver):
        """Keep the WebDriver used to load and query the proxy list page.

        Args:
            driver: A Selenium WebDriver instance.
        """
        self.driver = driver

    def getCompanyDetailUrl(self):
        """Load the proxy list and report whether each proxy accepts a TCP connection.

        For every table row inside the element with id ``list``, the IP and
        PORT cells are read and printed, then a connection is attempted with
        a 2 second timeout. A "valid"/"invalid" message is printed per row.
        Nothing is returned.

        NOTE: a successful connect only shows that the port is open; it does
        not verify that the endpoint actually works as a proxy.
        NOTE: ``telnetlib`` is deprecated since Python 3.11 and removed in 3.13.
        """
        # Imported locally so this block does not depend on a file-level import.
        # find_element(By...) works on Selenium 3 and 4, whereas the
        # find_element(s)_by_* helpers were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        first_url = "https://www.kuaidaili.com/free/inha/"
        self.driver.get(first_url)
        container = self.driver.find_element(By.ID, "list")
        # The leading "." keeps the search inside the container; a bare "//"
        # XPath is evaluated against the whole document even when called on
        # an element.
        rows = container.find_elements(By.XPATH, ".//table//tbody//tr")
        for row in rows:
            try:
                ip = row.find_element(
                    By.CSS_SELECTOR, 'td[data-title="IP"]').get_attribute("innerText")
                port = row.find_element(
                    By.CSS_SELECTOR, 'td[data-title="PORT"]').get_attribute('innerText')
                print(ip, port)
                connection = telnetlib.Telnet(ip, port, timeout=2)
            except Exception:
                # Exception instead of a bare except, so Ctrl-C / SystemExit
                # still stop the scan. Rows without IP/PORT cells and failed
                # connections are both reported as invalid, as before.
                print("代理ip无效!")
            else:
                # Close the probe socket; otherwise one connection leaks per
                # reachable proxy.
                connection.close()
                print("代理ip有效!")
if __name__ == '__main__':
    # Script entry point: start a headless browser, run the proxy check, and
    # always shut the browser down afterwards.
    driver = Driver()
    browser = driver.getDriver()
    try:
        company = Company(browser)
        company.getCompanyDetailUrl()
    finally:
        # Without quit() the headless Chrome / chromedriver processes keep
        # running after the script ends or fails.
        browser.quit()