# This script uses Selenium to scrape proxy IPs from kuaidaili.com and check whether each one is usable.
# Scrape proxy IPs
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common import desired_capabilities
import time
from random import randint
from bs4 import BeautifulSoup
import pymysql
# ------------------------------------------------------------------
# Common PhantomJS configuration
# ------------------------------------------------------------------
# Request headers sent with every page load, mimicking a real Firefox
# browser so the target site is less likely to block the crawler.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    'Accept-Charset': 'utf-8',
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64;rv:47.0) Gecko/20100101 Firefox/47.0",
    "Host": "www.kuaidaili.com",
    "Connection": "keep-alive"
}
# DesiredCapabilities lets us declare the features we want for the session.
# PhantomJS picks up per-request headers from keys of the form
# 'phantomjs.page.customHeaders.<header-name>'.
for name, value in headers.items():
    webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.' + name] = value
# Route all HTTP traffic through a manually specified proxy server.
proxy = Proxy(
    {
        'proxyType': ProxyType.MANUAL,  # MANUAL: proxy supplied by us, not auto-detected
        'httpProxy': '181.16.136.6:38319'
    }
)
# Start from PhantomJS's default capability set. Read it via `webdriver`
# (the same path the header loop above uses) rather than through the
# `desired_capabilities` module import, because the assignment below
# shadows that module name with a plain dict — reusing the module name
# afterwards would silently refer to this dict instead.
desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
# Merge the proxy settings into the capability dict.
proxy.add_to_capabilities(desired_capabilities)
# Disable image loading: we only need the page data, not the images, which speeds up crawling
desired_capabilities["phan