from pyquery import PyQuery as pq
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor
import config
from process import DataProcessor
# A crawler that harvests proxy IPs from http://www.xicidaili.com/nn/.
# By default pages 1..10 of the listing are crawled; adjust the range below
# to control how many pages are fetched.
dp = DataProcessor()
default_url = 'http://www.xicidaili.com/nn/'
# Generate the list of target listing-page URLs (pages 1..10).
target_urls = [default_url + str(i) for i in range(1, 11)]
# BUG FIX: the Windows path must be a raw string so the backslashes are not
# treated as escape sequences.
browser = webdriver.Chrome(executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
def get_proxies(url):
    """
    Crawl a single listing page and yield proxy entries.

    :param url: URL of one listing page
    :return: generator of (protocol, ip, port) string tuples
    """
    try:
        browser.get(url)
        html = browser.page_source
        # 'tr:gt(0)' skips the table header row; every remaining row is one proxy.
        doc = pq(html, parser="html").find('tr:gt(0)')
        for tr in doc:
            tds = pq(tr).find('td')
            # Column layout on the site: td[1]=IP, td[2]=port, td[5]=protocol.
            yield tds[5].text.strip().lower(), tds[1].text.strip(), tds[2].text.strip()
    except Exception as e:
        # Best effort: a page that fails to load is reported and skipped, not fatal.
        print(e)
def get_all_proxies():
    """
    Yield (protocol, ip, port) tuples from every target listing page.

    :return: generator chaining the results of get_proxies() for each URL
    """
    for url in target_urls:
        yield from get_proxies(url)
def concat_proxy(item):
    """
    Assemble a (protocol, ip, port) tuple into a proxy URL string.

    :param item: (protocol, ip, port) tuple of strings
    :return: 'protocol://ip:port'
    """
    # BUG FIX: the original template was '{}?/{}:{}', which produces strings
    # like 'http?/1.2.3.4:8080' — not a valid proxy URL for requests.
    return '{}://{}:{}'.format(*item)
def validate(protocol, ip, port):
    """
    Check the validity and quality of a proxy IP.

    :param protocol: 'http' or 'https'
    :param ip: proxy host
    :param port: proxy port
    :return:
        (True, 1)  valid and fast (responds within config.better_timeout)
        (True, 0)  valid but slow (responds within config.normal_timeout)
        (False, None)  invalid
    """
    # BUG FIX: the original had unreachable print() statements placed after
    # each return; they have been removed.
    if check(protocol, ip, port, config.better_timeout):
        return True, 1
    if check(protocol, ip, port, config.normal_timeout):
        return True, 0
    return False, None
def check(protocol, ip, port, timeout=0.5):
    """
    Internal helper: probe whether a proxy actually works.

    :param protocol: 'http' or 'https' (case-insensitive)
    :param ip: proxy host
    :param port: proxy port
    :param timeout: request timeout in seconds
    :return: True if the probe request through the proxy returned HTTP 200
    """
    protocol = protocol.lower()
    try:
        # Pick the probe URL matching the proxy's protocol: an HTTPS proxy must
        # be tested against an HTTPS URL and an HTTP proxy against an HTTP URL.
        target_url = config.target_https_url
        if protocol == 'http':
            target_url = config.target_http_url
        resp = requests.get(target_url,
                            timeout=timeout,
                            proxies={
                                protocol: concat_proxy((protocol, ip, port))
                            },
                            headers={
                                'User-Agent': config.default_user_agent,
                            })
        return resp.status_code == 200
    except Exception:
        # Deliberate best-effort: any network failure (timeout, connection
        # refused, SSL error, ...) simply means the proxy is unusable.
        pass
    return False
def crawler_task():
    """Crawl every target page, validate each proxy and store the valid ones."""
    for item in get_all_proxies():
        # item is a (protocol, ip, port) tuple; validate returns (flag, level).
        flag, level = validate(*item)
        if not flag:
            continue
        # Valid proxy: add it to the pool under its quality level.
        dp.save(concat_proxy(item), level)
    print('[{}]完成爬虫任务'.format(time.ctime()))
def validate_task():
    """Re-validate every proxy already in the pool, pruning or reclassifying."""
    # Iterate over the stored (proxy_url, level) pairs.
    for proxy, level in dp.query():
        # BUG FIX: the original pattern r'(?/|:)' is an invalid regex ('(?' starts
        # a group extension). Split 'protocol://ip:port' back into its parts: the
        # capturing group keeps the separators and [::2] drops them.
        flag, level2 = validate(*re.split(r'(://|:)', proxy)[::2])
        if not flag:
            # No longer working: remove it from the pool.
            dp.remove(proxy, level)
        else:
            # Still valid: (re)save under the freshly measured level.
            dp.save(proxy, level2)
            # If the quality level changed, drop it from the old level's set.
            if level != level2:
                dp.remove(proxy, level)
    print('[{}]完成校验任务'.format(time.ctime()))
if __name__ == '__main__':
    # BUG FIX: the guard must test __name__ == '__main__'. Also, `schedule`
    # was used but never imported anywhere in the file; imported locally here
    # because it is only needed when running as a script.
    import schedule

    print('[{}]启动代理IP池维护任务'.format(time.ctime()))
    # Run the crawler once immediately — the scheduled jobs only fire after
    # their first interval elapses.
    crawler_task()
    # Schedule the recurring crawl task.
    schedule.every(config.crawler_task_interval).seconds.do(crawler_task)
    # Schedule the recurring proxy-validation task.
    schedule.every(config.validate_task_interval).seconds.do(validate_task)
    try:
        while True:
            schedule.run_pending()
            time.sleep(1)
    finally:
        # BUG FIX: the original placed smoke-test code and browser.quit() after
        # the infinite loop, where they could never run. Ensure the Selenium
        # browser is closed when the process exits (e.g. on Ctrl-C).
        browser.quit()