Engine类的代码如下:
"""
Author: MR.N
Created: 2020/11/27 Friday
2020年11月27日 星期五
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ECS
from selenium.webdriver.support.wait import WebDriverWait
from module import *
from UrlUtil import __pad__, __replace_separator__, __is_host__, gethost
from article import *
import random
import re
import traceback
import time
# webdriver管理容器: Engine类
class Engine:
    """Webdriver container that crawls news sites and then opens the articles.

    A run (``__run__``) samples hosts from ``NEWS_HOSTS``, loads each one in
    the browser, extracts ``<a>`` links from the page source and sorts them
    into ``self.hosts`` (links that ``__is_host__`` accepts) and
    ``self.articles`` (everything else), both keyed by link title.  It then
    opens every collected article in the browser (``__iterate__``).

    Attributes:
        paused: When True, running loops stop at their next check.
        engine: The active Selenium webdriver, or None before ``__start__``.
        articles: ``{title: url}`` for article links found in the last run.
        hosts: ``{title: url}`` for links classified as site hosts.
        logger: Optional sink with an ``insert(index, text)`` method
            (presumably a Tk text widget -- confirm against the caller).
    """

    # Maps the ``visits`` level to the number of hosts sampled per run.  Any
    # level not listed here (including the default 10) means "all hosts".
    _VISIT_LEVELS = {1: 10, 2: 20, 3: 50}

    # Seconds used for the explicit wait, the implicit wait and the
    # page-load/script timeouts.
    _WAIT_SECONDS = 5

    # Captures (href, text) for anchors whose content is plain text.
    _LINK_PATTERN = re.compile(
        '<a[^>]*href="([^>"]+)"[^>]*>([^<>]+)</a>', re.I | re.M | re.S)

    def __init__(self, driver=0, visits=10):
        """Create an idle engine.

        Args:
            driver: Browser selector; 1 starts Firefox, anything else Chrome.
            visits: Visit level, translated to a host count by ``__run__``.
        """
        self.paused = False
        self.__driver_mode__ = driver
        self.__max_visits__ = visits
        self.engine = None
        self.articles = {}
        self.hosts = {}
        self.logger = None

    def __set_logger__(self, logger):
        """Attach the object that receives log lines."""
        self.logger = logger

    def __get_logger__(self):
        """Return True when a logger is attached (not the logger itself)."""
        return self.logger is not None

    def __set_driver_mode__(self, driver):
        """Set the browser selector used by ``__start__``."""
        self.__driver_mode__ = driver

    def __get_driver_mode__(self):
        """Return the browser selector."""
        return self.__driver_mode__

    def __set_max_visits__(self, visitsites):
        """Set the visit level used by ``__run__``."""
        self.__max_visits__ = visitsites

    def __get_max_visits__(self):
        """Return the visit level."""
        return self.__max_visits__

    def __log__(self, message):
        """Insert ``message`` plus a newline at index 0. of the logger, if any."""
        if self.__get_logger__():
            self.logger.insert(0., message + '\n')

    def __pause__(self):
        """Ask running loops to stop at their next liveness check."""
        self.paused = True

    def __is_alive__(self):
        """Return True while the engine is not paused."""
        return not self.paused

    def __resume__(self):
        """Clear the paused flag."""
        self.paused = False

    def __delete__(self):
        """Quit the browser and drop the collected link tables.

        NOTE: ``__delete__`` is the descriptor-protocol hook name in Python;
        here it is an ordinary teardown method and must be called explicitly.
        The name is kept for existing callers.
        """
        if self.engine:
            try:
                self.engine.quit()
            finally:
                self.engine = None
        self.articles = None
        self.hosts = None

    def __run__(self):
        """Visit a random sample of news hosts, then read the found articles."""
        total = len(NEWS_HOSTS)
        # Unknown levels fall back to every host; never ask for more hosts
        # than exist, which random.sample would reject.
        wanted = self._VISIT_LEVELS.get(self.__get_max_visits__(), total)
        hosts = random.sample(NEWS_HOSTS, k=min(wanted, total))
        self.articles = {}
        self.hosts = {}
        for key, host in enumerate(hosts):
            try:
                if not self.__is_alive__():
                    break
                self.__log__(
                    'visiting [{}/{}] {}'.format(key + 1, len(hosts), host))
                # Visit the news site.
                articles = self.__visit__(host)
                if articles:
                    for article in articles:
                        source = __pad__(article.getsource(), host)
                        # Separate site hosts from article pages.
                        if __is_host__(source):
                            self.hosts[article.gettitle()] = source
                        else:
                            self.articles[article.gettitle()] = source
                    self.__log__(
                        'ended with {} link(s)'.format(len(articles)))
                time.sleep(1)
            except Exception:
                # One failing host must not abort the whole run.  Only
                # Exception is caught so Ctrl-C / SystemExit still propagate.
                self.__log__('err with ' + host)
                traceback.print_exc()
        self.__iterate__()

    def __iterate__(self):
        """Open every collected article, then log the collected hosts.

        NOTE(review): articles are only read when a logger is attached; this
        gate is preserved from the original code -- confirm it is intended.
        """
        if self.__get_logger__() and self.articles:
            for index, (title, source) in enumerate(self.articles.items(), 1):
                # Honour a pause request before loading the next page rather
                # than loading every remaining article first.
                if not self.__is_alive__():
                    break
                self.__log__(
                    'reading [{}] {}, {}'.format(index, title, source))
                # Browse the article page.
                self.__read__(title, source)
                time.sleep(.25)
        else:
            self.__log__('None')
        if self.__get_logger__() and self.hosts:
            # __log__ inserts at index 0., so the END banner is written first;
            # presumably the sink shows newest lines on top -- confirm.
            self.__log__('-' * 30 + '|HOST - END|' + '-' * 30)
            for index, (title, source) in enumerate(self.hosts.items(), 1):
                self.__log__('[{}] {}, {}'.format(index, title, source))
            self.__log__('-' * 30 + '|HOST - START|' + '-' * 30)
        else:
            self.__log__('no host found')

    # Visit a site.
    def __visit__(self, host):
        """Load ``host`` in the browser and return the links found on it.

        Args:
            host: Absolute http(s) URL of the site.

        Returns:
            A list of ``Article`` objects; empty when the URL is not http(s),
            the engine was paused, or anything went wrong.
        """
        if not host.startswith(('http://', 'https://')):
            return []
        try:
            wait = WebDriverWait(self.engine, self._WAIT_SECONDS)
            self.engine.implicitly_wait(self._WAIT_SECONDS)
            try:
                self.engine.get(host)
            except Exception:
                # Best effort: a failed or timed-out load is ignored and the
                # method carries on with whatever the browser has.
                pass
            time.sleep(3)
            if not self.__is_alive__():
                return []
            try:
                body = wait.until(
                    ECS.presence_of_element_located((By.TAG_NAME, 'body')))
                if not body:
                    self.__log__('Target element not found.')
                    return []
            except Exception:
                # Lenient policy: continue even if <body> never showed up.
                pass
            # Scroll halfway down the page shortly after the call returns.
            scroll_js = (
                'setTimeout(function(){window.scrollTo('
                '{top:document.body.scrollHeight/2, '
                'behavior:"smooth", duration:350});}, 100);')
            self.engine.execute_script(scroll_js)
            time.sleep(.75)
            return self.__get_article__(self.engine.page_source)
        except Exception:
            traceback.print_exc()
            self.__log__('err with ' + host)
            return []

    # Parse anchor tags and build the article list.
    def __get_article__(self, pagesource):
        """Extract candidate articles from raw HTML.

        Links starting with ``javascript:`` or ``#``, links longer than 1024
        characters and titles shorter than 6 or longer than 200 characters
        (after ``__replace_separator__``) are skipped.

        Args:
            pagesource: HTML text of the page.

        Returns:
            A list of ``Article(title, link)``; empty when nothing matched.
        """
        sources = self._LINK_PATTERN.findall(pagesource)
        if not sources:
            self.__log__('[err] no source')
            # Return an empty list (not None) like every other failure path.
            return []
        articles = []
        for href, text in sources:
            link = href.strip()
            if link.startswith(('javascript:', '#')) or len(link) > 1024:
                continue
            title = __replace_separator__(text.strip())
            if not title or not 6 <= len(title) <= 200:
                continue
            articles.append(Article(title, link))
        return articles

    def __read__(self, title=None, source=None):
        """Open one article page in the browser and linger on it.

        Nothing is returned; errors are printed and logged.  The call is a
        no-op when either argument is None, when ``source`` is not an http(s)
        URL or ends with ``#``, or when ``title`` starts with a single quote.
        """
        try:
            if title is None or source is None:
                return
            if (not source.startswith(('http://', 'https://'))
                    or source.endswith('#') or title.startswith("'")):
                return
            # Same 5 s waits as used when visiting a host.
            wait = WebDriverWait(self.engine, self._WAIT_SECONDS)
            self.engine.implicitly_wait(self._WAIT_SECONDS)
            try:
                self.engine.get(source)
            except Exception:
                # Best effort: ignore a failed or timed-out load.
                pass
            time.sleep(3)
            if not self.__is_alive__():
                return
            try:
                body = wait.until(
                    ECS.presence_of_element_located((By.TAG_NAME, 'body')))
                # Check whether the page has started rendering.
                if not body:
                    # Lenient policy: log it but do not abort the read.
                    self.__log__('Target element not found.')
            except Exception:
                pass
            time.sleep(2.66)
        except Exception:
            traceback.print_exc()
            self.__log__('err with ' + source)

    def __reset__(self):
        """Quit any running browser and clear both link tables."""
        try:
            if self.engine:
                self.engine.quit()
        except Exception:
            traceback.print_exc()
        finally:
            self.engine = None
            self.articles = {}
            # Also drop hosts so no stale entries survive a restart.
            self.hosts = {}

    # Choose which browser to use.
    def __start__(self):
        """Reset state and launch the browser chosen by the driver mode."""
        self.__reset__()
        if self.__driver_mode__ == 1:
            self.__start_firefox__()
        else:
            # Mode 0 and every unknown mode use Chrome.
            self.__start_chrome__()

    def _apply_timeouts(self):
        """Set the page-load and script timeouts on the current driver."""
        self.engine.set_page_load_timeout(self._WAIT_SECONDS)
        self.engine.set_script_timeout(self._WAIT_SECONDS)

    # Use the Chrome browser.
    def __start_chrome__(self):
        """Launch Chrome and apply the 5 s timeouts."""
        self.engine = webdriver.Chrome()
        self._apply_timeouts()

    # Use the Firefox browser.
    def __start_firefox__(self):
        """Launch Firefox and apply the 5 s timeouts."""
        self.engine = webdriver.Firefox()
        self._apply_timeouts()
article.py、UrlUtil.py和module.py略,其中module.py为自定义新闻网站列表。