Python + Selenium 简单模拟浏览新闻网站和文章

最新推荐文章于 2022-07-31 16:20:47 发布

laoyouzhazi

最新推荐文章于 2022-07-31 16:20:47 发布

阅读量248

点赞数

分类专栏： Python 文章标签： python selenium 模拟浏览网页容器管理

本文为博主原创文章，未经博主允许不得转载。

本文链接：https://blog.csdn.net/qq_21264377/article/details/110579807

版权

Python 专栏收录该内容

159 篇文章 1 订阅

订阅专栏

本文介绍了如何使用 Python 结合 Selenium 库模拟浏览新闻网站和文章，并通过 Engine 类的代码展示具体操作。涉及的辅助文件包括 article.py、UrlUtil.py（辅助工具）和 module.py（自定义新闻网站列表）。

摘要由CSDN通过智能技术生成

Engine类的代码如下：

"""
Author: MR.N
Created: 2020/11/27 Friday
		 2020年11月27日 星期五

"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ECS
from selenium.webdriver.support.wait import WebDriverWait
from module import *
from UrlUtil import __pad__, __replace_separator__, __is_host__, gethost
from article import *
import random
import re
import traceback
import time


# webdriver管理容器: Engine类
class Engine:
    """Container that owns a Selenium webdriver and browses news sites with it.

    A run (``__run__``) samples hosts from ``NEWS_HOSTS``, loads each one,
    harvests ``<a href>`` links from the page source, sorts them into article
    links and host links, and finally opens every collected article
    (``__iterate__``).

    Progress messages go to an optional logger object; anything exposing
    ``insert(index, text)`` works.
    """

    # Captures (href, link text) for every <a ... href="...">text</a> element.
    _ANCHOR_PATTERN = re.compile(
        r'<a[^>]*href="([^>"]+)"[^>]*>([^<>]+)</a>', re.I | re.M | re.S)

    # Smoothly scrolls to the middle of the page shortly after being injected.
    _SCROLL_HALFWAY_JS = 'setTimeout(function(){window.scrollTo({top:document.body.scrollHeight/2, behavior:"smooth", duration:350});}, 100);'

    def __init__(self, driver=0, visits=10):
        """Create an idle engine; no browser is started until ``__start__``.

        Args:
            driver: browser selector, 1 means Firefox, anything else Chrome.
            visits: visit-count selector interpreted by ``__run__``
                (1 -> 10 hosts, 2 -> 20, 3 -> 50, any other value -> all).
        """
        self.paused = False
        self.__driver_mode__ = driver
        self.__max_visits__ = visits
        self.engine = None      # the live webdriver, or None when stopped
        self.articles = {}      # title -> article URL
        self.hosts = {}         # title -> host URL
        self.logger = None

    def __set_logger__(self, logger):
        """Attach the object that receives log lines through ``insert``."""
        self.logger = logger

    def __get_logger__(self):
        """Return True when a logger is attached (not the logger itself)."""
        return self.logger is not None

    def __set_driver_mode__(self, driver):
        """Select the browser used by the next ``__start__``."""
        self.__driver_mode__ = driver

    def __get_driver_mode__(self):
        """Return the current browser selector."""
        return self.__driver_mode__

    def __set_max_visits__(self, visitsites):
        """Set the visit-count selector used by ``__run__``."""
        self.__max_visits__ = visitsites

    def __get_max_visits__(self):
        """Return the visit-count selector."""
        return self.__max_visits__

    def __log__(self, message):
        """Send ``message`` plus a newline to the logger, if one is attached."""
        if self.__get_logger__():
            # NOTE(review): index 0. presumably prepends (newest line first),
            # e.g. on a Tk Text-like widget -- confirm against the caller.
            self.logger.insert(0., message + '\n')

    def __pause__(self):
        """Ask the running loops to stop at their next checkpoint."""
        self.paused = True

    def __is_alive__(self):
        """Return True while the engine has not been paused."""
        return not self.paused

    def __resume__(self):
        """Clear the pause flag."""
        self.paused = False

    def __delete__(self):
        """Quit the browser and drop all collected state.

        NOTE(review): despite the name, this is a plain method that must be
        called explicitly; it is neither ``__del__`` nor a working descriptor
        hook (the descriptor protocol would pass an extra ``instance``
        argument). The name is kept for existing callers.
        """
        if self.engine:
            try:
                self.engine.quit()
            finally:
                # Clear state even when quit() raises.
                self.engine = None
                self.articles = None
                self.hosts = None

    def __run__(self):
        """Visit a random sample of news hosts, then read the found articles."""
        total = len(NEWS_HOSTS)
        selector = self.__get_max_visits__()
        # Selector -> host count; every other value (including <= 0) means
        # "all hosts". The result is capped by the number of known hosts.
        visits = min({1: 10, 2: 20, 3: 50}.get(selector, total), total)
        sampled = random.sample(NEWS_HOSTS, k=visits)
        self.articles = {}
        self.hosts = {}
        for position, host in enumerate(sampled, start=1):
            if not self.__is_alive__():
                break
            try:
                self.__log__('visiting [' + str(position) + '/'
                             + str(len(sampled)) + '] ' + host)
                # Visit the news site. Guard against a None result so that a
                # page without links is not reported as an error.
                articles = self.__visit__(host) or []
                for article in articles:
                    source = __pad__(article.getsource(), host)
                    # Links pointing at a bare host are kept separately.
                    if __is_host__(source):
                        self.hosts[article.gettitle()] = source
                    else:
                        self.articles[article.gettitle()] = source
                self.__log__('ended with ' + str(len(articles)) + ' link(s)')
                time.sleep(1)
            except Exception:
                # Best effort: one failing host must not abort the whole run.
                self.__log__('err with ' + host)
                traceback.print_exc()
        self.__iterate__()

    def __iterate__(self):
        """Open every collected article, then log the collected hosts."""
        # Reading does not depend on a logger being attached; __log__ already
        # ignores messages when there is none.
        if self.articles:
            for index, (title, source) in enumerate(self.articles.items(), start=1):
                if not self.__is_alive__():
                    # Paused: stop instead of loading every remaining page.
                    break
                self.__log__('reading [' + str(index) + '] ' + title + ', ' + source)
                # Browse the article page.
                self.__read__(title, source)
                time.sleep(.25)
        else:
            self.__log__('None')
        if self.hosts:
            # END is logged before START on purpose: see the note in __log__
            # about lines presumably being prepended.
            self.__log__('-' * 30 + '|HOST - END|' + '-' * 30)
            for index, (title, source) in enumerate(self.hosts.items(), start=1):
                self.__log__('[' + str(index) + '] ' + title + ', ' + source)
            self.__log__('-' * 30 + '|HOST - START|' + '-' * 30)
        else:
            self.__log__('no host found')

    # Visit a site
    def __visit__(self, host):
        """Load ``host`` and return the articles linked from its page.

        Args:
            host: absolute ``http://`` or ``https://`` URL.

        Returns:
            A list of ``Article`` objects; empty when the URL scheme is not
            http(s), the engine is paused, nothing was found, or any error
            occurred.
        """
        if not host.startswith(('http://', 'https://')):
            return []
        try:
            wait = WebDriverWait(self.engine, 5)
            self.engine.implicitly_wait(5)
            try:
                self.engine.get(host)
            except Exception:
                # Deliberately ignored: the 5s page-load timeout set at start
                # fires on slow sites; continue with whatever has loaded.
                pass
            time.sleep(3)
            if not self.__is_alive__():
                return []
            try:
                body = wait.until(
                    ECS.presence_of_element_located((By.TAG_NAME, 'body')))
                if not body:
                    self.__log__('Target element not found.')
                    return []
            except Exception:
                # Lenient policy: a wait failure does not abort the visit.
                pass
            self.engine.execute_script(self._SCROLL_HALFWAY_JS)
            time.sleep(.75)
            # __get_article__ yields None for a page without links.
            return self.__get_article__(self.engine.page_source) or []
        except Exception:
            traceback.print_exc()
            self.__log__('err with ' + host)
            return []

    # Parse anchor tags to obtain the article list
    def __get_article__(self, pagesource):
        """Extract ``Article`` objects from the anchors in ``pagesource``.

        Anchors are skipped when the link is a ``javascript:`` or ``#`` link
        or longer than 1024 characters, or when the cleaned title is shorter
        than 6 or longer than 200 characters.

        Returns:
            A list of ``Article`` objects, or None when the page contains no
            matching anchor at all.
        """
        sources = self._ANCHOR_PATTERN.findall(pagesource)
        if not sources:
            self.__log__('[err] no source')
            return None
        articles = []
        for href, text in sources:
            link = href.strip()
            if link.startswith(('javascript:', '#')) or len(link) > 1024:
                continue
            title = __replace_separator__(text.strip())
            if not title or len(title) < 6 or len(title) > 200:
                continue
            articles.append(Article(title, link))
        return articles

    def __read__(self, title=None, source=None):
        """Open one article page and linger on it for a few seconds.

        Does nothing when either argument is None, when ``source`` is not an
        http(s) URL or ends with ``#``, or when ``title`` starts with a single
        quote. Errors are logged and printed, never raised.
        """
        try:
            if (title is None or source is None
                    or not source.startswith(('http://', 'https://'))
                    or source.endswith('#') or title.startswith("'")):
                return
            # Same 5s timeouts as used when the webdriver was initialised.
            wait = WebDriverWait(self.engine, 5)
            self.engine.implicitly_wait(5)
            try:
                self.engine.get(source)
            except Exception:
                # Deliberately ignored: slow pages hit the page-load timeout.
                pass
            time.sleep(3)
            if not self.__is_alive__():
                return
            try:
                body = wait.until(
                    ECS.presence_of_element_located((By.TAG_NAME, 'body')))
                # Check whether the page has started rendering.
                if not body:
                    # Lenient policy: log it but keep going.
                    self.__log__('Target element not found.')
            except Exception:
                pass
            time.sleep(2.66)
        except Exception:
            traceback.print_exc()
            self.__log__('err with ' + str(source))

    def __reset__(self):
        """Quit any running browser and clear the collected links."""
        try:
            if self.engine:
                self.engine.quit()
        except Exception:
            traceback.print_exc()
        finally:
            self.engine = None
            self.articles = {}
            # Hosts belong to the same run as the articles; clear both.
            self.hosts = {}

    # Choose which browser to use
    def __start__(self):
        """Restart the engine with the browser chosen by the driver mode."""
        self.__reset__()
        if self.__driver_mode__ == 1:
            self.__start_firefox__()
        else:
            # Mode 0 and every unknown mode fall back to Chrome.
            self.__start_chrome__()

    def _apply_timeouts(self):
        """Limit page loads and injected scripts to 5 seconds each."""
        self.engine.set_page_load_timeout(5)
        self.engine.set_script_timeout(5)

    # Use the Chrome browser
    def __start_chrome__(self):
        """Launch Chrome and apply the standard timeouts."""
        self.engine = webdriver.Chrome()
        self._apply_timeouts()

    # Use the Firefox browser
    def __start_firefox__(self):
        """Launch Firefox and apply the standard timeouts."""
        self.engine = webdriver.Firefox()
        self._apply_timeouts()

article.py、UrlUtil.py和module.py略，其中module.py为自定义新闻网站列表。

laoyouzhazi

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python + Selenium 简单模拟浏览新闻网站和文章

""" Author: MR.N Created: 2020/11/27 Friday 2020年11月27日 星期五 """ from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as ECS from selenium.webdriver.support.wait import W…
复制链接

扫一扫

专栏目录