from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
import os,re
from time import sleep
class GetAjaxWeb:
    """Drive an Edge browser through a web novel and save each chapter to a text file.

    The page content is rendered dynamically, so every chapter is read from a
    live Selenium session rather than from a plain HTTP response.
    """

    # Driver location used when the caller does not supply one.
    DEFAULT_DRIVER_PATH = r"D:\Program\Python\edgedriver_win64\msedgedriver.exe"

    def __init__(self, driver_path=None, headless=False):
        """Start an Edge WebDriver session.

        Args:
            driver_path: Path to ``msedgedriver``; falls back to
                ``DEFAULT_DRIVER_PATH`` when omitted.
            headless: When true, pass the ``headless`` argument to Edge.
        """
        options = Options()
        options.add_experimental_option(
            "excludeSwitches", ['enable-automation', 'enable-logging'])
        if headless:
            options.add_argument("headless")
        service = Service(driver_path or self.DEFAULT_DRIVER_PATH)
        self.driver = webdriver.Edge(service=service, options=options)
        # Element lookups wait up to 10 seconds before giving up.
        self.driver.implicitly_wait(10)

    def close(self):
        """Quit the browser session started by ``__init__``."""
        self.driver.quit()

    def getWeb(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def getHtml(self):
        """Return the page source once the element with class ``text`` has text.

        Polls every 0.5 seconds and does not return while that element's
        text stays empty.
        """
        while not self.driver.find_element(By.CLASS_NAME, 'text').text:
            sleep(0.5)
        return self.driver.page_source

    def openHtml(self, html, name='out.html'):
        """Write ``html`` to the file ``name`` (UTF-8), replacing its contents."""
        with open(name, 'w', encoding='utf-8') as f:
            f.write(html)

    @staticmethod
    def _parse_chapter(html):
        """Extract ``(chapter_title, plain_text_body)`` from a chapter page.

        Raises:
            AttributeError: If the title or text ``div`` is not present in
                ``html`` (``re.search`` returned ``None``).
        """
        chapter = re.search(r'<div class="title">(.*?)<', html).group(1)
        content = re.search(r'<div class="text">[\w\W]*?</div>', html).group()
        # Each paragraph start becomes a line break.
        content = re.sub(r'<p>', '\n', content)
        # Drop spaces before stripping the remaining tags and HTML entities.
        content = re.sub(r' ', '', content)
        content = re.sub(r'(<.*?>)|(&.*?;)', '', content)
        content = re.sub(r'\n\n', r'\n', content)
        content = re.sub(r'\t', r' ', content)
        return chapter, content

    def analysisHtml(self, url, html):
        """Extract the chapter from ``html`` and advance the browser.

        Args:
            url: Address of the page that produced ``html``; it is appended
                to the saved text so a later run can resume from it.
            html: Page source of the chapter.

        Returns:
            A tuple ``(text, key, next_url)``: ``text`` is the block to append
            to the output file, ``key`` is ``url + ' ' + chapter``, and
            ``next_url`` is the browser's URL after clicking the "next" link,
            or ``None`` when the page has no such link.
        """
        # Extract the chapter title and body text.
        chapter, content = self._parse_chapter(html)
        print(f'正在获取 {chapter}')
        text = chapter + '\n' + content + '\n' + url + '\n\n'
        # NOTE(review): this key puts a space between URL and title, while the
        # saved text puts the URL on its own line after the body, so a
        # substring search for the key may never match - confirm intent.
        key = url + ' ' + chapter

        # Look for the link to the next page/chapter. find_elements returns an
        # empty list instead of raising, so the last chapter ends the crawl
        # cleanly instead of with an exception.
        next_links = self.driver.find_elements(
            By.XPATH, '//*[text()="下一页" or text()="下一章"]')
        if not next_links:
            return text, key, None
        # Click through JavaScript rather than a native click.
        self.driver.execute_script('arguments[0].click();', next_links[0])
        sleep(1)
        return text, key, self.driver.current_url

    def saveTxt(self, text, url, name='踏星2.txt'):
        """Append ``text`` to ``name`` unless ``url`` is already in the file.

        Args:
            text: Block to append.
            url: Marker string searched for in the existing file content.
            name: Output file; it must already exist (opened in ``r+`` mode).

        Returns:
            ``True`` if ``text`` was written, ``False`` if the marker was found.
        """
        with open(name, 'r+', encoding='utf-8') as f:
            # The first 150 characters are excluded from the search -
            # presumably a file header; confirm against the real file.
            if url in f.read()[150:]:
                return False
            # The read left the position at end of file, so this appends.
            f.write(text)
            return True

    def getStartUrl(self, name='踏星2.txt'):
        """Return the second-to-last line of ``name`` without surrounding whitespace.

        Each block built by ``analysisHtml`` ends with the URL followed by an
        empty line, so this line is the URL of the last saved chapter.

        Raises:
            IndexError: If the file has fewer than two lines.
        """
        with open(name, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Strip the trailing newline so it does not leak into the navigation
        # target or into the text written for the next chapter.
        return lines[-2].strip()
if __name__ == '__main__':
    # Resume from the URL recorded at the end of the output file and walk the
    # chapters one by one, appending each to the text file.
    g = GetAjaxWeb()
    try:
        nextUrl = g.getStartUrl()
        g.getWeb(nextUrl)
        # Runs for as long as analysisHtml hands back a truthy next URL.
        while nextUrl:
            html = g.getHtml()
            # Dump the latest page source to out.html (default name).
            g.openHtml(html)
            text, url, nextUrl = g.analysisHtml(nextUrl, html)
            g.saveTxt(text, url)
    finally:
        # Always shut the browser down, even when scraping fails midway, so
        # no Edge/msedgedriver process is left behind.
        g.driver.quit()
python使用selenium模拟浏览器爬取动态网页的小说
最新推荐文章于 2024-02-28 11:34:14 发布