python使用selenium模拟浏览器爬取动态网页的小说

最新推荐文章于 2024-02-28 11:34:14 发布

王机子的学习机

最新推荐文章于 2024-02-28 11:34:14 发布

阅读量294

点赞数

分类专栏：python实战　文章标签：python、selenium、爬虫、网络爬虫

本文链接：https://blog.csdn.net/m0_70274160/article/details/124873036

版权

python实战专栏收录该内容

2 篇文章 0 订阅

订阅专栏

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
import os,re
from time import sleep

class GetAjaxWeb:
    """Scrape a JavaScript-rendered novel site chapter by chapter with Edge.

    The instance owns one Selenium Edge driver (``self.driver``). Pages are
    loaded with ``getWeb``, read with ``getHtml``, parsed and advanced with
    ``analysisHtml``, and appended to a text file with ``saveTxt``.
    """

    def __init__(self, driver_path=r"D:\Program\Python\edgedriver_win64\msedgedriver.exe"):
        """Start an Edge browser session.

        Args:
            driver_path: Location of the ``msedgedriver`` executable. Defaults
                to the path the script was originally written for.
        """
        options = Options()
        options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
        # options.add_argument("headless")

        service = Service(driver_path)

        self.driver = webdriver.Edge(service=service, options=options)
        # Element lookups wait up to 10 seconds before giving up.
        self.driver.implicitly_wait(10)

    def getWeb(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def getHtml(self):
        """Return the page source once the element with class ``text`` has text.

        Polls every 0.5 s; there is no upper bound on the wait.
        """
        while not self.driver.find_element(By.CLASS_NAME, 'text').text:
            sleep(0.5)
        return self.driver.page_source

    def openHtml(self, html, name='out.html'):
        """Write ``html`` to the file ``name`` (UTF-8), replacing its contents."""
        # The context manager closes the handle so the data is flushed to disk.
        with open(name, 'w', encoding='utf-8') as f:
            f.write(html)
        # os.startfile(name)

    def analysisHtml(self, url, html):
        """Extract the chapter from ``html`` and click through to the next page.

        Args:
            url: URL of the page ``html`` was read from; it is appended to the
                chapter text so a later run can resume from it.
            html: Page source as returned by ``getHtml``.

        Returns:
            A tuple ``(text, label, next_url)``: the formatted chapter text,
            the string ``"<url> <chapter title>"``, and the browser URL after
            clicking the next-page link, or ``None`` when the page has no such
            link.

        Raises:
            AttributeError: If the title or text ``<div>`` is not found.
        """
        # A URL read back from the save file may carry a trailing newline;
        # remove it so the saved text keeps a fixed trailing layout.
        url = url.strip()

        # Chapter title and body.
        chapter = re.search(r'<div class="title">(.*?)<', html).group(1)
        print(f'正在获取 {chapter}')
        content = re.search(r'<div class="text">[\w\W]*?</div>', html).group()
        content = re.sub(r'<p>', '\n', content)
        # Drop full-width (ideographic) spaces used as paragraph indentation.
        content = re.sub(r'　', '', content)
        # Remove remaining tags and HTML entities.
        content = re.sub(r'(<.*?>)|(&.*?;)', '', content)
        content = re.sub(r'\n\n', r'\n', content)
        content = re.sub(r'\t', r' ', content)
        text = chapter + '\n' + content + '\n' + url + '\n\n'
        label = url + ' ' + chapter

        # find_elements returns an empty list instead of raising when nothing
        # matches, which lets the last page end the crawl cleanly.
        next_links = self.driver.find_elements(By.XPATH, '//*[text()="下一页" or text()="下一章"]')
        if not next_links:
            return text, label, None

        # Click through JavaScript rather than a native click.
        self.driver.execute_script('arguments[0].click();', next_links[0])
        sleep(1)

        return text, label, self.driver.current_url

    def saveTxt(self, text, url, name='踏星2.txt'):
        """Append ``text`` to ``name`` unless ``url`` already occurs in it.

        Args:
            text: Text to append.
            url: Marker string searched for in the existing file contents.
            name: Path of an existing UTF-8 text file.

        Returns:
            ``True`` if ``text`` was written, ``False`` if ``url`` was found.

        Raises:
            FileNotFoundError: If ``name`` does not exist (mode ``r+``).
        """
        with open(name, 'r+', encoding='utf-8') as f:
            # The first 150 characters are excluded from the search
            # (presumably a file header -- TODO confirm). Reading to EOF
            # leaves the position at the end, so the write below appends.
            if url in f.read()[150:]:
                return False
            f.write(text)
            return True

    def getStartUrl(self, name='踏星2.txt'):
        """Return the second-to-last line of ``name`` without surrounding whitespace.

        ``analysisHtml`` ends every chapter with ``url + '\\n\\n'``, so that
        line is the URL of the last saved chapter.

        Raises:
            IndexError: If the file has fewer than two lines.
        """
        with open(name, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Strip the line terminator so the value is a clean URL.
        return lines[-2].strip()

if __name__=='__main__':
    # Resume crawling from the last URL stored in the save file and keep
    # following the next-page link until no further URL is returned.
    g=GetAjaxWeb()
    try:
        # The stored line may end with a newline; strip it so the URL written
        # back into the save file stays on a single line.
        nextUrl=g.getStartUrl().strip()
        g.getWeb(nextUrl)
        while nextUrl:
            pageUrl=nextUrl
            html=g.getHtml()
            # Keep a copy of the latest page source for debugging.
            g.openHtml(html)
            text,_label,nextUrl=g.analysisHtml(pageUrl,html)
            # Each saved chapter ends with its URL on its own line, so
            # "URL + newline" is the marker that shows a page is already in
            # the file. This stops the resumed chapter being appended twice.
            g.saveTxt(text,pageUrl+'\n')
    finally:
        # Always shut the browser and driver process down, even on errors.
        g.driver.quit()

王机子的学习机

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python使用selenium模拟浏览器爬取动态网页的小说

from time import sleep; from selenium import webdriver; from selenium.webdriver.common.by import By; from selenium.webdriver.edge.options import Options; from selenium.webdriver.edge.service import Service; from selenium.webdriver.support.ui import WebDriverWait …
复制链接

扫一扫