python 爬虫小说使用无头浏览器 + 自动化爬虫

fuchto

已于 2022-07-18 16:55:43 修改

阅读量746

点赞数

文章标签： python chrome 开发语言

于 2022-07-18 15:42:47 首次发布

本文链接：https://blog.csdn.net/fuchto/article/details/125851402

版权

该博客介绍了如何利用Python的Selenium库和无头浏览器进行网络爬虫，目标是获取指定小说的详情信息，并将其存储到MySQL数据库中。首先，通过POST请求搜索小说，然后解析HTML找到小说链接，接着访问小说详情页获取目录和章节链接。最后，使用无头Chrome浏览器逐章抓取内容并保存为TXT文件。整个过程中涉及了正则表达式、 BeautifulSoup解析及数据库操作。

摘要由CSDN通过智能技术生成

仅供学习，请勿用于商业行为！未经允许，请勿转载。

首先获取搜索接口、请求方法和请求参数；当前接口使用的是 POST 方法。

请求参数为

获取对应小说的详情介绍页

对应类、对应浏览器驱动获取方法

python selenium4 使用无界面浏览器爬虫并存储 mysql 数据库_fuchto的博客-CSDN博客_python 无界面浏览器：浏览器驱动需要查看对应浏览器版本进行下载（selenium · PyPI https://pypi.org/project/selenium/ ），在浏览器设置中查看当前版本。from selenium import webdriver; from selenium.webdriver.chrome.service import Service; from selenium.webdriver.common.by import By; # select 选择框需要引入 select 类 fro... https://blog.csdn.net/fuchto/article/details/124480885?spm=1001.2014.3001.5502

废话不多说直接上代码

import requests
from bs4 import BeautifulSoup
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# select 选择框需要引入 select 类
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import math
import json

def bubbleSort(arr):
    """Sort ``arr`` in place in ascending order and return it.

    The name is kept for backward compatibility, but the work is delegated to
    the built-in ``list.sort``, which is stable (equal elements keep their
    relative order, exactly as the previous hand-written bubble sort did) and
    runs in O(n log n) instead of O(n^2).

    Args:
        arr: A mutable list whose elements can be compared with each other.

    Returns:
        The same list object that was passed in, now sorted.
    """
    arr.sort()
    return arr

def str_replaces(arr):
    """Return a new list with every ".html" in each string replaced by a space.

    The input list is left untouched; callers are expected to strip the
    resulting whitespace themselves if they need a bare identifier.
    """
    return [entry.replace(".html", ' ') for entry in arr]

def _chapter_sort_key(chapter_id):
    """Order purely numeric chapter ids by numeric value; other ids sort after them."""
    if chapter_id.isdecimal():
        return (0, int(chapter_id), chapter_id)
    return (1, 0, chapter_id)


def _safe_filename(name):
    """Replace characters that are not allowed in Windows file names (and '：') with spaces."""
    return re.sub(r'[\\/:*?"<>|：]', ' ', name)


def _new_headless_chrome(driver_path):
    """Start a headless Chrome session driven by the chromedriver at ``driver_path``."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)


def _save_chapter(driver, chapter_url, target_dir):
    """Load one chapter page, write its text to ``target_dir`` and return the chapter title.

    Returns ``None`` (and writes nothing) when the page has no recognisable
    title or content element.
    """
    driver.get(chapter_url)
    chapter_soup = BeautifulSoup(driver.page_source, 'html.parser')
    title_html = chapter_soup.select("body > div.readerListBody > div.readerTitle")
    title_name = re.findall(r"<h1>(.+?)</h1>", str(title_html))
    content_nodes = chapter_soup.select("#content")
    if not title_name or not content_nodes:
        return None

    title = str(title_name[0])
    print("\n章节名称：" + title)

    # Drop the injected <p data-id="99" ...> paragraphs, then turn every
    # remaining tag into a line break so only the chapter text is left.
    content = re.sub(r'<p data-id="99" .+?>(.+?)</p>', '', str(content_nodes[0]), flags=re.S)
    content = re.sub(r'<[^>]+>', '\r\n', content)
    content = content.replace('最新网址：www.umiwx.com', '')
    content = content.replace(' ', '')

    # Only the title is sanitised, so the directory part of the path always
    # matches the directory that was actually created.
    chapter_path = os.path.join(target_dir, _safe_filename(title) + ".txt")
    # newline="" keeps the explicit "\r\n" as-is instead of letting text mode
    # expand it to "\r\r\n" on Windows.
    with open(chapter_path, "w", encoding="utf-8", newline="") as chapter:
        chapter.write(content)
    print("\n章节：" + title + "保存成功")
    return title


def start(driver_path=r"D:\pythonVendor\chrome\chromedriver.exe",
          request_timeout=30, chapter_delay=30):
    """Interactively search for a novel and save every chapter as a TXT file.

    Prompts for a novel name, posts it to the search endpoint, and for each
    search result: creates a directory named after the novel next to this
    script, resolves the chapter list page, and downloads every chapter with a
    headless Chrome browser into ``<novel>/<chapter title>.txt``.

    Args:
        driver_path: Path to the chromedriver executable.
        request_timeout: Timeout in seconds for each ``requests`` call.
        chapter_delay: Seconds to wait after each chapter request.

    Raises:
        requests.RequestException: If a search/detail/list request fails or
            times out.
    """
    fiction = input("请输入你想获取的小说：")

    # Search endpoint (POST). The value below is a placeholder left by the author.
    search_url = "私聊获取请求地址"
    search_data = {'searchtype': 'articlename', 'searchkey': fiction}
    response = requests.post(search_url, data=search_data, timeout=request_timeout)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")
    fiction_a_link = soup.select("#content > div > ul > a")

    # Loop-invariant patterns and paths are prepared once.
    link_pattern = re.compile('http.+html')
    name_pattern = re.compile(r'alt="(\w+)"')
    href_pattern = re.compile(r'href=\"(.+?)\"')
    basedir = os.path.abspath(os.path.dirname(__file__))

    for item in fiction_a_link:
        item_html = str(item)
        fiction_link = link_pattern.findall(item_html)
        fiction_name = name_pattern.findall(item_html)
        if not fiction_link or not fiction_name:
            print("\n无法解析搜索结果，已跳过")
            continue

        print("\n小说名称：" + fiction_name[0])
        # Check and create the very same absolute path, so the result does not
        # depend on the current working directory.
        fiction_dir = os.path.join(basedir, fiction_name[0])
        if not os.path.exists(fiction_dir):
            print("\n正在创建目录" + fiction_dir)
            os.makedirs(fiction_dir, exist_ok=True)
        else:
            print("\n目录已存在")

        # Novel detail page -> link to the chapter list page.
        print("\n详情地址" + fiction_link[0])
        details_response = requests.get(fiction_link[0], timeout=request_timeout)
        details_response.encoding = 'utf-8'
        details_soup = BeautifulSoup(details_response.text, "html.parser")
        fiction_list = details_soup.select(
            "#content > div.articleInfo > div.articleInfoRight > ol > p.right > a")
        print("\n列表页地址")
        print(fiction_list)
        if not fiction_list:
            print("\n未找到章节列表链接，已跳过")
            continue
        list_link = href_pattern.findall(str(fiction_list[0]))
        print("\n列表请求地址")
        print(list_link)
        if not list_link:
            print("\n未找到章节列表链接，已跳过")
            continue
        list_url = str(list_link[0])

        # Chapter list page -> chapter ids.
        lists_response = requests.get(list_url, timeout=request_timeout)
        lists_response.encoding = lists_response.apparent_encoding
        lists_soup = BeautifulSoup(lists_response.text, 'html.parser')
        lists_html = lists_soup.select("#newlist")
        chapter_link = href_pattern.findall(str(lists_html))
        # Strip the ".html" suffix and sort numerically: a plain string sort
        # would put "10" before "9".
        chapter_ids = sorted(
            (link.strip() for link in str_replaces(chapter_link)),
            key=_chapter_sort_key,
        )

        if chapter_ids:
            # One browser per novel instead of one per chapter; quit() in
            # finally also terminates the chromedriver process on errors.
            driver = _new_headless_chrome(driver_path)
            try:
                for chapter_id in chapter_ids:
                    chapter_details_link = list_url + chapter_id + ".html"
                    if _save_chapter(driver, chapter_details_link, fiction_dir) is None:
                        print("\n章节解析失败，已跳过：" + chapter_details_link)
                    # Throttle requests to the site.
                    time.sleep(chapter_delay)
            finally:
                driver.quit()
        print("小说" + fiction_name[0] + "已全部爬取")


# Run the interactive crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    start()

fuchto

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
python 爬虫小说使用无头浏览器 + 自动化爬虫

浏览器驱动需要查看对应浏览器版本进行下载（selenium · PyPI https://pypi.org/project/selenium/ ），在浏览器设置中查看当前版本。from selenium import webdriver; from selenium.webdriver.chrome.service import Service; from selenium.webdriver.common.by import By; # select 选择框需要引入 select 类 fro... 仅供学习，请勿商业行为！............
复制链接

扫一扫