Python爬取小说

1,准备工作

        在主目录下新建目录 xiaoshuo ,然后在其下创建文件 gudian.py

from time import sleep

import requests
from bs4 import BeautifulSoup
import re
from models.novels import Novel
from models.novel_detail import NovelDetail
from utils.db_util import DBUtil

def get_novel_info(domain, url, max_retries=10):
    """Scrape one listing page and return a list of Novel objects.

    Parameters:
        domain: site root, prepended to relative detail-page hrefs.
        url: absolute URL of the listing page to fetch.
        max_retries: how many times to re-fetch when the expected
            "wrap3" container is missing from the response.

    Returns:
        list[Novel]: one entry per "sons" div; empty when the page has
        no entries or the container never appears within max_retries.
    """
    page_wrap3 = None
    # Bounded retry loop replaces the original unbounded recursion,
    # which could hit RecursionError on a persistently failing page.
    for _ in range(max_retries):
        resp = requests.get(url, timeout=30)
        page = BeautifulSoup(resp.text, "html.parser")
        page_wrap3 = page.find("div", class_="wrap3")
        if page_wrap3:
            break
        print("未搜索到【get_novel_info-wrap3】,当前地址链接为:", url)
        sleep(1)
    results = []
    if not page_wrap3:
        return results
    divs = page_wrap3.find_all("div", class_="sons")
    if not divs:
        return results
    for div in divs:
        novel = Novel()
        # Detail-page link: relative href joined onto the domain.
        novel.detail_url = domain + div.find("a").get("href")
        # Cover image URL.
        novel.img_path = div.find("img").get("src")
        ps = div.find_all("p")
        # Title sits in the first <p>'s anchor.
        novel.title = ps[0].find("a").text
        # Second <p> reads "作者:<name> <score>(<count>人评分)".
        match = re.search(r"作者:([^ ]+)\s+([\d.]+)\((\d+)人评分\)", ps[1].text)
        if match:
            novel.author = match.group(1)
            novel.score = match.group(2)
            novel.rating_count = match.group(3)
        # Third <p> is the short introduction; get_text() already strips
        # tags — no need to re-parse str(ps[2]) through BeautifulSoup.
        novel.novel_introduct = ps[2].get_text()
        results.append(novel)
    return results

def get_novel_details(novels, max_retries=10):
    """Fetch one novel's detail page and collect per-paragraph rows.

    Parameters:
        novels: a single Novel instance (plural name kept for caller
            compatibility) carrying .title and .detail_url.
        max_retries: bounded re-fetch attempts while the "wrap3"
            container is missing from the response.

    Returns:
        list[NovelDetail]: one entry per non-empty content paragraph of
        every chapter; empty when the page never loads or the intro
        pattern does not match.

    NOTE(review): builds chapter URLs from the module-level global
    `domain` (set in the __main__ block) — confirm before calling this
    from another entry point.
    """
    results = []
    wrap3 = None
    # Bounded retry loop replaces the original unbounded recursion,
    # which risked RecursionError on a persistently failing page.
    for _ in range(max_retries):
        detail_resp = requests.get(novels.detail_url, timeout=30)
        detail_page = BeautifulSoup(detail_resp.text, "html.parser")
        wrap3 = detail_page.find("div", class_="wrap3")
        if wrap3:
            break
        print("未搜索到【wrap3】,当前小说为:",novels.title,"地址链接为:",novels.detail_url)
        sleep(1)
    if not wrap3:
        return results
    # Full introduction: the text after the styled <p> up to </div>.
    detail_div = wrap3.find("div", class_="son2")
    pattern = r'''
            (?P<before_p_tag>.*?)              # content before the <p> tag
            <p\s+style="margin:0px;\s*font-size:12px;\s*line-height:160%;">   # the styled <p> tag
            (?P<p_content>.*?)                 # content inside the <p> tag
            </p>                              # closing </p>
            (?P<after_p_tag>.*?)               # content after </p> up to </div>
            </div>                            # closing </div>
        '''
    match = re.search(pattern, str(detail_div), re.DOTALL | re.VERBOSE)
    if not match:
        return results
    novel_introduction = match.group('after_p_tag').strip()
    # Each <span> inside the book-contents div holds one chapter link.
    spans = wrap3.find("div", class_="bookcont").find_all("span")
    for index, span in enumerate(spans):
        chapter = span.find("a")
        # Absolute chapter URL (relative href joined onto global domain).
        chapter_domain = domain + chapter.get("href")
        chapter_title = chapter.text
        chapter_index = index + 1  # chapters are numbered from 1
        content_ps = get_content_ps(chapter_domain)
        if not content_ps:
            print("章节:", chapter_title, "没有内容!")
            continue
        for idx, content_p in enumerate(content_ps):
            # Skip the first paragraph (chapter heading) and blank ones;
            # only build a NovelDetail for paragraphs that are kept
            # (the original allocated one per paragraph then discarded it).
            if idx == 0 or not content_p.text.strip():
                continue
            nodel_detail = NovelDetail()
            nodel_detail.title = novels.title
            nodel_detail.novel_introduction = novel_introduction
            nodel_detail.chapter_domain = chapter_domain
            nodel_detail.chapter_title = chapter_title
            nodel_detail.chapter_index = chapter_index
            nodel_detail.content_text = content_p.text
            nodel_detail.content_index = idx
            results.append(nodel_detail)
    return results

def get_content_ps(chapter_domain, max_retries=10):
    """Fetch a chapter page and return its content <p> tags.

    Parameters:
        chapter_domain: absolute URL of the chapter page.
        max_retries: bounded re-fetch attempts while the "wrap3"
            container is absent from the response.

    Returns:
        The list of <p> tags inside div.son2, or [] on any error or
        when the container never appears within max_retries attempts.
    """
    try:
        # Bounded retry loop replaces the original unbounded recursion,
        # whose eventual RecursionError the broad except masked as [].
        for _ in range(max_retries):
            content_resp = requests.get(chapter_domain, timeout=30)
            content_page = BeautifulSoup(content_resp.text, "html.parser")
            content_wrap3 = content_page.find("div", class_="wrap3")
            if content_wrap3:
                content_div = content_wrap3.find("div", class_="son2")
                return content_div.find_all("p")
            print("未搜索到【wrap3】,当前地址链接为:", chapter_domain)
            sleep(1)
        return []
    except Exception as e:
        # Best-effort: log and let the caller treat the chapter as empty.
        print("获取章节内容失败:", e,"地址:",chapter_domain)
        return []


def syncOneNovel(session, domain, url):
    """Sync one listing page: persist its novels, then every novel's
    chapter paragraphs. All failures are logged and skipped, never
    raised to the caller."""
    try:
        infos = get_novel_info(domain, url)
        sleep(1)
        if not infos:
            return
        DBUtil.insert_novels(session, infos)
        for item in infos:
            try:
                # One novel at a time so a single failure does not
                # abort the rest of the page.
                DBUtil.insert_novel_details(session, get_novel_details(item))
            except Exception as e:
                print("获取小说详情失败:", e,"地址:",item.detail_url)
    except Exception as e:
        print("获取小说信息失败:", e,"地址:",url)

if __name__ == "__main__":
    # Open a pooled session to the target MySQL database.
    session = DBUtil.create_connection("localhost", "3307","loren", "123456", "gudian")

    domain = "https://www.guwenxue.cc"
    # Listing pages run from 1 through 214 inclusive.
    page_number = 1
    while page_number <= 214:
        syncOneNovel(session, domain, f"{domain}/xiaoshuo_{page_number}.html")
        print(f"第 {page_number} 页同步完成!")
        page_number += 1

    print("全部结束!")

2,创建 models、utils 目录

        在 models 目录下有两个实体类 novels.py、novel_detail.py

from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Novel(Base):
    """ORM model for one scraped novel (table ``novels``)."""

    __tablename__ = "novels"

    id = Column(Integer, primary_key=True, index=True)
    detail_url = Column(String, nullable=True)       # link to the detail page
    img_path = Column(String, nullable=True)         # cover image URL
    title = Column(String, nullable=True)
    author = Column(String, nullable=True)
    score = Column(String, nullable=True)            # rating, stored as text
    rating_count = Column(Integer, nullable=True)    # number of raters
    novel_introduct = Column(String, nullable=True)  # short introduction

    def __repr__(self):
        # id is rendered plain; every other field uses !r, in
        # declaration order — same output as a hand-written f-string.
        parts = ", ".join(
            f"{field}={getattr(self, field)!r}"
            for field in ("detail_url", "img_path", "title", "author",
                          "score", "rating_count", "novel_introduct")
        )
        return f"Novel(id={self.id}, {parts})"
from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class NovelDetail(Base):
    """ORM model for one chapter paragraph (table ``novel_details``)."""

    __tablename__ = "novel_details"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, nullable=True)            # parent novel title
    novel_introduction = Column(Text, nullable=True) # full introduction
    chapter_domain = Column(String, nullable=True)   # chapter page URL
    chapter_title = Column(String, nullable=True)
    content_text = Column(Text, nullable=True)       # one paragraph of text
    chapter_index = Column(Integer, nullable=True)   # 1-based chapter number
    content_index = Column(Integer, nullable=True)   # paragraph number in chapter

    def __repr__(self):
        # id is rendered plain; every other field uses !r, in
        # declaration order — same output as a hand-written f-string.
        parts = ", ".join(
            f"{field}={getattr(self, field)!r}"
            for field in ("title", "novel_introduction", "chapter_domain",
                          "chapter_title", "content_text", "chapter_index",
                          "content_index")
        )
        return f"NovelDetail(id={self.id}, {parts})"

3,utils 目录下创建文件 db_util.py

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class DBUtil:
    """Static helpers for MySQL access through SQLAlchemy."""

    @staticmethod
    def create_connection(host, port, user, password, db_name):
        """Build a pooled engine for the given database and return a
        fresh ORM session bound to it."""
        uri = f"mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}"
        engine = create_engine(
            uri,
            pool_size=10,       # base connection pool
            max_overflow=20,    # extra connections allowed under load
            pool_timeout=30,    # seconds to wait for a free connection
            pool_recycle=1800,  # recycle before the server drops idle ones
        )
        return sessionmaker(bind=engine)()

    @staticmethod
    def insert_novels(session, novels):
        """Bulk-add Novel rows and commit immediately."""
        session.add_all(novels)
        session.commit()

    @staticmethod
    def insert_novel_details(session, novel_details):
        """Bulk-add NovelDetail rows and commit immediately."""
        session.add_all(novel_details)
        session.commit()

4, 建表 sql 

-- gudian.novels definition: master table of scraped novels.

CREATE TABLE `novels` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `detail_url` varchar(100) NOT NULL COMMENT '详情链接',
  `img_path` varchar(100) DEFAULT NULL COMMENT '小说书面图片地址',
  `title` varchar(50) NOT NULL COMMENT '小说标题',
  `author` varchar(10) DEFAULT NULL COMMENT '作者',
  `score` varchar(5) DEFAULT NULL COMMENT '分数',
  `rating_count` int(11) DEFAULT NULL COMMENT '评分人数',
  `novel_introduct` varchar(200) DEFAULT NULL COMMENT '小说简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=469 DEFAULT CHARSET=utf8mb4 COMMENT='小说信息主表,主要存储爬取网站中关于小说的相关信息,如 标题、网址、作者、简介等信息';

-- Titles are unique so repeated runs cannot insert duplicate novels.
ALTER TABLE gudian.novels ADD CONSTRAINT novels_unique UNIQUE KEY (title);


-- Detail table: one row per paragraph of every chapter.
CREATE TABLE gudian.novel_details (
    id BIGINT auto_increment NOT NULL COMMENT '主键id',
    title varchar(50) NOT NULL COMMENT '标题',
    novel_introduction TEXT NULL COMMENT '完整简介',
    chapter_domain varchar(100) NULL COMMENT '每章节链接',
    chapter_title varchar(80) NULL COMMENT '每章节标题',
    content_text TEXT NULL COMMENT '每章节每段内容',
    chapter_index INT NULL COMMENT '每章节序号',
    content_index INT NULL COMMENT '每章节每段序号',
    CONSTRAINT novel_details_pk PRIMARY KEY (id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_general_ci
COMMENT='小说详情表,名称、章节、每章节每段落内容';
-- Widen content_text to LONGTEXT so very long paragraphs still fit.
ALTER TABLE gudian.novel_details MODIFY COLUMN content_text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '每章节每段内容';

-- Full-table cleanup between runs.
delete from novels;
delete from novel_details;


-- Paragraph-row count per novel.
select title,count(*) as count from novel_details group by title;
-- Paragraph count per chapter, ordered by novel and chapter number.
select title,chapter_title,chapter_index,count(*) as count from novel_details
group by title,chapter_title,chapter_index order by title,chapter_index;

-- Reassemble each chapter's full text from its paragraph rows.
select title,chapter_title,GROUP_CONCAT(content_text) from  novel_details
group by title,chapter_title;

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

爱编程的Loren

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值