1,准备工作
在主目录下新建目录 xiaoshuo,然后在其下创建文件 gudian.py
from time import sleep
import requests
from bs4 import BeautifulSoup
import re
from models.novels import Novel
from models.novel_detail import NovelDetail
from utils.db_util import DBUtil
def get_novel_info(domain, url):
    """Scrape one listing page and return its novels as Novel objects.

    Parameters:
        domain: site root (e.g. "https://www.guwenxue.cc"), prepended to the
            relative detail links found on the page.
        url: full URL of the listing page to scrape.

    Returns:
        list[Novel] — empty when the page contains no "sons" blocks.
    """
    # Retry in a loop until the expected markup appears (the site sometimes
    # serves an empty shell when throttled). Looping fixes the original
    # unbounded retry recursion, which would eventually hit RecursionError;
    # the explicit timeout prevents a hung request from stalling forever.
    while True:
        resp = requests.get(url, timeout=30)
        page = BeautifulSoup(resp.text, "html.parser")
        page_wrap3 = page.find("div", class_="wrap3")
        if page_wrap3:
            break
        print("未搜索到【get_novel_info-wrap3】,当前地址链接为:", url)
        sleep(1)
    divs = page_wrap3.find_all("div", class_="sons")
    results = []
    if not divs:
        return results
    for div in divs:
        novel = Novel()
        # Detail-page link: the href on the page is relative, so prefix the domain.
        novel.detail_url = domain + div.find("a").get("href")
        # Cover image URL.
        novel.img_path = div.find("img").get("src")
        ps = div.find_all("p")
        # Title lives in the first <p>'s anchor.
        novel.title = ps[0].find("a").text
        # The second <p> holds "作者:<name> <score>(<count>人评分)".
        match = re.search(r"作者:([^ ]+)\s+([\d.]+)\((\d+)人评分\)", ps[1].text)
        if match:
            novel.author = match.group(1)
            novel.score = match.group(2)
            novel.rating_count = match.group(3)
        # The third <p> is the short introduction; strip markup to plain text.
        novel.novel_introduct = BeautifulSoup(str(ps[2]), "html.parser").get_text()
        results.append(novel)
    return results
def get_novel_details(novels, domain=None):
    """Scrape one novel's detail page into per-paragraph NovelDetail rows.

    Parameters:
        novels: a single Novel instance (plural name kept for backward
            compatibility) whose detail_url and title are set.
        domain: optional site root prepended to relative chapter links.
            When omitted it is derived from novels.detail_url — this removes
            the original implicit dependency on a module-level global.

    Returns:
        list[NovelDetail] — one entry per non-empty paragraph of every
        chapter; empty when the introduction markup cannot be matched.
    """
    from urllib.parse import urlparse

    results = []
    if domain is None:
        parsed = urlparse(novels.detail_url)
        domain = f"{parsed.scheme}://{parsed.netloc}"
    # Retry in a loop until the expected markup appears, instead of recursing
    # and growing the call stack on every failed attempt.
    while True:
        detail_resp = requests.get(novels.detail_url, timeout=30)
        detail_page = BeautifulSoup(detail_resp.text, "html.parser")
        wrap3 = detail_page.find("div", class_="wrap3")
        if wrap3:
            break
        print("未搜索到【wrap3】,当前小说为:", novels.title, "地址链接为:", novels.detail_url)
        sleep(1)
    # The full introduction is the text between a specifically styled <p> and
    # the closing </div> of the "son2" block; extract it with a verbose regex
    # over the raw HTML (VERBOSE mode ignores the whitespace and # comments).
    detail_div = wrap3.find("div", class_="son2")
    pattern = r'''
        (?P<before_p_tag>.*?)                                            # content before the styled <p>
        <p\s+style="margin:0px;\s*font-size:12px;\s*line-height:160%;">  # the styled <p> opening tag
        (?P<p_content>.*?)                                               # content inside the <p>
        </p>                                                             # closing </p>
        (?P<after_p_tag>.*?)                                             # content after </p>: the introduction
        </div>                                                           # closing </div>
    '''
    match = re.search(pattern, str(detail_div), re.DOTALL | re.VERBOSE)
    if not match:
        return results
    novel_introduction = match.group('after_p_tag').strip()
    # Walk every chapter listed in the table of contents.
    spans = wrap3.find("div", class_="bookcont").find_all("span")
    for index, span in enumerate(spans):
        chapter = span.find("a")
        chapter_domain = domain + chapter.get("href")  # absolute chapter URL
        chapter_title = chapter.text
        chapter_index = index + 1                      # 1-based chapter number
        content_ps = get_content_ps(chapter_domain)
        if not content_ps:
            print("章节:", chapter_title, "没有内容!")
            continue
        for idx, content_p in enumerate(content_ps):
            # Skip the heading paragraph (idx 0) and blank paragraphs; only
            # build a NovelDetail for paragraphs that are actually kept
            # (the original allocated one per paragraph and discarded it).
            if idx == 0 or not content_p.text.strip():
                continue
            nodel_detail = NovelDetail()
            nodel_detail.title = novels.title
            nodel_detail.novel_introduction = novel_introduction
            nodel_detail.chapter_domain = chapter_domain
            nodel_detail.chapter_title = chapter_title
            nodel_detail.chapter_index = chapter_index
            nodel_detail.content_text = content_p.text
            nodel_detail.content_index = idx           # paragraph number within the chapter
            results.append(nodel_detail)
    return results
def get_content_ps(chapter_domain):
    """Fetch one chapter page and return its paragraph elements.

    Parameters:
        chapter_domain: absolute URL of the chapter page.

    Returns:
        list of <p> tags from the chapter's "son2" div, or [] on any
        request/parse failure (best-effort: errors are logged, not raised).
    """
    try:
        # Retry in a loop when the expected markup is missing, rather than
        # recursing — persistent failures no longer risk exhausting the call
        # stack; the timeout keeps a stalled request from hanging forever.
        while True:
            content_resp = requests.get(chapter_domain, timeout=30)
            content_page = BeautifulSoup(content_resp.text, "html.parser")
            content_wrap3 = content_page.find("div", class_="wrap3")
            if content_wrap3:
                break
            print("未搜索到【wrap3】,当前地址链接为:", chapter_domain)
            sleep(1)
        content_div = content_wrap3.find("div", class_="son2")
        return content_div.find_all("p")
    except Exception as e:
        print("获取章节内容失败:", e, "地址:", chapter_domain)
        return []
def syncOneNovel(session, domain, url):
    """Scrape one listing page and persist its novels plus all their details.

    Failures are logged per novel so one bad page does not abort the sync.
    """
    try:
        infos = get_novel_info(domain, url)
        sleep(1)
        if not infos:
            return
        DBUtil.insert_novels(session, infos)
        for item in infos:
            try:
                DBUtil.insert_novel_details(session, get_novel_details(item))
            except Exception as e:
                print("获取小说详情失败:", e, "地址:", item.detail_url)
    except Exception as e:
        print("获取小说信息失败:", e, "地址:", url)
if __name__ == "__main__":
    # Open the MySQL session once and reuse it for every listing page.
    session = DBUtil.create_connection("localhost", "3307", "loren", "123456", "gudian")
    domain = "https://www.guwenxue.cc"
    # Listing pages run from 1 to 214 inclusive.
    for page_number in range(1, 215):
        syncOneNovel(session, domain, f"{domain}/xiaoshuo_{page_number}.html")
        print(f"第 {page_number} 页同步完成!")
    print("全部结束!")
2,创建 models、utils 目录
在 models 目录下有两个实体类 novels.py、novel_detail.py
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
# Declarative base shared by the models defined in novels.py.
Base = declarative_base()


class Novel(Base):
    """ORM model for the `novels` table: one scraped novel's listing metadata."""

    __tablename__ = "novels"

    id = Column(Integer, primary_key=True, index=True)
    detail_url = Column(String, nullable=True)       # link to the detail page
    img_path = Column(String, nullable=True)         # cover image URL
    title = Column(String, nullable=True)            # novel title
    author = Column(String, nullable=True)           # author name
    score = Column(String, nullable=True)            # rating score, stored as text
    rating_count = Column(Integer, nullable=True)    # number of raters
    novel_introduct = Column(String, nullable=True)  # short introduction from the listing page

    def __repr__(self):
        return (f"Novel(id={self.id}, detail_url={self.detail_url!r}, img_path={self.img_path!r}, title={self.title!r}, "
                f"author={self.author!r}, score={self.score!r}, rating_count={self.rating_count!r}, "
                f"novel_introduct={self.novel_introduct!r})")
from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
# Declarative base shared by the models defined in novel_detail.py.
Base = declarative_base()


class NovelDetail(Base):
    """ORM model for the `novel_details` table: one paragraph of one chapter."""

    __tablename__ = "novel_details"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, nullable=True)             # novel title
    novel_introduction = Column(Text, nullable=True)  # full introduction from the detail page
    chapter_domain = Column(String, nullable=True)    # chapter page URL
    chapter_title = Column(String, nullable=True)     # chapter title
    content_text = Column(Text, nullable=True)        # paragraph text
    chapter_index = Column(Integer, nullable=True)    # 1-based chapter number
    content_index = Column(Integer, nullable=True)    # paragraph number within the chapter

    def __repr__(self):
        return (f"NovelDetail(id={self.id}, title={self.title!r}, novel_introduction={self.novel_introduction!r}, "
                f"chapter_domain={self.chapter_domain!r}, chapter_title={self.chapter_title!r}, content_text={self.content_text!r}, "
                f"chapter_index={self.chapter_index!r}, content_index={self.content_index!r})")
3,utils 目录下创建文件 db_util.py
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
class DBUtil:
    """Thin helper around SQLAlchemy: session creation and bulk inserts."""

    @staticmethod
    def create_connection(host, port, user, password, db_name):
        """Build a pooled MySQL engine and return a fresh Session bound to it."""
        uri = f"mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}"
        engine = create_engine(
            uri,
            pool_size=10,      # steady-state connections
            max_overflow=20,   # extra connections under burst load
            pool_timeout=30,   # seconds to wait for a free connection
            pool_recycle=1800, # recycle before MySQL's idle disconnect
        )
        factory = sessionmaker(bind=engine)
        return factory()

    @staticmethod
    def insert_novels(session, novels):
        """Bulk-insert Novel rows and commit immediately."""
        session.add_all(novels)
        session.commit()

    @staticmethod
    def insert_novel_details(session, novel_details):
        """Bulk-insert NovelDetail rows and commit immediately."""
        session.add_all(novel_details)
        session.commit()
4, 建表 sql
-- gudian.novels definition
-- Master table: one row per novel scraped from the listing pages
-- (detail URL, cover image, title, author, rating, short introduction).
CREATE TABLE `novels` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `detail_url` varchar(100) NOT NULL COMMENT '详情链接',
  `img_path` varchar(100) DEFAULT NULL COMMENT '小说书面图片地址',
  `title` varchar(50) NOT NULL COMMENT '小说标题',
  `author` varchar(10) DEFAULT NULL COMMENT '作者',
  `score` varchar(5) DEFAULT NULL COMMENT '分数',
  `rating_count` int(11) DEFAULT NULL COMMENT '评分人数',
  `novel_introduct` varchar(200) DEFAULT NULL COMMENT '小说简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=469 DEFAULT CHARSET=utf8mb4 COMMENT='小说信息主表,主要存储爬取网站中关于小说的相关信息,如 标题、网址、作者、简介等信息';
-- Titles are unique so re-running the sync cannot insert duplicate novels.
ALTER TABLE gudian.novels ADD CONSTRAINT novels_unique UNIQUE KEY (title);
-- Detail table: one row per paragraph of every chapter of every novel.
CREATE TABLE gudian.novel_details (
  id BIGINT auto_increment NOT NULL COMMENT '主键id',
  title varchar(50) NOT NULL COMMENT '标题',
  novel_introduction TEXT NULL COMMENT '完整简介',
  chapter_domain varchar(100) NULL COMMENT '每章节链接',
  chapter_title varchar(80) NULL COMMENT '每章节标题',
  content_text TEXT NULL COMMENT '每章节每段内容',
  chapter_index INT NULL COMMENT '每章节序号',
  content_index INT NULL COMMENT '每章节每段序号',
  CONSTRAINT novel_details_pk PRIMARY KEY (id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_general_ci
COMMENT='小说详情表,名称、章节、每章节每段落内容';
-- Widen content_text to LONGTEXT (presumably some paragraphs exceeded
-- TEXT's capacity during the sync — confirm against the scraped data).
ALTER TABLE gudian.novel_details MODIFY COLUMN content_text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '每章节每段内容';
-- Full-table delete: empty both tables while keeping the schema.
delete from novels;
delete from novel_details;
-- Diagnostics: paragraph count per novel.
select title,count(*) as count from novel_details group by title;
-- Diagnostics: paragraph count per chapter, ordered by chapter number.
select title,chapter_title,chapter_index,count(*) as count from novel_details
group by title,chapter_title,chapter_index order by title,chapter_index;
-- Reassemble each chapter's text by concatenating its paragraphs.
select title,chapter_title,GROUP_CONCAT(content_text) from novel_details
group by title,chapter_title;