页面源代码:
<!DOCTYPE html>
<html lang="zh">
<head>
<script type="text/javascript" src="https://img.shicimingju.com/newpage/js/scmj.js"></script>
<script src="https://img.shicimingju.com/newpage/js/all.js"></script>
<meta charset="UTF-8">
<title>《四大名著》在线阅读_诗词名句网</title>
<meta name="viewport" content="width=device-width,user-scalable=no" />
<meta name="description" content="" />
<link rel="stylesheet" href="https://img.shicimingju.com/public/css/www_v3.css">
<link rel="stylesheet" href="https://img.shicimingju.com/public/css/font_576092_m2icqcebp7evzpvi.css">
<script src="https://img.shicimingju.com/public/js/device.js"></script>
<script src="https://img.shicimingju.com/public/js/jq.js"></script>
<script src="https://img.shicimingju.com/public/js/www_v3.js"></script>
</head>
<body>
<div id="top_bar">
<div id="top_bar_main">
<div id="top_left">
<a href="/"><img src="https://img.shicimingju.com/public/image/logo.png" id="logo"/></a>
<div id="top_left_menu">
<ul>
<li><a href="/">首页</a></li>
<li><a href="/shicimark">分类</a></li>
<li><a href="/category/all">作者</a></li>
<li><a href="/paiming">排行榜</a></li>
<li><a href="/cate?cate_id=4">课本古诗</a></li>
<li><a href="/cipai/index.html">词牌名</a></li>
<li><a href="/hecheng/index.html">合称</a></li>
<li><a href="/book/">古籍</a></li>
</ul>
</div>
<div style="clear: both"></div>
</div>
<div id="top_right">
<div id="input_div" style="margin-top: 20px;">
<input type="text" name="kw" id="search_input" placeholder="" autocomplete="off" value="">
<a href="javascript:doSearch();"><img src="https://img.shicimingju.com/public/image/web_search.png" class="www-icon"></a>
<div style="clear: both"></div>
</div>
<div id="tip_result" class="card">
<ul>
</ul>
<div id="tip_content">
</div>
</div>
</div>
<div style="clear: both;"></div>
</div>
</div>
<div id="nav-top"></div>
<div id="main">
<div id="main_left">
<div class="card bookmark-list">
<h1>《四大名著》在线阅读</h1>
<div class="des">
中国的四大名著是《三国演义》、《水浒传》、《西游记》、《红楼梦》。这四部著作历久不衰,是汉语文学史中不可多得的经典作品。其中的故事、场景、人物已经深深地影响了中国人的思想观念、价值取向。四部著作都有很高的文学水平和艺术成就。细致的刻画和所蕴含的深刻思想都为历代读者所称道。是中国文学史上的四座伟大丰碑。 </div>
<div>
<div class="book-item">
<a href="/book/sanguoyanyi.html"><img src="https://img.shicimingju.com/public/image/book/sanguoyanyi.jpg"></a>
<br/>
<h3><a href="/book/sanguoyanyi.html">《三国演义》</a></h3>
</div>
<div class="book-item">
<a href="/book/shuihuzhuan.html"><img src="https://img.shicimingju.com/public/image/book/shuihuzhuan.jpg"></a>
<br/>
<h3><a href="/book/shuihuzhuan.html">《水浒传》</a></h3>
</div>
<div class="book-item">
<a href="/book/xiyouji.html"><img src="https://img.shicimingju.com/public/image/book/xiyouji.jpg"></a>
<br/>
<h3><a href="/book/xiyouji.html">《西游记》</a></h3>
</div>
<div class="book-item">
<a href="/book/hongloumeng.html"><img src="https://img.shicimingju.com/public/image/book/hongloumeng.jpg"></a>
<br/>
<h3><a href="/book/hongloumeng.html">《红楼梦》</a></h3>
</div>
<div style="clear: both"></div>
</div>
</div>
</div>
<div id="main_right">
<div class="card zz_other_shici">
<div class="aside_title">推荐阅读</div>
<ul>
<li><a href="/bookmark/sidamingzhu.html">四大名著</a></li>
<li><a href="/bookmark/ershisishi.html">二十四史</a></li>
<li><a href="/bookmark/sishu.html">四书</a></li>
<li><a href="/bookmark/wujing.html">五经</a></li>
<li><a href="/bookmark/yanyixiaoshuo.html">演义小说</a></li>
<div style="clear: both"></div>
</ul>
</div>
</div>
<div style="clear: both;"></div>
</div><a href="javascript:toTop();"><div id="toTop">顶部</div></a>
<div id="footer">
<div id="footer_main">
<div id="bottom_nav">
<a href="/category/story">诗人故事</a>
<a href="/category/xiehouyu">歇后语</a>
<a href="/category/miyu">谜语</a>
<a href="/category/duilian">对联</a>
Copyright © 2010-2020 <a href="https://beian.miit.gov.cn/#/Integrated/index" target="_blank">琼ICP备2021009280号-1</a>
</div>
</div>
</div>
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?649f268280b553df1f778477ee743752";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</body>
</html>
python代码,爬取四大名著:
import random
import time
import requests
import os
from bs4 import BeautifulSoup
# 抓取四大名著
def get_html(main_url, timeout=10):
    '''
    Download a page and parse it into a BeautifulSoup tree.

    :param main_url: URL of the page to fetch
    :param timeout: seconds to wait for the server before giving up;
        without it ``requests.get`` can block indefinitely
    :return: BeautifulSoup object built from the response text with the
        ``lxml`` parser
    :raises requests.RequestException: on connection failure, timeout, or
        an HTTP error status (4xx/5xx)
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    response = requests.get(main_url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were
    # real content and returning empty results further down the pipeline.
    response.raise_for_status()
    # Let requests guess the encoding from the body so the Chinese text
    # is decoded correctly.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'lxml')
def get_book(soup):
    '''
    Collect the title and absolute URL of every book on the index page.

    :param soup: parsed index page; every ``<div class="book-item">`` is
        treated as one book
    :return: dict mapping book title (the item's text with newlines
        removed) to the book's absolute URL
    '''
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    book_dic = {}  # {book title: book URL}
    for item in soup.find_all('div', class_="book-item"):
        link = item.a
        href = link.get('href') if link is not None else None
        if not href:
            # An item without a usable link cannot be followed; skip it
            # rather than failing on ``None['href']``.
            continue
        # The item's text is the title surrounded by newlines.
        book_name = item.get_text().replace('\n', '')
        # urljoin copes with root-relative, relative and absolute hrefs,
        # where plain string concatenation only works for the first kind.
        book_dic[book_name] = urljoin('https://www.shicimingju.com', href)
    return book_dic
def get_book_mulu(books_html):
    '''
    Collect chapter titles and absolute chapter URLs from a book's
    table-of-contents page.

    :param books_html: parsed book page; chapter links are read from every
        ``<div class="book-mulu">``
    :return: dict mapping chapter title (link text) to absolute chapter URL.
        Because the result is keyed by title, a repeated title keeps only
        its last URL.
    '''
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    mulu_dict = {}  # {chapter title: chapter URL}
    for mulu in books_html.find_all('div', class_="book-mulu"):
        for link in mulu.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to download.
                continue
            # urljoin handles relative and absolute hrefs, unlike plain
            # string concatenation with the site root.
            mulu_dict[link.get_text()] = urljoin('https://www.shicimingju.com', href)
    return mulu_dict
def book_mulu_content(chapter, books_html):
    '''
    Extract the text of one chapter page.

    :param chapter: chapter title, used as the key of the returned dict
    :param books_html: parsed chapter page (bs4 object); the text is taken
        from its first ``<div class="chapter_content">``
    :return: single-entry dict ``{chapter: chapter text}``
    '''
    content_div = books_html.find('div', class_='chapter_content')
    return {chapter: content_div.text}
def save_books(book_name, book_contents):
    '''
    Write chapter texts to ``<book_name>/<chapter title>.text``.

    :param book_name: directory the chapters are stored in, named after
        the book (e.g. 三国演义); created if it does not exist
    :param book_contents: dict mapping chapter title to chapter text
    :return: None
    '''
    # One call replaces the exists()/mkdir() pair: it tolerates an
    # already-existing directory and also creates missing parents.
    os.makedirs(book_name, exist_ok=True)
    for title, text in book_contents.items():
        # A path separator inside a title would be interpreted as a
        # sub-directory that does not exist, so neutralise it.
        safe_title = title.replace('/', '_').replace('\\', '_')
        path = os.path.join(book_name, safe_title + '.text')
        # NOTE(review): append mode is kept from the original, so running
        # the scraper twice duplicates the chapter text in each file --
        # confirm whether 'w' was intended.
        with open(path, 'a', encoding='UTF-8') as f:
            f.write(text)
        print(f'{book_name} === {title} 下载完成!!!!!')
def main(main_url):
    '''
    Entry point: download every chapter of every book listed on the
    index page.

    :param main_url: URL of the index page that lists the books
    :return: None
    '''
    # The index page yields {book title: book URL}.
    index_page = get_html(main_url)
    for book_name, book_url in get_book(index_page).items():
        # Each book page yields {chapter title: chapter URL}.
        chapters = get_book_mulu(get_html(book_url))
        for title, url in chapters.items():
            chapter_page = get_html(url)
            save_books(book_name, book_mulu_content(title, chapter_page))
            # Pause a random 1-3 seconds between chapter downloads.
            time.sleep(random.randint(1, 3))
if __name__ == '__main__':
    # Index page that links to the four books to download.
    main_url = 'https://www.shicimingju.com/bookmark/sidamingzhu.html'
    main(main_url=main_url)
运行结果:
《三国演义》 === 第一回·宴桃园豪杰三结义 斩黄巾英雄首立功 下载完成!!!!!
《三国演义》 === 第二回·张翼德怒鞭督邮 何国舅谋诛宦竖 下载完成!!!!!
...后面省略!!!