1, 获得每个章节的内容
from multiprocessing.dummy import Pool
import requests
from bs4 import BeautifulSoup
import re
import os
def combine_name(snum, title):
    """
    Build a sortable file name and a display title for one chapter.

    :param snum: chapter URL; the characters between index 31 and the last
        five characters are parsed as an integer page id (31 is the length of
        the index URL used in ``main``; the trailing five characters are
        presumably the ``.html`` suffix)
    :param title: heading text of the chapter page; only its last
        whitespace-separated word is kept
    :return: ``(title, number)`` where ``number`` is the page id minus the
        base id 396871, zero-padded to at least five digits
    :raises ValueError: if the sliced part of ``snum`` is not an integer
    :raises IndexError: if ``title`` contains no non-whitespace text
    """
    st = 396871  # page id that corresponds to chapter number 0
    num = int(snum[31:-5]) - st
    title = title.split()[-1]
    # zfill pads with leading zeros up to width 5 and leaves longer numbers as-is.
    return title, str(num).zfill(5)
def get_html(url, timeout=10):
    """
    Fetch a page over HTTP.

    :param url: address of the page
    :param timeout: seconds to wait for the server before giving up; without
        a timeout a stalled connection would block the caller indefinitely
    :return: the response object returned by ``requests.get`` (callers read
        its ``.text`` attribute)
    :raises requests.exceptions.RequestException: on connection errors or
        when the timeout expires
    """
    return requests.get(url, timeout=timeout)
def get_chapter_url(source_url, html):
    """
    Collect the chapter links found on the index page.

    :param source_url: address of the index page; used as the prefix of every
        returned link
    :param html: response object of the index page (its ``.text`` is parsed)
    :return: list of chapter URLs
    """
    soup = BeautifulSoup(html.text, 'lxml')
    url_list = []
    # The first 100 and the last 10 anchors are skipped; presumably they are
    # navigation / non-chapter links on this particular site.
    for chapter in soup.find_all('a')[100:-10]:
        href = chapter.get('href')
        if href is None:
            # An anchor without href would otherwise raise KeyError and abort
            # the whole run.
            continue
        # Drop the first 7 characters of the href before appending it to
        # source_url (presumably the '/0_646/' path that source_url already
        # ends with).
        url_list.append(source_url + href[7:])
    return url_list
def get_article(url):
    """
    Download one chapter, extract its text and write it to a file.

    The file is written to the '****' directory and named after the number
    produced by ``combine_name``. Any failure along the way is reported by
    printing the URL; nothing is raised.

    :param url: address of the chapter page
    :return: None
    """
    try:
        page = get_html(url)
        heading = BeautifulSoup(page.text, 'lxml').find_all('h1')[0].text
        title, chapter_name = combine_name(url, heading)
        # Group 1 is the text between the opening and closing div tags.
        content = re.search('<div id="content">(.*?)</div>', page.text, re.S).group(1)
        body = content.replace("<br><br>", '\n ').replace(' ', '')
        # Prefix one space, then drop the last two characters of the result.
        text = title + '\n' + (' ' + body)[:-2]
        target = os.path.join('****', chapter_name + '.txt')
        with open(target, 'w', encoding='utf8') as f:
            f.write(text)
    except Exception:
        print(url)
def multiprocess_get_article(url_list):
    """
    Download all chapters concurrently with a pool of 10 workers.

    ``Pool`` here comes from ``multiprocessing.dummy``, so the workers are
    threads.

    :param url_list: chapter URLs, each passed to ``get_article``
    :return: None
    """
    # map() blocks until every URL has been processed; the with-block then
    # releases the worker threads instead of leaving the pool open.
    with Pool(10) as pool:
        pool.map(get_article, url_list)
def main():
    """
    Entry point of the download script.

    Creates the output directory, reads the index page and downloads every
    chapter it links to.
    """
    output_dir = '****'
    index_url = 'https://www.ibooktxt.com/0_646/'
    os.makedirs(output_dir, exist_ok=True)
    chapter_urls = get_chapter_url(index_url, get_html(index_url))
    multiprocess_get_article(chapter_urls)


if __name__ == '__main__':
    main()
2,将这些章节整合到一起
import os
def combine_name(num):
    """
    Format a chapter number as the zero-padded file stem.

    :param num: chapter number (integer)
    :return: the number as a string, left-padded with zeros to at least five
        digits
    """
    return str(num).zfill(5)
def main():
    """
    Merge the per-chapter text files into one file.

    Chapter files numbered 74 to 1511 are read from the '****' directory in
    order. Files that cannot be read are skipped, and the chapters that are
    found are renumbered consecutively starting at 74. Each chapter is
    prefixed with its heading and followed by a form feed.
    """
    pieces = []
    chapter_num = 74
    for i in range(74, 1512):
        path = combine_name(i)
        try:
            with open(os.path.join('****', path + '.txt'), 'r', encoding='utf8') as f:
                text = f.read()
        except (OSError, UnicodeError):
            # Best effort: a missing or unreadable chapter is skipped and does
            # not consume a chapter number.
            continue
        pieces.append('第' + str(chapter_num) + '章 ' + text + '\f')
        chapter_num += 1
    # NOTE: the output is opened in append mode, so running the script again
    # adds a second copy of the text to an existing file.
    with open(os.path.join('****', '整篇' + '.txt'), 'a', encoding='utf8') as f:
        f.write(''.join(pieces))


if __name__ == '__main__':
    main()