import requests
import re
# Site root; chapter links found on the index page are site-relative paths.
BASE_URL = 'https://www.xxbiquge.com'
url = 'https://www.xxbiquge.com/2_2634/'  # index page of the novel
path = r'E:\工作\Python\爬虫\title.txt'  # where the downloaded text is appended
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run

# Patterns are compiled once here instead of on every chapter iteration.
_CHAPTER_LINK_RE = re.compile(r'<a href="(/2_2634/.*?\.html)">.*?</a>')
_TITLE_RE = re.compile(r'<meta name="keywords" content="(.*?)" />')
_CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.S)
_TAG_RE = re.compile(r'<[^>]*>')
# NOTE(review): as written this replaces a space with a space (a no-op);
# presumably the pattern was originally an HTML entity such as a
# non-breaking space -- confirm against the site's markup.
_SPACE_RE = re.compile(r' ', re.I)

# Fragments removed verbatim from every chapter body, in this order.
# NOTE(review): the '& lt;...' entry contains spaces, and spaces are removed
# first, so it can never match as written -- kept unchanged pending review.
_JUNK_FRAGMENTS = (
    ' ',
    'readx();',
    '& lt;!--go - - & gt;',
    '<!--go-->',
    '()',
)


def strip_tags(text):
    """Return *text* with every ``<...>`` span removed.

    A ``<`` that has no closing ``>`` is left in place. The previous
    find/replace loop never terminated on such input, because the slice it
    tried to remove was empty.
    """
    return _TAG_RE.sub('', text)


def clean_chapter(raw):
    """Turn the inner HTML of the content ``<div>`` into plain text.

    Removes the known junk fragments, converts ``<br/>`` to newlines, and
    strips any remaining tags.
    """
    for fragment in _JUNK_FRAGMENTS:
        raw = raw.replace(fragment, '')
    text = strip_tags(raw.replace('<br/>', '\n'))
    return _SPACE_RE.sub(' ', text)


def parse_chapter(page_html):
    """Extract ``(title, body)`` from a chapter page.

    The title is the content of the ``keywords`` meta tag; the body is the
    cleaned text of the first ``<div id="content">``. Returns ``None`` when
    either piece of markup is missing, so the caller can skip the page
    instead of failing with an ``IndexError``.
    """
    title_match = _TITLE_RE.search(page_html)
    content_match = _CONTENT_RE.search(page_html)
    if title_match is None or content_match is None:
        return None
    return title_match.group(1), clean_chapter(content_match.group(1))


def fetch_text(session, page_url):
    """GET *page_url* through *session* and return its body decoded as UTF-8.

    Raises ``requests.RequestException`` on connection problems, timeouts,
    or an HTTP error status.
    """
    response = session.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Change this to match the charset declared in the page source.
    response.encoding = 'utf-8'
    return response.text


def main():
    """Download every chapter linked from the index page and append it to *path*.

    For each chapter the title is printed and written, followed by the
    chapter text. A chapter that cannot be fetched or parsed is reported on
    stdout and skipped, so one bad page does not abort the whole run.
    """
    # A single session is shared by all requests so connections are reused.
    with requests.Session() as session:
        index_html = fetch_text(session, url)
        chapter_links = _CHAPTER_LINK_RE.findall(index_html)

        # The context manager closes the file even if a later step raises.
        with open(path, 'a', encoding='utf-8') as out:
            for href in chapter_links:
                # Build the absolute chapter URL from the relative link.
                chapter_url = BASE_URL + href
                try:
                    page_html = fetch_text(session, chapter_url)
                except requests.RequestException as err:
                    print('skipped {}: {}'.format(chapter_url, err))
                    continue

                parsed = parse_chapter(page_html)
                if parsed is None:
                    print('skipped {}: unexpected page layout'.format(chapter_url))
                    continue

                name, fiction = parsed
                print(name)
                out.write(name)
                out.write('\n')
                out.write(fiction)
                out.write('\n')


if __name__ == '__main__':
    main()