# 全书网小说爬取
import re
import requests
class QuanShuSpider(object):
    """Scraper that prints novel chapters from b.faloo.com.

    NOTE(review): the file header says "全书网" (quanshu.com) but every URL
    targets faloo.com — the script was presumably repurposed; confirm which
    site is intended. All regexes are tied to faloo's current HTML layout.
    """

    def __init__(self):
        # Desktop browser User-Agent so the site serves its normal HTML.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                                      " Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"}

    def get_novel_list(self):
        """Return the novel URLs from page 1 of the fantasy category listing."""
        response = requests.get("https://b.faloo.com/y/1/0/0/0/0/0/1.html", headers=self.headers)
        response.encoding = "gbk"  # site serves GBK-encoded pages, not UTF-8
        href_pattern = '<h1 class="fontSize17andHei" title=".*?"><a href="(.*?)" target="_blank" title=".*?">.*?</a></h1>'
        hrefs = re.findall(href_pattern, response.text)
        # hrefs are protocol-relative ("//b.faloo.com/...") — prepend the scheme.
        return ['https:' + href for href in hrefs]

    def get_chapter_list(self, novel_url):
        """Return [(chapter_name, chapter_url), ...] scraped from one novel page."""
        response = requests.get(novel_url, headers=self.headers)
        pattern = '<a href="(.*?)" target="_self" title="(.*?)"'
        chapter_list = re.findall(pattern, response.text)
        # Chapter hrefs are also protocol-relative; normalize like the listing.
        return [(name, 'https:' + href) for href, name in chapter_list]

    def get_chapter(self, chapter):
        """Fetch one (name, url) chapter, strip markup, and print the text.

        Raises AttributeError when the content <div> is absent (re.search
        returns None) — same failure mode as before, just documented.
        """
        name, url = chapter
        html = requests.get(url, headers=self.headers)
        html.encoding = "gbk"
        pattern = '<div class="noveContent">(.*?)</div>'
        # BUG FIX: use group(1) so the surrounding <div>...</div> tags are
        # not included in the extracted text (group() kept the whole match).
        body = re.search(pattern, html.text, re.S).group(1)
        # BUG FIX: the original passed re.S as re.sub's 4th POSITIONAL
        # argument, which is `count` (re.S == 16) — it silently stopped
        # after 16 replacements. Flags are irrelevant here (no '.' in the
        # pattern), so just drop them. Also: the original raw-string
        # r'\\u3000' matched the literal text "\u3000"; the intent is the
        # ideographic space U+3000 used for paragraph indents.
        content = re.sub('\r\n|\u3000|<br>', '\n', body)
        # "Save" currently means print to stdout; no file is written.
        print(content)
        print('\r%s已写入' % name, end='')

    def run(self):
        """Crawl the chapters of the first listed novel, then stop."""
        # Get the novel links for the first listing page.
        urls = self.get_novel_list()
        for url in urls:
            chapter_list = self.get_chapter_list(url)
            # Fetch and print every chapter of this novel.
            for chapter in chapter_list:
                self.get_chapter(chapter)
            # The original called exit() here (indentation was mangled, but
            # the apparent intent is to stop after the first novel); break
            # achieves the same scope without raising SystemExit.
            break
if __name__ == "__main__":
    # Script entry point: build the spider and start crawling.
    QuanShuSpider().run()
# Blog title residue: "爬取全书网" ("Crawling quanshu.com") — kept as a comment so the file stays valid Python.
# Blog footer residue: "latest recommended article published 2022-08-18 23:11:40".