网站:http://www.quanshuwang.com/
目标:爬取小说内容
步骤:
- 打开一本小说源代码
- 获取各个章节的链接
- 打开各个章节的源代码
- 提取各个章节的内容
- 下载内容
小说链接如下:
http://www.quanshuwang.com/book/9/9055(每本小说链接都不同,这本是盗墓笔记的链接)
各个章节的链接位于小说页面源码的 <li><a href="章节链接" title="章节标题"> 标签中,可用下方代码中的正则表达式提取。
需要注意的是,在打印源码内容之前要先确认网页源码的编码格式;该网站的编码是 'gbk'。
# coding=utf-8
import urllib.request
import re
import os
class Load_Novel:
    """Download every chapter of a novel from quanshuwang.com into text files.

    The novel's index page is fetched, the (link, title) pair of each chapter
    is pulled out with a regular expression, and the text of every chapter
    page is written to ``<output_dir>/<chapter title>.txt``.
    """

    # Index page downloaded when the caller does not supply one.
    _DEFAULT_NOVEL_URL = 'http://www.quanshuwang.com/book/9/9055'

    # Encoding used to decode the fetched pages.
    _PAGE_ENCODING = 'gbk'

    # Captures (href, title) for each chapter entry on the index page.
    _CHAPTER_LINK_RE = re.compile(
        r'<li><a href="(.*?)" title="(.*?)">.*?</a></li>'
    )

    # Captures the chapter text that sits between two <script> blocks.
    # re.S lets "." span newlines because the text covers many lines.
    # NOTE(review): the single space after "</script>" is kept exactly as it
    # appears in the original pattern; the page markup presumably had &nbsp;
    # entities there -- confirm against a live chapter page.
    _CHAPTER_BODY_RE = re.compile(
        r'</script> (.*?)<script type="text/javascript">', re.S
    )

    # Path separators and characters that are not allowed in file names on
    # Windows; a title containing one of them could not be opened as a file.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')

    def _fetch(self, url: str, timeout: float) -> str:
        """Return the page at *url*, decoded with the site's encoding."""
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read().decode(self._PAGE_ENCODING)

    def save_novel(
        self,
        novel_url: str = _DEFAULT_NOVEL_URL,
        output_dir: str = 'novel',
        timeout: float = 30,
    ) -> None:
        """Download all chapters of a novel and save one text file per chapter.

        Args:
            novel_url: URL of the novel's index page listing the chapters.
            output_dir: Directory that receives the ``.txt`` files; it is
                created when missing.
            timeout: Seconds to wait on each HTTP request before giving up.

        Raises:
            urllib.error.URLError: A page could not be fetched.
            UnicodeDecodeError: A page is not valid GBK.
        """
        # Function-scope import: the module only imports urllib.request.
        from urllib.parse import urljoin

        # Steps 1 and 2: open the novel page and collect every chapter link.
        index_html = self._fetch(novel_url, timeout)
        chapters = self._CHAPTER_LINK_RE.findall(index_html)
        if not chapters:
            return

        os.makedirs(output_dir, exist_ok=True)

        for href, chapter_title in chapters:
            # Step 3: fetch the chapter page. urljoin leaves absolute links
            # untouched and resolves relative ones against the index page.
            chapter_html = self._fetch(urljoin(novel_url, href), timeout)

            # Step 4: extract the chapter text.
            found = self._CHAPTER_BODY_RE.search(chapter_html)
            if found is None:
                # One page with an unexpected layout must not abort the
                # whole download; report it and move on.
                print("未找到正文,跳过%s" % chapter_title)
                continue

            # NOTE(review): the first replace appears to swap a space for a
            # space (a no-op); it presumably targeted &nbsp; entities in the
            # original markup -- confirm.
            novel_content = found.group(1).replace(' ', ' ')
            novel_content = novel_content.replace('<br />', '')

            print("正在保存%s" % chapter_title)
            file_name = self._UNSAFE_FILENAME_RE.sub('_', chapter_title)
            target = os.path.join(output_dir, '{}.txt'.format(file_name))
            # Explicit UTF-8 so that writing Chinese text does not depend on
            # the platform's default encoding.
            with open(target, 'w', encoding='utf-8') as f:
                f.write(novel_content)
# Script entry point: __name__ equals "__main__" only when this file is run
# directly, so importing it from another file does not start a download.
if __name__ == "__main__":
    Load_Novel().save_novel()