# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import os, sys
from pathlib import Path
# Downloader class
class downloader(object):
    """Scrape a novel from www.xuliehao.org chapter by chapter.

    Workflow: ``get_download_url`` fills ``names``/``urls`` from the index
    page, ``mkdirdir`` prepares the output directory, and ``writer`` appends
    each chapter fetched by ``get_contents`` to its own text file.
    """

    def __init__(self):
        self.server = 'http://www.xuliehao.org/'            # site root, prepended to relative chapter hrefs
        self.target = 'http://www.xuliehao.org/novel20931/'  # index page of the novel to download
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs, parallel to self.names
        self.nums = 0    # chapter count (len of the two lists above)

    # Collect the chapter list from the novel's index page.
    def get_download_url(self):
        """Fetch the index page and populate ``names``, ``urls`` and ``nums``."""
        req = requests.get(url=self.target, timeout=30)  # timeout: don't hang forever on a dead server
        req.encoding = 'gbk'  # site serves GBK-encoded pages
        soup = BeautifulSoup(req.text, 'lxml')  # explicit parser, consistent with get_contents
        div = soup.find_all(id="list")  # <div id="list"> holds the chapter anchors
        # Search the already-parsed tag directly instead of re-parsing str(div[0]).
        anchors = div[0].find_all('a')
        # NOTE(review): the first 15 anchors are skipped — presumably duplicate
        # "latest chapters" links at the top of the index; verify against the site.
        chapters = anchors[15:]
        self.nums = len(chapters)
        for each in chapters:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    # Fetch one chapter page.
    def get_contents(self, target):
        """Download the chapter page at ``target`` and return its body text."""
        req = requests.get(url=target, timeout=30)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'lxml')
        texts = soup.find_all(id="content")  # <div id="content"> holds the chapter body
        return texts[0].text

    # Prepare the output directory.
    def mkdirdir(self, dirname):
        """Create ``<cwd>/<dirname>`` if needed and remember it in ``self.path``."""
        dirname = os.path.join(os.getcwd(), dirname)
        if Path(dirname).is_dir():
            print(dirname + " is exist")
        else:
            os.mkdir(dirname, 0o777)
        self.path = dirname

    # Append one chapter to its own file.
    def writer(self, name, text):
        """Append chapter ``text`` under heading ``name`` to ``<self.path>/<name>.txt``."""
        filename = os.path.join(self.path, name + ".txt")
        filename = filename.replace(' ', '')  # strip spaces from chapter titles (as original did)
        print(filename)
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')
# Script entry point: build the chapter list, then save every chapter to disk.
if __name__ == "__main__":
    book = downloader()
    book.get_download_url()
    book.mkdirdir("testStory")
    for title, link in zip(book.names, book.urls):
        book.writer(title, book.get_contents(link))
# Notes:
# 1. The novel's table of contents is located via the tag with id="list" in the index page's HTML.
# 2. Each chapter's body text is located via the tag with id="content" in the chapter page's HTML.