python之下载小说

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import os, sys
from pathlib import Path
#下载类
class downloader(object):
    """Download every chapter of one novel into per-chapter text files.

    Typical use: call ``get_download_url()`` to collect chapter names and
    links from the index page, ``mkdirdir()`` to choose the output folder,
    then ``writer(name, get_contents(url))`` for each chapter.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the whole download.
    timeout = 10

    # Translation table for file names: whitespace is dropped (the original
    # script removed spaces from chapter names) and characters that are
    # illegal in file names on common platforms become underscores.
    _FILENAME_TABLE = str.maketrans('\\/:*?"<>|', '_' * 9, ' \t\r\n')

    def __init__(self):
        self.server = 'http://www.xuliehao.org/'
        self.target = 'http://www.xuliehao.org/novel20931/'
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters
        # Output folder; defaults to the working directory until
        # mkdirdir() selects a dedicated one.
        self.path = os.getcwd()

    def get_download_url(self, skip=15):
        """Collect chapter titles and links from the index page.

        Fetches ``self.target``, decodes it as GBK, and reads every ``<a>``
        inside the first element with ``id="list"``.  Results replace the
        contents of ``self.names`` / ``self.urls`` and ``self.nums`` is set
        to the number of chapters kept.

        Args:
            skip: Number of leading links to ignore (default 15, the value
                the original script hard-coded).
                NOTE(review): presumably these are a "latest chapters"
                teaser that duplicates the real list - confirm on the site.

        Raises:
            IndexError: if the page has no element with ``id="list"``.
        """
        from urllib.parse import urljoin

        req = requests.get(url=self.target, timeout=self.timeout)
        req.encoding = 'gbk'
        # Name the parser explicitly (the same one get_contents uses) so the
        # result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(req.text, 'lxml')
        listing = soup.find(id="list")
        if listing is None:
            raise IndexError('no element with id="list" found at ' + self.target)

        # Start from a clean slate so a repeated call does not append
        # duplicates and leave nums out of step with the lists.
        self.names = []
        self.urls = []
        for link in listing.find_all('a')[skip:]:
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            self.names.append(link.get_text())
            # urljoin copes with absolute and root-relative hrefs, which
            # plain string concatenation would mangle.
            self.urls.append(urljoin(self.server, href))
        self.nums = len(self.urls)

    def get_contents(self, target):
        """Return the text of the chapter page at ``target``.

        The page is decoded as GBK and the text of the first element with
        ``id="content"`` is returned.

        Raises:
            IndexError: if the page has no element with ``id="content"``.
        """
        req = requests.get(url=target, timeout=self.timeout)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'lxml')
        content = soup.find(id="content")
        if content is None:
            raise IndexError('no element with id="content" found at ' + target)
        return content.text

    def mkdirdir(self, dirname):
        """Create ``dirname`` under the current working directory.

        The resulting path is stored in ``self.path`` for ``writer``.  If
        the folder already exists a notice is printed and it is reused.
        """
        target_dir = os.path.join(os.getcwd(), dirname)
        if os.path.isdir(target_dir):
            print(target_dir + " is exist")
        else:
            # exist_ok closes the gap between the check above and the
            # creation; missing parent folders are created as well.
            os.makedirs(target_dir, mode=0o777, exist_ok=True)
        self.path = target_dir

    def writer(self, name, text):
        """Append one chapter to ``<self.path>/<name>.txt``.

        Only the file name is sanitised (whitespace removed, illegal
        characters replaced by ``_``); the folder part is left untouched so
        an output directory containing spaces still resolves.  The chapter
        title is written unmodified as the first line of the entry.
        """
        filename = name.translate(self._FILENAME_TABLE) + ".txt"
        filepath = os.path.join(self.path, filename)
        print(filepath)
        with open(filepath, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            # writelines accepts both a single string and a list of lines.
            f.writelines(text)
            f.write('\n\n')
# Script entry point: fetch the chapter index, then save each chapter.
if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    dl.mkdirdir("testStory")
    # names and urls are filled in lockstep, so walk them pairwise.
    for chapter_name, chapter_url in zip(dl.names, dl.urls):
        chapter_text = dl.get_contents(chapter_url)
        dl.writer(chapter_name, chapter_text)

1. 小说的章节目录位于 html 中 id="list" 的标签内(代码中通过 find_all(id="list") 获取,其中的每个 <a> 标签对应一个章节):

2. 每章对应的小说正文位于 html 中 id="content" 的标签内(代码中通过 find_all(id="content") 获取)。

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值