import requests
from pyquery import PyQuery as pq

def get_content(url):
    response = requests.get(url)
    response.encoding = 'gbk'  # the site serves its pages as GBK
    doc = pq(response.text)
    text = doc('#content.showtxt')
    bookname = doc('div.bookname h1')
    title = bookname.text() + '\n'  # chapter title as plain text
    raw = str(text)
    # basic filtering: turn <br/> pairs into newlines and strip the site's boilerplate
    content = (raw.replace(' <br/> <br/>', '\n')
                  .replace('<br/><br/>', '\n')
                  .replace('<script>chaptererror();</script><br/> 请记住本书首发域名:www.biqugexsw.com。笔趣阁小说网手机版阅读网址:m.biqugexsw.com</div>', '')
                  .replace('\xa0', '')
                  .replace('<div id="content" class="showtxt">', ''))
    with open(r'F:\python\小说下载区\小说.txt', 'a+', encoding='utf-8') as file:
        file.write(title)    # write the chapter title
        file.write(content)  # write the chapter body
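One inconsistency worth noting: get_content hard-codes GBK, while get_mulu below lets requests detect the charset. If the chapter pages ever change encoding, the hard-coded value will garble the text. The auto-detecting form (assuming detection works as well on the chapter pages as it does on the index page) is:

    response.encoding = response.apparent_encoding  # auto-detect, as get_mulu does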

def get_mulu():
    index_url = 'https://www.biqugexsw.com/71_71883/'  # replace with the URL of any novel on this site
    response = requests.get(index_url)
    response.encoding = response.apparent_encoding  # let requests detect the charset
    doc = pq(response.text)
    urls = doc('div.listmain a')
    length = len(urls)  # len() already returns an int
    count = 0
    for i in urls.items():
        url = 'https://www.biqugexsw.com/' + i.attr.href  # build each chapter URL
        get_content(url)
        count += 1
        print('Progress: %0.5f%%' % (count / length * 100))
get_mulu()
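The chapter URL is built by plain string concatenation, so if the hrefs are root-relative (e.g. /71_71883/123.html) the result contains a double slash. A more robust join with the standard library, assuming the links resolve against the site root:

    from urllib.parse import urljoin
    chapter_url = urljoin('https://www.biqugexsw.com/', i.attr.href)  # handles relative and root-relative hrefs alike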
An entry-level crawler: a novel of about 3,500 chapters took roughly 20 minutes, which is acceptable. Nothing stalled or timed out along the way, so this site presumably has no anti-scraping measures.
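If the site ever does add rate limiting, or starts blocking the default requests User-Agent, a little hardening goes a long way. A minimal sketch (the header value, timeout, and back-off delay are my own assumptions, not values the site is known to require):

    import time
    import requests

    HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like UA; not part of the original script

    def fetch(url, retries=3):
        # try a few times, pausing between attempts; re-raise after the last failure
        for attempt in range(retries):
            try:
                return requests.get(url, headers=HEADERS, timeout=10)
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(2)  # brief back-off before retrying

Swapping this fetch() in for the bare requests.get() calls in get_content and get_mulu would also make the crawl resilient to transient network errors.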