'''
url = 'http://www.qu.la/paihangbang/'
Scrape the ranked novels from the Biquge (笔趣阁) ranking pages.
'''
import os

import requests
import bs4
# Fetch an entire web page
def get_html(url):
    try:
        r = requests.get(url, timeout=300)
        r.raise_for_status()
        # The site's encoding was checked manually and is set explicitly,
        # which skips requests' slower automatic detection.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return "Something went wrong!"
# Collect the ranked novels and their links:
def get_content(url):
    '''
    Scrape the ranking list for each novel category,
    write the entries to a file in order
    (each line holds the novel's title and its link),
    collect the links in a list,
    and return that list of URLs.
    '''
url_list = []
html = get_html(url)
soup = bs4.BeautifulSoup(html, 'lxml')
    # Because of the page layout, the history and completed-novel lists
    # sit in a different div from the other categories.
category_list = soup.find_all('div', class_='index_toplist mright mbottom')
history_finished_list = soup.find_all('div', class_='index_toplist mbottom')
    for cate in category_list:  # walk each regular category
        name = cate.find('div', class_='toptab').span.string
        with open('小说/novel_list.csv', 'a') as f:
            f.write("\nCategory: {} \n".format(name))
        print("\nCategory: {} \n".format(name))
        # The overall ranking block can be located directly via its style attribute
        general_list = cate.find(style='display: block;')
        # Every novel title sits inside an <li> tag
        book_list = general_list.find_all('li')
        # Walk the list and pull out each novel's title and link
        for book in book_list:
            link = 'http://www.qu.la' + book.a['href']
            title = book.a['title']
            # Collect every novel URL in one list
            url_list.append(link)
            # Append mode ('a') keeps earlier output from being wiped
            with open('小说/novel_list.csv', 'a') as f:
                f.write("Novel: {:<} \t Link: {:<} \n".format(title, link))
            print("Novel: {:<} \t Link: {:<} \n".format(title, link))
    for cate in history_finished_list:
        name = cate.find('div', class_='toptab').span.string
        with open('小说/novel_list.csv', 'a') as f:
            f.write("\nCategory: {} \n".format(name))
        print("\nCategory: {} \n".format(name))
        general_list = cate.find(style='display: block;')
        book_list = general_list.find_all('li')
        for book in book_list:
            link = 'http://www.qu.la' + book.a['href']
            title = book.a['title']
            url_list.append(link)
            with open('小说/novel_list.csv', 'a') as f:
                f.write("Novel: {:<} \t Link: {:<} \n".format(title, link))
            print("Novel: {:<} \t Link: {:<} \n".format(title, link))
return url_list
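# Quick usage sketch (a minimal example, separate from the crawl itself):
# calling get_content writes 小说/novel_list.csv and returns the novel links.
#
#   links = get_content('http://www.qu.la/paihangbang/')
#   print('{} novels found'.format(len(links)))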
# Collect every chapter link of a single novel:
def get_txt_url(url):
    '''
    Collect the URL of every chapter of the given novel
    and create the novel's output file.
    '''
    url_list = []
    html = get_html(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    chapter_items = soup.find_all('dd')
    txt_name = soup.find('h1').text  # the novel's title
    with open('小说/{}.txt'.format(txt_name), "a+") as f:
        f.write('Novel title: {} \n'.format(txt_name))
    for item in chapter_items:
        url_list.append(str(url) + item.a['href'])
    return url_list, txt_name
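# The concatenation above assumes each <dd><a> href is relative to the book
# page URL. A more robust join (a sketch using the standard library, not
# wired into the function above) would be:
#
#   from urllib.parse import urljoin
#   url_list.append(urljoin(url, item.a['href']))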
# Fetch one chapter page and append its text to the local file:
def get_one_txt(url, txt_name):
    '''
    Fetch the text of one chapter
    and append it to the local file.
    '''
    # Turn <br/> tags into newlines before parsing so paragraph breaks survive .text
    html = get_html(url).replace('<br/>', "\n")
    soup = bs4.BeautifulSoup(html, 'lxml')
    try:
        # Strip the site's injected 'chaptererror();' script call from the text
        txt = soup.find('div', id='content').text.replace('chaptererror();', '\n')
        title = soup.find('title').text.split('_')[0]  # split on '_' and keep the first part
        with open('小说/{}.txt'.format(txt_name), "a", encoding='gb18030', errors='ignore') as f:
            f.write(title + '\n')
            f.write(txt)
        print('Novel: {}  chapter: {}  downloaded'.format(txt_name, title))
    except AttributeError as e:
        # soup.find() returns None when the page lacks the expected markup
        print('---->', e)
def main():
    url = "https://www.qu.la/paihangbang/"
    # The output directory must exist before any file is written
    os.makedirs('小说', exist_ok=True)
    url_list = get_content(url)  # collect every novel link on the ranking page
    for url in url_list:  # walk the novels
        # Collect the chapter links and the novel's title
        url_list1, txtname = get_txt_url(url)
        for url1 in url_list1:  # walk the chapters
            get_one_txt(url1, txtname)  # fetch each chapter and append it to the file

if __name__ == "__main__":
    main()
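# A polite crawler would pause between chapter requests; a minimal sketch
# (the 0.5 s interval is an arbitrary assumption, not from the original):
#
#   import time
#   for url1 in url_list1:
#       get_one_txt(url1, txtname)
#       time.sleep(0.5)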