# Crawl every novel on the Biquge (xbiquge) site; pay attention to the crawl order.
from pathlib import Path
from urllib.parse import urljoin

import requests as r
from lxml import etree
class Pavilion:
    """Crawler that saves the novels listed on the xbiquge index page.

    The crawl proceeds strictly in page order: books in the order they
    appear on the index page, and chapters in the order they appear on each
    book page.  Every book is appended, chapter by chapter, to
    ``<BOOKS_DIR>/<book name>.txt``.
    """

    # Site root; relative chapter/book hrefs are resolved against it.
    BASE_URL = "http://www.xbiquge.la/"
    # Index page listing the books to crawl.
    INDEX_URL = "http://www.xbiquge.la/xiaoshuodaquan/"
    # Output directory, built from components so it works on any platform.
    BOOKS_DIR = Path("crawler_demo") / "crawler_three" / "books"
    # Seconds to wait for the server before giving up on a request; without
    # a timeout a single stalled connection would hang the crawl forever.
    REQUEST_TIMEOUT = 30
    # Path separators and characters Windows forbids in file names -> "_".
    _UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|', "_"))

    @classmethod
    def _fetch_tree(cls, url):
        """Download *url* and return the page parsed as an lxml HTML tree.

        The response body is decoded as UTF-8.  Network errors and timeouts
        raised by ``requests`` propagate to the caller.
        """
        response = r.request(method="GET", url=url, timeout=cls.REQUEST_TIMEOUT)
        return etree.HTML(response.content.decode())

    @classmethod
    def _book_path(cls, book_name):
        """Return the output file path for *book_name*.

        The name comes from a scraped page, so characters that would act as
        path separators or are invalid in file names are replaced with "_".
        """
        safe_name = book_name.translate(cls._UNSAFE_FILENAME_CHARS)
        return cls.BOOKS_DIR / (safe_name + ".txt")

    @classmethod
    def get_book(cls):
        """Return the book-page hrefs found on the index page, in page order."""
        tree = cls._fetch_tree(cls.INDEX_URL)
        return tree.xpath("//div[@class='novellist']/ul/li/a/@href")

    @classmethod
    def get_chapter(cls):
        """Crawl every book from :meth:`get_book` and save all its chapters.

        For each book page the title (first ``<h1>``) and the chapter hrefs
        are extracted and handed to :meth:`get_content`.  A book page without
        an ``<h1>`` is skipped instead of aborting the whole crawl.
        """
        for book_url in cls.get_book():
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            tree = cls._fetch_tree(urljoin(cls.BASE_URL, book_url))
            titles = tree.xpath("//h1/text()")
            if not titles:
                print("skipping book without a title: " + book_url)
                continue
            book_name = titles[0]  # title of the current book
            book_chapter_urls = tree.xpath("//div[@id='list']/dl/dd/a/@href")
            cls.get_content(book_chapter_urls, book_name)

    @classmethod
    def get_content(cls, book_chapter_urls, book_name):
        """Download each chapter and append it to the book's text file.

        Args:
            book_chapter_urls: Chapter hrefs (relative to the site root or
                absolute), in reading order.
            book_name: Book title; used (sanitised) as the output file name.

        Each chapter is written as its heading, the text nodes of the
        ``#content`` div, and a blank-line separator.  A chapter page without
        an ``<h1>`` heading is skipped.
        """
        book_path = cls._book_path(book_name)
        for book_chapter_url in book_chapter_urls:
            print("换章节啦++++++++++++++++++++++")
            tree = cls._fetch_tree(urljoin(cls.BASE_URL, book_chapter_url))
            headings = tree.xpath("//h1/text()")
            if not headings:
                print("skipping chapter without a heading: " + book_chapter_url)
                continue
            book_chapter_name = headings[0]
            print(book_chapter_name)
            chapter_content = tree.xpath("//div[@id='content']/text()")
            # Create the output directory lazily so nothing is created when
            # there is nothing to write.
            book_path.parent.mkdir(parents=True, exist_ok=True)
            # Re-open per chapter so finished chapters are on disk even if a
            # later request fails.
            with book_path.open("a", encoding="utf-8") as out:
                out.write(book_chapter_name + "\n")
                out.writelines(chapter_content)
                out.write("\n\n")
# Script entry point: start the full crawl only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    Pavilion.get_chapter()