这个爬虫目前只能下载选定的热门小说,不支持自选关键词搜索下载,日后会补充并改进。
选定小说网址:
- 笔趣阁
爬取:
- 需要导入的包
import requests
from lxml import etree
import os
import re
import time
import datetime
- 获取主页面上的小说分类地址(注:当前实现只返回分类链接地址,不返回分类名称)
def main_html(url, headers):
    """Fetch the site homepage and return the category-navigation link hrefs.

    Parameters:
        url (str): homepage URL of the novel site.
        headers (dict): HTTP request headers (e.g. a User-Agent).

    Returns:
        list[str]: hrefs of the nav links, with the first two and the last
        entry dropped (they are non-category links such as "home").

    Raises:
        requests.RequestException: on network failure or timeout.
        requests.HTTPError: on a non-2xx HTTP status.
    """
    # XPath selects the @href attributes of the nav <a> elements.
    nav_href_xpath = "//*[@id='wrapper']/div[2]/ul/li/a//@href"
    # timeout so a dead server can't hang the crawler forever;
    # raise_for_status so we don't silently parse an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Many Chinese novel sites serve GBK; trust the detected encoding
    # instead of the (often wrong) default so .text is not mojibake.
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    # Slice off the leading "home"-style links and the trailing extra link.
    return html.xpath(nav_href_xpath)[2:-1]
- 获取主页面下的热门小说列表。
def next_html(url,headers): # 该分类下的热门小说
xpath_book = '//*[@id="newscontent"]/div[2]/ul/li/span/a/text()' # 书名称
xpath_url = '//*[@id="newscontent"]/div[2]/ul/li/span/a/@href' # 书的url
xpath_id = '//*[@id="newscontent"]/div[2]/ul/li/span/text()' # 小说作者
name_url = [] # 用来存放小说的名称与地址
rep = requests.get(url=url,headers=headers).text
html = etree.HTML(rep)
r_book = html.xpath(xpath_book)
r_url = html.xpath(xpath_url)
BookAndUrl = dict(zip(r_book,r_url))
print(r_book)
try:
s_book = input("============请输入你想要看的小说全称============"+"\n")
s_url = BookAndUrl[s_book