准备
安装Python以及必要的模块(requests、lxml;XPath 解析由 lxml 提供)
思路
针对上次代码中存在的问题做如下改正:
- 指定下载
- 解决一下全局变量的问题
- 完善部分下载函数
作用
可以爬取网站内的任意小说,并可选择下载的章节范围和保存路径
代码
import requests
import time
import sys
import os
from lxml import etree
from urllib import parse
# Fetch a page from the target site.
def get_content(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to request.

    Returns:
        The response text, or the sentinel string " ERROR " when the
        request fails (the failure is printed to stdout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(url=url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Only network/HTTP-layer failures are turned into the sentinel;
        # KeyboardInterrupt and programming errors propagate normally.
        print("Error '%s' happened on line %d" % (exc, exc.__traceback__.tb_lineno))
        return " ERROR "
    r.encoding = 'utf-8'
    return r.text
# Parse the search result page and list the novels it contains.
def get_analysis(content):
    """List every search hit in *content* and pass their links to get_pick.

    For each hit a running number is printed, followed by the link, the
    title attribute and the text the original code labels as the author.
    The program exits when the page contains no hits.

    Args:
        content: HTML text of the search result page.

    Returns:
        Whatever get_pick returns for the collected links.
    """
    page = etree.HTML(content.encode('utf-8'))
    hits = page.xpath("//div[@class='result-game-item-detail']")
    if not hits:
        print('没有找到你想要的小说')
        exit()
    links = []
    for number, hit in enumerate(hits, start=1):
        print(number)
        link = hit.xpath("h3/a/@href")[0]
        name = hit.xpath("h3/a/@title")[0]
        writer = hit.xpath("div/p/span[2]/text()")[0]
        links.append(link)
        print(link, name, writer)
    return get_pick(links)
# Let the user pick one of the listed novels.
def get_pick(index):
    """Ask which listed novel to download and return its chapter links.

    Args:
        index: List of novel page URLs, in the order they were printed.

    Returns:
        The href values of the links found under the element with
        id "list" on the chosen novel's page.

    The program prints a message and exits when the answer is not a whole
    number between 1 and len(index).
    """
    answer = input('你选择下载那一篇小说(输入序号数字)')
    try:
        bookIndex = int(answer)
    except ValueError:
        # Covers plain text as well as digit-like characters int() rejects.
        bookIndex = 0
    # Reject 0 and negatives explicitly: index[-1] would otherwise silently
    # select the last novel instead of reporting bad input.
    if not 1 <= bookIndex <= len(index):
        print("请正确输入")
        sys.exit(1)
    bookList = get_content(index[bookIndex - 1])
    ele2 = etree.HTML(bookList.encode('utf-8'))
    return ele2.xpath("//div[@id='list']/dl/dd/a/@href")
# Choose a chapter range or download everything.
def get_select(chapter):
    """Ask whether to download the whole book or only a range of chapters.

    Args:
        chapter: List of chapter links.

    Returns:
        *chapter* unchanged for a full download; otherwise the slice from
        the first to the last requested chapter (1-based, inclusive). An
        end value beyond the list is clamped by the slice.

    The program prints a message and exits when the range is not two whole
    numbers with 1 <= first <= last.
    """
    judge = input('是否下载全本 (回车默认全部) yes/no')
    # Anything other than an explicit "no" (including plain Enter) means all.
    if judge.strip().lower() not in ('no', 'n'):
        print('开始下载全部')
        return chapter
    print('请输入你要下载的章节 ?--? ')
    try:
        m = int(input())
        n = int(input())
    except ValueError:
        print("请正确输入")
        sys.exit(1)
    # A start below 1 would turn into a negative slice index and silently
    # select the wrong chapters, so it is rejected up front.
    if not 1 <= m <= n:
        print("请正确输入")
        sys.exit(1)
    return chapter[m - 1:n]
# Fetch the text of every selected chapter and store it.
def get_mainFiction(chapter):
    """Download each chapter in *chapter* and append it to the book file.

    Args:
        chapter: Iterable of chapter links; each one is resolved against
            the site root before being requested.

    The output directory is read from the module-level name PATH. A chapter
    whose page has no heading (for example because the request failed) is
    reported and skipped instead of aborting the remaining downloads.
    """
    for link in chapter:
        # urljoin gives the same URL as concatenation for "/path" links and
        # also copes with links that are already absolute.
        tmpeUrl = parse.urljoin('https://www.xsbiquge.com', link)
        page = get_content(tmpeUrl)
        ele3 = etree.HTML(page.encode('utf-8'))
        names = []
        if ele3 is not None:
            names = ele3.xpath("//div[@class='bookname']/h1/text()")
        if not names:
            print(tmpeUrl + '\t下载失败')
            continue
        chapterName = names[0]
        mainbody = ele3.xpath("//div[@id='content']/text()")
        save(chapterName, "\n".join(mainbody), PATH)
        print(chapterName + '\t下载完成')
# Append a chapter to the book file inside the chosen directory.
def save(chapterName, finishedProduct, path):
    """Append one chapter to "<BOOKS>.txt" inside *path*.

    Args:
        chapterName: Heading written on its own line before the text.
        finishedProduct: Chapter text, written after the heading.
        path: Target directory; created when it does not exist yet.

    The file name is taken from the module-level name BOOKS.
    """
    os.makedirs(path, exist_ok=True)
    filename = BOOKS + ".txt"
    # os.path.join picks the right separator on every platform, and the
    # with-block guarantees the file is flushed and closed after each chapter.
    with open(os.path.join(path, filename), "a", encoding='utf-8') as f:
        f.write(chapterName + '\n')
        f.write(finishedProduct + '\n')
# Program entry point.
def main():
    """Run the interactive search, selection and download, then print the time taken."""
    # BOOKS is read by save() and PATH by get_mainFiction(), so both are
    # published as module-level names.
    global BOOKS, PATH
    BOOKS = input('你要搜索的小说是:')
    search_url = 'https://www.xsbiquge.com/search.php?keyword=' + parse.quote(BOOKS)
    search_page = get_content(search_url)
    chapter_links = get_select(get_analysis(search_page))
    PATH = input('选择保存路径(回车默认本地)')
    started = time.time()  # timing starts once all questions are answered
    if PATH == '':
        # Plain Enter selects the current working directory.
        PATH = os.getcwd()
        print(PATH)
    get_mainFiction(chapter_links)
    print('下载用时', time.time() - started)
# Start the interactive session only when the file is executed as a script,
# so importing it (e.g. to reuse a helper) does not prompt for input.
if __name__ == "__main__":
    main()
总结:
- 还存在一些小bug
- 希望大家可以帮忙挑挑刺,尤其是在代码基础和简练方面,我将不胜感激
- 蟹蟹大家