准备
安装Python以及必要的模块(requests, lxml)
思路
- 获取网站url 采用异常处理try
- 解析内容获得到作者 章节
- 编辑文件写入章节 作者
- 进一步获取文章内容
- 编辑文件继续写入
代码
import requests
import time
import sys
from lxml import etree
from urllib import parse
# Fetch a URL and return the decoded page text.
def get_content(url):
    """Download *url* and return its body text decoded as UTF-8.

    Returns the sentinel string " ERROR " when the request fails, so
    callers keep the original error-handling contract.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    try:
        # timeout keeps the crawler from hanging forever on a dead host
        r = requests.get(url=url, headers=headers, timeout=30)
        r.raise_for_status()  # treat 4xx/5xx as failures instead of parsing error pages
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Report where the failure happened instead of swallowing everything
        # with a bare except.
        s = sys.exc_info()
        print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))
        return " ERROR "
#获取小说内容
def get_analysis(content):
    """Parse the search-result page, let the user pick a book by number,
    then download every chapter of the chosen book and write it via save().

    content -- HTML text of the search-result page.
    Exits the process when no result is found or the input is invalid.
    """
    # save() reads the current chapter title through this module-level name;
    # kept for backward compatibility with the original design.
    global chapterName2
    ele = etree.HTML(content.encode('utf-8'))
    results = ele.xpath("//div[@class='result-game-item-detail']")
    if not results:
        print('没有找到你想要的小说')
        exit()
    index = []
    # List every hit with a 1-based number so the user can choose.
    for i, item in enumerate(results, start=1):
        print(i)
        href = item.xpath("h3/a/@href")[0]
        title = item.xpath("h3/a/@title")[0]
        author = item.xpath("div/p/span[2]/text()")[0]
        index.append(href)
        print(href, title, author)
    bookIndex = input('你选择下载那一篇小说(输入序号数字)')
    # Validate both that the answer is a number AND that it is inside the
    # listed range (the original crashed with IndexError on out-of-range input).
    if not bookIndex.isdigit() or not (1 <= int(bookIndex) <= len(index)):
        print("请正确输入")
        exit()
    bookPage = get_content(index[int(bookIndex) - 1])
    ele2 = etree.HTML(bookPage.encode('utf-8'))
    chapters = ele2.xpath("//div[@id='list']/dl/dd/a/@href")
    print(chapters)
    chapterNames = ele2.xpath("//div[@id='list']/dl/dd/a/text()")
    # Download each chapter page and append its text to the book file.
    for n, chapterHref in enumerate(chapters):
        tmpeUrl = 'https://www.xsbiquge.com' + chapterHref
        print(tmpeUrl)
        chapterHtml = get_content(tmpeUrl)
        ele3 = etree.HTML(chapterHtml.encode('utf-8'))
        paragraphs = ele3.xpath("//div[@id='content']/text()")
        finishedProduct = "\n".join(paragraphs)
        chapterName2 = chapterNames[n]
        save(finishedProduct)
#写入文件
def save(fil, title=None, book=None):
    """Append one chapter to the book's text file.

    fil   -- chapter body text.
    title -- chapter heading; falls back to the module global
             chapterName2 for backward compatibility.
    book  -- output file name stem (without ".txt"); falls back to the
             module global `books` set by main().
    """
    if title is None:
        title = chapterName2
    if book is None:
        book = books
    filename = book + ".txt"
    # 'with' guarantees the handle is closed — the original `f.close`
    # (missing parentheses) never actually closed the file.
    with open(filename, "a+", encoding='utf-8') as f:
        f.write(title + '\n')
        f.write(fil + '\n')
# 主程序
def main():
    """Prompt for a book title, search the site, download the whole book,
    and report the elapsed time."""
    global books
    start_time = time.time()
    books = input('你要搜索的小说是:')
    # URL-encode the (likely Chinese) title for the query string.
    ret = parse.quote(books)
    url = 'https://www.xsbiquge.com/search.php?keyword=' + ret
    print(url)
    content = get_content(url)
    get_analysis(content)
    end_time = time.time()
    print('程序用时', end_time - start_time)


# Guard the entry point so importing this module does not start a download.
if __name__ == "__main__":
    main()
正当我要放弃的时候我看到了错误!!!
不足:
代码繁琐
变量没有搞清又使用了全局变量
没法指定下载只能下载全部
目标:
改进上面不足
最终版本已更新
ps:变量乱的要死还没注释请别喷