Python爬虫_小说爬取进化版

最新推荐文章于 2024-07-18 15:53:23 发布

Corgy.

最新推荐文章于 2024-07-18 15:53:23 发布

阅读量184

点赞数

分类专栏： python 文章标签： python xpath

本文链接：https://blog.csdn.net/small_dog_/article/details/106166661

版权

python 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

准备

安装 Python 以及必要的模块（requests、lxml；XPath 解析功能由 lxml 提供）

新笔趣阁

思路

针对上次所写代码中的问题，做了以下改正：

指定下载
解决一下全局变量的问题
完善部分下载函数

作用

可以爬取网站内的任意小说，并可选择要下载的章节范围和保存路径。

代码

import requests
import time
import sys
import os
from lxml import etree
from urllib import parse


# 首先获取访问网站的URl
def get_content(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: URL of the page to download.

    Returns:
        The page text on success, or the sentinel string ``" ERROR "`` when
        the request fails (the failure is reported on stdout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    try:
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url=url, headers=headers, timeout=30)
        # Decode as UTF-8 regardless of what the response headers advertise.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as exc:
        # Only network/HTTP failures are turned into the sentinel; Ctrl-C and
        # programming errors propagate instead of being silently swallowed.
        print("Error '%s' happened on line %d" % (exc, exc.__traceback__.tb_lineno))
        return " ERROR "


# 解析网站找到小说的本书
def get_analysis(content):
    """List the search hits found in *content* and let the user pick one.

    Each hit is printed as a running number followed by its link, title and
    author; the collected links are then handed to ``get_pick``.

    Args:
        content: HTML text of the search-result page.

    Returns:
        Whatever ``get_pick`` returns for the collected book links.

    Raises:
        SystemExit: If the page contains no usable search hit.
    """
    ele = etree.HTML(content.encode('utf-8'))
    hits = ele.xpath("//div[@class='result-game-item-detail']")
    index = []
    for hit in hits:
        hrefs = hit.xpath("h3/a/@href")
        if not hrefs:
            # A hit without a link cannot be downloaded; skip it so the
            # printed numbers stay aligned with positions in ``index``.
            continue
        titles = hit.xpath("h3/a/@title")
        authors = hit.xpath("div/p/span[2]/text()")
        index.append(hrefs[0])
        print(len(index))
        # Missing title/author is shown as an empty field instead of crashing.
        print(hrefs[0], titles[0] if titles else '', authors[0] if authors else '')
    if not index:
        print('没有找到你想要的小说')
        # sys.exit() rather than exit(): the latter is an interactive helper
        # installed by the site module and is not guaranteed to exist.
        sys.exit()
    return get_pick(index)


# 挑选小说
def get_pick(index):
    """Ask which listed book to download and return its chapter links.

    Args:
        index: Book page URLs, in the order they were shown to the user
            (the user answers with a 1-based position).

    Returns:
        The ``href`` values of the links found under ``div#list`` on the
        chosen book's page.

    Raises:
        SystemExit: If the answer is not a whole number between 1 and
            ``len(index)``.
    """
    answer = input('你选择下载那一篇小说（输入序号数字）')
    try:
        # int() is the real validator: str.isdigit() also accepts characters
        # such as superscript digits that int() cannot convert.
        book_number = int(answer)
    except ValueError:
        book_number = 0
    # Reject 0 (which would silently select the last book via index[-1])
    # and numbers beyond the list (which would raise IndexError).
    if not 1 <= book_number <= len(index):
        print("请正确输入")
        sys.exit()
    book_page = get_content(index[book_number - 1])
    tree = etree.HTML(book_page.encode('utf-8'))
    return tree.xpath("//div[@id='list']/dl/dd/a/@href")


# 选择章节或者全部下载
def get_select(chapter):
    """Return every chapter link, or a user-chosen inclusive range of them.

    The user is asked whether to download the whole book. Answering ``no``
    (or ``n``, in any letter case) prompts for two 1-based chapter numbers,
    one per line; any other answer selects all chapters.

    Args:
        chapter: List of chapter links in reading order.

    Returns:
        The full list, or the slice from the first to the last requested
        chapter (inclusive). An end number past the list is truncated to the
        final chapter.

    Raises:
        SystemExit: If a chapter number is not an integer or the end of the
            range lies before its start.
    """
    judge = input('是否下载全本 （回车默认全部） yes/no')
    if judge.strip().lower() not in ('no', 'n'):
        print('开始下载全部')
        return chapter

    print('请输入你要下载的章节 ?--? ')
    try:
        first = int(input())
        last = int(input())
    except ValueError:
        print("请正确输入")
        sys.exit()
    # A start below 1 would become a negative slice index and pick chapters
    # from the end of the list, so clamp it to the first chapter.
    first = max(first, 1)
    if last < first:
        print("请正确输入")
        sys.exit()
    return chapter[first - 1:last]


# 正文解析获取
def get_mainFiction(chapter):
    """Download each chapter page and append its text to the book file.

    For every link the page is fetched with ``get_content``, the heading
    under ``div.bookname`` and the text nodes of ``div#content`` are
    extracted, and the result is written with ``save`` into the directory
    named by the module-level ``PATH``.

    Args:
        chapter: Iterable of chapter links, resolved against the site root.
    """
    for href in chapter:
        # urljoin handles root-relative, relative and absolute links alike.
        chapter_url = parse.urljoin('https://www.xsbiquge.com', href)
        page = get_content(chapter_url)
        tree = etree.HTML(page.encode('utf-8'))
        titles = tree.xpath("//div[@class='bookname']/h1/text()")
        if not titles:
            # No heading means the fetch failed or the page is not a chapter;
            # skip it instead of aborting the whole download with IndexError.
            print(chapter_url + '\t下载失败')
            continue
        paragraphs = tree.xpath("//div[@id='content']/text()")
        save(titles[0], "\n".join(paragraphs), PATH)
        print(titles[0] + '\t下载完成')


# 保存本地或者选择保存地点
def save(chapterName, finishedProduct, path):
    """Append one chapter to ``<BOOKS>.txt`` inside *path*.

    The file name comes from the module-level ``BOOKS`` value. The target
    directory is created when it does not exist yet.

    Args:
        chapterName: Chapter heading, written on its own line.
        finishedProduct: Chapter body text, written after the heading.
        path: Directory that holds the output file.
    """
    os.makedirs(path, exist_ok=True)
    # os.path.join picks the right separator on every platform.
    target = os.path.join(path, BOOKS + ".txt")
    # The context manager closes the file; the original ``f.close`` lacked
    # parentheses and therefore never ran.
    with open(target, "a", encoding='utf-8') as f:
        f.write(chapterName + '\n')
        f.write(finishedProduct + '\n')


# 主程序
def main():
    """Run the interactive search, selection and download workflow.

    Sets the module-level ``BOOKS`` (search term, also used as the output
    file name) and ``PATH`` (output directory) that the other functions
    read, then prints how long the download took.
    """
    global BOOKS, PATH

    BOOKS = input('你要搜索的小说是：')
    search_url = 'https://www.xsbiquge.com/search.php?keyword=' + parse.quote(BOOKS)
    # Fetch the result page, let the user pick a book, then a chapter range.
    chapter_links = get_select(get_analysis(get_content(search_url)))

    PATH = input('选择保存路径（回车默认本地）')
    started = time.time()  # timing covers the download only
    if PATH == '':
        # An empty answer means the current working directory.
        PATH = os.getcwd()
        print(PATH)
    get_mainFiction(chapter_links)
    finished = time.time()
    print('下载用时', finished - started)


# Run the workflow only when executed as a script, so that importing this
# module does not start prompting for input.
if __name__ == "__main__":
    main()

总结：

还存在一些小bug
希望大家可以挑一下刺，尤其是在代码的基础和简练方面，我将不胜感激。
蟹蟹大家

Corgy.

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Python爬虫_小说爬取进化版

准备安装Python以及必要的模块（requests，xpath）新笔趣阁思路继上次写的代码问题改正:指定下载解决一下全局变量的问题完善部分下载函数作用可以选择爬取网站内任意小说选择章数保存路径代码import requestsimport timeimport sysimport osfrom lxml import etreefrom urllib import parse# 首先获取访问网站的URldef get_content(url): tr
复制链接

扫一扫