零组文库,想用的抓紧

零组文库已经是关闭了,在网上找到了一个备份站点:主页 · 资料文库

http://book.iwonder.run/index.html

网页太多,就写了个脚本来批量抓取;但是请求太频繁,被站点限制访问了。脚本如下:

#!C:\Python3.7
# -*- coding:utf-8 -*-

import os
import re
import time

import requests
from lxml import etree


def get_date(url, delay=0.5):
    """Crawl the library site at *url* and mirror it under <cwd>/osec.

    Walks the navigation tree (up to three directory levels), creates a
    matching local folder hierarchy, saves every page as an .html file and
    downloads the images each page references.

    url   -- base URL of the site (trailing slash expected).
    delay -- seconds to sleep between page downloads; throttles the crawl
             so the server does not block us for sending too many requests.
    """
    try:
        req = requests.get(url)
        req.encoding = "UTF-8"
        html = etree.HTML(req.text)
        # 1st-level directories: /html/body/div/div[1]/nav/ul/li
        for directory in html.xpath("/html/body/div/div[1]/nav/ul/li"):
            directory_name = directory.xpath("./span/text()")
            if not directory_name:
                continue
            # [1:-1] drops the surrounding quote characters in the label
            directory_name = directory_name[0].replace("\n", "").strip()[1:-1]
            mkdir(directory_name)  # create 1st-level folder

            # 2nd-level directories: ./ul/li under the 1st-level entry
            for sub in directory.xpath("./ul/li"):
                sub_name = sub.xpath("./span/text()")
                if not sub_name:
                    continue
                sub_name = sub_name[0].replace("\n", "").strip()
                sub_dir = os.path.join(directory_name, sub_name)
                mkdir(sub_dir)                        # create 2nd-level folder
                mkdir(os.path.join(sub_dir, 'img'))   # create image folder

                href = sub.xpath("./ul/li/a/@href")
                if href:
                    _crawl_pages(url, href, sub_dir, sub_name, delay, "2")
                else:
                    # 3rd-level directories
                    print("3")
                    for third in sub.xpath("./ul/li"):
                        third_name = third.xpath("./span/text()")
                        # Guard BEFORE indexing [0]: the original indexed
                        # first and raised IndexError on entries without a
                        # <span> label.
                        if not third_name:
                            continue
                        third_name = third_name[0].replace("\n", "").strip()
                        if not third_name:
                            continue
                        third_dir = os.path.join(sub_dir, third_name)
                        mkdir(third_dir)                       # 3rd-level folder
                        mkdir(os.path.join(third_dir, 'img'))  # image folder
                        href = third.xpath("./ul/li/a/@href")
                        if href:
                            _crawl_pages(url, href, third_dir, sub_name, delay, "3")
        print("[****] finish!")
    except Exception as e:
        # Top-level boundary: report and stop rather than crash the script.
        print(e)


def _crawl_pages(base_url, hrefs, dir_path, sub_name, delay, tag):
    """Download every relative href in *hrefs* into *dir_path*, plus images.

    tag is the progress label printed before each URL ("2" or "3", matching
    the directory depth). External (absolute) links are skipped.
    """
    for u in hrefs:
        if "http" in u:  # absolute/external link — not part of the mirror
            continue
        page_url = base_url + u
        print(tag, page_url)
        result = page(page_url)  # page() returns None when the fetch fails
        if result is None:
            continue  # skip this page instead of aborting the whole crawl
        data, file_name = result
        save_file(os.path.join(dir_path, file_name + ".html"), data.encode())
        for img_url in get_img_url(page_url, sub_name):
            save_img(dir_path, img_url)
        if delay:
            time.sleep(delay)  # be polite: avoid triggering rate limiting



def mkdir(name):
    """Create the directory <cwd>/osec/<name> if it does not already exist.

    Uses os.makedirs so missing intermediate levels (including the top-level
    "osec" folder itself) are created too — os.mkdir would fail when any
    parent directory is absent.
    """
    target = os.path.join(os.getcwd(), "osec", name)
    if not os.path.exists(target):
        os.makedirs(target)
        print("[D*]mkdir ", target, "success!")
    else:
        print("[D-]", target, "exists!")


def save_file(name, data):
    """Write *data* (bytes) to <cwd>/osec/<name>; never overwrite a file."""
    path = os.path.join(os.getcwd(), "osec", name)
    if os.path.exists(path):
        # Existing files are left untouched so a re-run resumes cheaply.
        print("[f-] write ", path, " exists!")
        return
    with open(path, "wb") as fh:
        fh.write(data)
        print("[f*] write ", path, " success!")


def page(url):
    """Fetch *url* and return (cleaned_html, filename), or None on failure.

    The navigation sidebar (<nav role="navigation"> ... </nav>) is stripped
    from the page source so the saved copy stands alone. The file name comes
    from the page's <h1> title, falling back to the last URL path component,
    and is sanitized for the local filesystem.
    """
    try:
        req = requests.get(url)
        req.encoding = 'UTF-8'
        data = req.text

        # Strip the navigation block — including the closing tag, which the
        # original version left behind. Skip entirely if either marker is
        # missing (find() == -1 would otherwise slice garbage).
        start = data.find('<nav role="navigation">')
        close_tag = '</nav>'
        stop = data.find(close_tag, start) if start != -1 else -1
        if start != -1 and stop != -1:
            data = data.replace(data[start:stop + len(close_tag)], "")

        # Derive the local file name from the page title.
        html = etree.HTML(req.text)
        filename = html.xpath("/html/body/div/div[2]/div/div[2]/div/div/div[1]/section/h1/text()")
        if filename:
            filename = filename[0].strip()
        else:
            filename = url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        # Replace characters that are unsafe in file names (covers the
        # original's <, =, /, \ plus the remaining Windows-invalid set).
        filename = re.sub(r'[<>:"/\\|?*=]', "_", str(filename))
        return data, filename
    except Exception as e:
        # Best effort: report and let the caller skip this page. Returning
        # None preserves the original failure contract, but no longer
        # swallows the error silently.
        print("[page-] fetch failed:", url, e)
        return None


def get_img_url(page_url, sub_name):
    """Return absolute URLs for every image embedded in the page at *page_url*.

    Image src attributes on the site are relative, so each one is prefixed
    with the fixed base URL plus *sub_name*.
    NOTE(review): the base URL is hard-coded to the "0day" section — pages
    from other sections would get wrong image URLs; verify against the site.
    """
    base_url = "http://book.iwonder.run/0day/"
    req = requests.get(page_url)
    req.encoding = 'UTF-8'
    html = etree.HTML(req.text)
    src_list = html.xpath('//*[@id="book-search-results"]/div[1]/section/p/img/@src')
    # Build absolute URLs in one pass (the original initialized a dead list,
    # then mutated element-by-element with a range(len(...)) loop).
    return [base_url + sub_name + "/" + src for src in src_list]


def save_img(path, img_url):
    """Download *img_url* into <cwd>/osec/<path>/img, skipping existing files.

    Download errors are reported but do not abort the crawl (best effort).
    """
    img_dir = os.path.join(os.getcwd(), "osec", path, "img")
    filename = str(img_url).rsplit("/", 1)[-1]
    file_path = os.path.join(img_dir, filename)
    if os.path.exists(file_path):
        print("[img--] save img ", file_path, " exists!")
        return
    try:
        file_data = requests.get(img_url).content
        with open(file_path, "wb") as file:
            file.write(file_data)
            print("[img**] save img ", file_path, " success!")
    except Exception as e:
        # Was a bare "except: pass" (which even traps KeyboardInterrupt and
        # hides every failure) — narrow the catch and report what went wrong.
        print("[img-] save img failed:", img_url, e)




if __name__ == '__main__':
    # Entry point: mirror the whole site starting from its front page.
    url = "http://book.iwonder.run/"
    # Sample article URL kept from the author's manual tests (unused here).
    page_url = "http://book.iwonder.run/0day/74cms/%E6%96%B0%E7%89%8874cms%20v4.2.1-v4.2.129-%E5%90%8E%E5%8F%B0getshell%E6%BC%8F%E6%B4%9E.html"

    get_date(url)

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
零组文库是一个非常受欢迎的文学作品分享平台,上面拥有大量的优秀作品和作者。尽管内容可能不全,但是它提供了epub格式的下载选项,使得读者可以方便地在各种电子设备上阅读。 epub是一种开放的电子书格式,适用于多种阅读器和平台。相比其他格式,epub具有可扩展性和可定制性,使得读者可以根据自己的喜好进行调整。同时,epub是一个支持多媒体内容的格式,可以包含图像、音频和视频等元素,为读者带来更加丰富的阅读体验。 通过将零组文库的作品以epub格式提供下载,读者可以在手机、平板电脑或电子阅读器上随时随地进行阅读。无论是在家中、办公室还是旅途中,都能随时享受到精彩的阅读体验。而且,epub格式具有自动调整字体大小和排版等功能,使得阅读更加舒适,并且可以根据个人喜好进行设置。 另外,在零组文库中下载epub格式的作品,也可以方便地进行标注和批注。读者可以在文本中划线、做笔记,或者进行高亮,方便自己做读书笔记或做出阅读思考。这些功能都能够帮助读者更好地理解和消化作品中的内容。 总而言之,零组文库提供epub格式的下载选项,为读者带来了更加便捷和丰富的阅读体验。无论在电子设备上阅读,还是进行标注和批注,都能够更好地享受文学作品带来的乐趣和思考。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值