Python novel scraper, part 2

Web crawler, take 2
def get_html(urls):
    # Fetch a page; utf-8-sig also strips a BOM if the page has one.
    import urllib.request as ur
    try:
        page = ur.urlopen(urls)
        return page.read().decode('utf-8-sig')
    except Exception:  # not a bare except, which would swallow Ctrl-C too
        return ""


def get_url(html_page):
    # Find the next 'a href="..."' and return (link, index after the link).
    start_position = html_page.find('a href="')
    if start_position == -1:
        return None, 0
    start_position += len('a href="')
    end_position = html_page.find('"', start_position)
    return html_page[start_position:end_position], end_position


def get_all_url(seed):
    html_page = get_html(seed)
    useful_links = []
    while True:
        a_url, end = get_url(html_page)
        if not a_url:
            break
        if a_url not in useful_links and is_useful(a_url):
            useful_links.append(a_url)
        # Drop everything up to the link just found and keep scanning.
        html_page = html_page[end + 1:]
    useful_links.sort()  # sort once here, not on every loop iteration
    return useful_links


def is_useful(a_url):
    import re
    # Chapter links look like "/0_176/1234567.html"; the unescaped dots
    # happen to match the "_" and "/" in that pattern.
    return bool(re.match(r"/\d+.\d+.+\.html", a_url))


def get_content(html_page):
    # The chapter body sits between the "readx()" and "read3()" ad scripts.
    start_flag = html_page.find("readx()")
    if start_flag < 0:
        return "Failed to fetch the chapter"
    end_flag = html_page.find("read3()")
    content_page = html_page[start_flag:end_flag]
    start_flag = content_page.find("</script>")
    if start_flag < 0:
        return "Failed to fetch the chapter"
    start_flag += len("</script>")  # was += 8, but "</script>" is 9 characters
    end_flag = content_page.find("</div>")
    return deal_content(content_page[start_flag:end_flag])


def deal_content(content_page):
    # Turn HTML non-breaking spaces and line breaks into plain text.
    first_deal = content_page.replace("&nbsp;", " ")
    # Chain from first_deal; the original replaced on content_page again,
    # silently discarding the first pass.
    second_deal = first_deal.replace("<br/>", "\n")
    return second_deal + "\n"


def get_title(html_page):
    # The chapter title is the <h1> inside the "bookname" block.
    start_flag = html_page.find("bookname")
    if start_flag < 0:
        return "Failed to fetch the chapter title"
    start_flag = html_page.find("<h1>", start_flag) + len("<h1>")
    end_flag = html_page.find("</h1>", start_flag)
    return html_page[start_flag:end_flag]


def write_to_txt(name, seed):
    import os
    os.makedirs("./res", exist_ok=True)  # open() fails if ./res is missing
    all_useful_links = get_all_url(seed)
    # Links are site-relative ("/0_176/1234.html"), so strip the trailing
    # "/0_176/" (7 characters) from the seed to recover the site root.
    site_root = seed[:-7]
    with open("./res/" + name, "a", encoding="utf-8") as new_file:
        while all_useful_links:
            a_link = site_root + all_useful_links.pop(0)
            print(a_link)
            html_page = get_html(a_link)
            content_title = get_title(html_page)
            content = get_content(html_page)
            new_file.write(content_title + "\n" + content)
    print("Caching finished")




seed = "http://www.biquge.com/0_176/"
write_to_txt("大主宰.txt",seed)

Compared with version 1, the readability and portability of this code are much better, but for some reason it runs very slowly.

Any help appreciated. Two guesses, with quick sketches, follow.
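
A likely CPU-side culprit: every iteration of get_all_url copies the whole remaining page with html_page[end + 1:], which is quadratic in the page size, and the "a_url not in useful_links" membership test is linear per link. A minimal one-pass sketch with a precompiled regex (get_all_url_fast is a made-up name; it assumes the links appear literally as a href="..." the way get_url expects):

import re

def get_all_url_fast(seed):
    # Grab every chapter link in one pass instead of re-slicing the
    # page string and re-checking membership on each iteration.
    html_page = get_html(seed)
    link_pattern = re.compile(r'a href="(/\d+.\d+[^"]+\.html)"')
    # set() deduplicates in O(1) per link; sorted() runs exactly once.
    return sorted(set(link_pattern.findall(html_page)))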

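The dominant cost, though, is almost certainly the network: urlopen fetches the chapters strictly one at a time, so total time is roughly the sum of every round trip. A sketch that overlaps the downloads with a thread pool (write_to_txt_parallel and the worker count are my own choices, not part of the original):

import os
from concurrent.futures import ThreadPoolExecutor

def write_to_txt_parallel(name, seed):
    # Chapter downloads are I/O-bound, so threads overlap the waiting.
    links = [seed[:-7] + u for u in get_all_url(seed)]
    os.makedirs("./res", exist_ok=True)
    with ThreadPoolExecutor(max_workers=8) as pool:
        # map() yields results in submission order, so chapters land in
        # the output file in the same order as the link list.
        pages = pool.map(get_html, links)
        with open("./res/" + name, "a", encoding="utf-8") as new_file:
            for html_page in pages:
                new_file.write(get_title(html_page) + "\n" + get_content(html_page))
    print("Caching finished")

A site may throttle or ban parallel fetches, so max_workers should stay small; profiling the sequential version with cProfile first would confirm where the time actually goes.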