Downloading MP3s with Python coroutines, multithreading, and HTMLParser

This article shows how to combine Python's HTMLParser class with coroutines and a thread pool to parse MP3 links out of an HTML page and download them efficiently.

The script below uses HTMLParser to parse the HTML pages and, along the way, downloads the MP3 files with coroutines plus a thread pool.
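
For orientation before the full script: html.parser works as a push parser, where feed() drives handle_starttag / handle_endtag / handle_data callbacks on a subclass. Below is a minimal, self-contained sketch of that callback model; the <ul class="playlist"> markup in it is made up for illustration and is not the markup of the target site.

from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    """Collect href values from <a> tags inside a <ul class="playlist">."""

    def __init__(self):
        super().__init__()
        self.inside_list = False
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # "playlist" is an illustrative class name, not the real site's marker
        if tag == "ul" and attrs.get("class") == "playlist":
            self.inside_list = True
        elif self.inside_list and tag == "a" and attrs.get("href"):
            self.links.append(attrs["href"])

    def handle_endtag(self, tag):
        if tag == "ul":
            self.inside_list = False


collector = LinkCollector()
collector.feed('<ul class="playlist"><li><a href="/books/1.html">one</a></li></ul>')
print(collector.links)  # ['/books/1.html']

The full script: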

# coding=utf-8
import requests
import os
import chardet
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from html.parser import HTMLParser

gPool = ThreadPoolExecutor(30)  # thread pool that runs the blocking requests.get calls
gLock = threading.Lock()        # serializes console output from concurrent downloads


class EProcess(Enum):
    idle = 0
    start = 1
    end = 2


class CParseHtml(HTMLParser):

    def __init__(self, url):
        HTMLParser.__init__(self)
        hName = os.path.basename(url)
        # drop any query string and the ".html" extension to build the log-file name
        # (str.strip(".html") would strip characters, not the suffix)
        prefix = os.path.splitext(hName.split("?")[0])[0]
        self.mlog = "log" + prefix + ".txt"
        if os.path.isfile(self.mlog):
            os.remove(self.mlog)
        self.mUrls = []
        self.mProcess = EProcess.idle
        self.mBasePrefix = "http://www.txxxxx.com"

    def pinfo(self, text, flag=False):
        if flag:
            with open(self.mlog, "a") as f:
                f.write(text + "\n")
        print(text)


class CParsePageFirst(CParseHtml):

    def __init__(self, url):
        CParseHtml.__init__(self, url)
        self.mUl = False

    def handle_starttag(self, tag, attrs):
        for kv in attrs:
            if len(kv) == 2:
                if kv[1] == "zaixianlianbo":
                    self.mProcess = EProcess.start
                elif self.mProcess == EProcess.start and kv[0] == "href":
                    ul = self.mBasePrefix + kv[1]
                    self.mUrls.append(ul)

    def handle_endtag(self, tag):
        if self.mProcess == EProcess.start and tag == "ul":
            self.mProcess = EProcess.end


class CParsePageDown(CParseHtml):
    def __init__(self, url):
        CParseHtml.__init__(self, url)
        self.mUl = False

    def handle_starttag(self, tag, attrs):
        if tag == "script":
            self.mProcess = EProcess.start

    def handle_endtag(self, tag):
        if tag == "script":
            self.mProcess = EProcess.idle

    def handle_data(self, data):
        if self.mProcess == EProcess.start and data.find("getAspParas") != -1:
            # the mp3 url is embedded in the inline script: it starts with
            # "http://mp3" and ends with ".mp3"
            httpPrefix = "http://mp3"
            httpSuffix = ".mp3"
            idxStart = data.find(httpPrefix)
            idxEnd = data.find(httpSuffix, idxStart)  # search after the prefix
            if idxStart != -1 and idxEnd != -1:
                mp3Url = data[idxStart:idxEnd + len(httpSuffix)]
                self.mUrls.append(mp3Url)


async def getHtml(url, loop):
    # run the blocking requests.get inside the thread pool so the event loop stays responsive
    result = await loop.run_in_executor(gPool, requests.get, url)
    if result.status_code == 200:
        en = chardet.detect(result.content)
        # print("en = {}".format(en))
        encode = en["encoding"] or "utf-8"  # fall back if detection fails
        if encode.lower() == "gb2312":
            # gb18030 is a superset of gb2312 and decodes more pages cleanly
            encode = "gb18030"
        content = result.content.decode(encode, errors="replace")
        # base = os.path.basename(url)
        # with open(base.strip("?"), "w") as f:
        #     f.write(content)
        return 200, content
    return result.status_code, ""


async def dealOne(url, loop):
    code, content = await getHtml(url, loop)
    if code == 200:
        down = CParsePageDown(url)
        down.feed(content)
        if len(down.mUrls) != 1:
            down.pinfo("dealOne: expected exactly one mp3 url, got {}".format(len(down.mUrls)))
        else:
            mp3Url = down.mUrls[0]
            saveName = os.path.basename(mp3Url)
            if not os.path.isfile(saveName):
                result = await loop.run_in_executor(gPool, requests.get, mp3Url)
                if result.status_code != 200:
                    return mp3Url, False
                with open(saveName, "wb") as f:
                    f.write(result.content)
                with gLock:
                    print("down ok {}".format(saveName))
            return saveName, True
    return url, False


async def dealAll(url, loop):
    code, content = await getHtml(url, loop)
    if code == 200:
        parseFirst = CParsePageFirst(url)
        parseFirst.feed(content)

        # schedule one coroutine per sub-page and wait for all of them
        ts = [asyncio.ensure_future(dealOne(ul, loop)) for ul in parseFirst.mUrls]
        rs = await asyncio.gather(*ts)
        return rs
    else:
        print("dealAll: failed to fetch {}".format(url))
    return []

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    try:
        url = "http://www.txxxxxxm/books/15051.html"
        rs = loop.run_until_complete(dealAll(url, loop))

        okCount = 0
        failCount = 0
        for name, success in rs:
            if success:
                okCount += 1
            else:
                failCount += 1
                print("fail name = {}".format(name))
        print("downloaded {} ok, {} failed".format(okCount, failCount))
    finally:
        loop.close()
    print("main:end")

 
