python解析html

最新推荐文章于 2024-06-08 11:12:05 发布

水星灭绝

最新推荐文章于 2024-06-08 11:12:05 发布

阅读量207

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/wulong710/article/details/109581129

版权

python 专栏收录该内容

86 篇文章 3 订阅

订阅专栏

用HTMLParser解析html

参考：https://www.cnblogs.com/liuhaidon/archive/2019/12/18/12060184.html

r'''
Description: Download audio files referenced by HTML pages, parsed with HTMLParser.
Author: pdh
Date: 2020-11-09 11:52:39
LastEditors: pdh
LastEditTime: 2020-11-09 15:37:57
FilePath: \html_mp3\down.py
'''
# coding=utf-8


import requests
import os
import asyncio
import sys
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor

# Module-wide thread pool, capped at 50 worker threads.
gPool = ThreadPoolExecutor(max_workers=50)


class CParseHtml(HTMLParser):
    """Base HTML parser holding a URL list and a small logging helper.

    Creating an instance deletes any existing ``log.txt`` in the current
    working directory, so every parser starts with an empty log.
    """

    def __init__(self):
        super().__init__()
        # Path of the log file written by pinfo().
        self.mlog = "log.txt"
        if os.path.isfile(self.mlog):
            os.remove(self.mlog)
        # Flag subclasses may use to mark the region of interest.
        self.mPrepared = False
        # URLs collected while parsing.
        self.mUrls = []

    def pinfo(self, text, flag=False):
        """Print *text*; when *flag* is true also append it to the log file.

        Args:
            text: Message to output (a newline is added in the log file).
            flag: If true, the message is appended to ``self.mlog`` as well.
        """
        if flag:
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII text found in tag attributes.
            with open(self.mlog, "a", encoding="utf-8") as f:
                f.write(text + "\n")
        print(text)


'''
Description: 解析音乐所在的网页地址
Author: pdh
Date: 2020-11-09 14:34:10
'''


class CParserPages(CParseHtml):
    """Collect link targets from the section of a page that follows an ``<h3>``.

    Collection is switched on by an ``<h3>`` start tag and switched off by
    the next ``</ul>`` end tag. While it is on, the ``href`` of every ``<a>``
    start tag is appended to ``self.mUrls``.
    """

    def handle_starttag(self, tag, attrs):
        """Handle a start tag such as ``<div id="main">``.

        Args:
            tag: Tag name (``div`` in the example above).
            attrs: List of ``(name, value)`` tuples, one per attribute.
        """
        if tag == 'h3':
            self.mPrepared = True
            return
        if not self.mPrepared or tag != 'a':
            return
        for name, value in attrs:
            # A valueless attribute (``<a href>``) is reported with value
            # None; skip it (and empty values) so mUrls only holds strings
            # that can actually be requested.
            if name == 'href' and value:
                self.mUrls.append(value)

    def handle_endtag(self, tag):
        """Handle an end tag such as ``</div>``; ``</ul>`` stops collection."""
        if tag == 'ul':
            self.mPrepared = False


'''
Description: 从网页中取出mp3文件真实地址并下载
Author: pdh
Date: 2020-11-09 14:35:01
'''


class CParserMusic(CParseHtml):
    """Collect the ``src`` of every ``<source>`` tag found in a page.

    Every start tag and end tag is also printed and appended to the log
    file through ``pinfo``.
    """

    def handle_starttag(self, tag, attrs):
        """Handle a start tag such as ``<source src="...">``.

        Args:
            tag: Tag name.
            attrs: List of ``(name, value)`` tuples, one per attribute.
        """
        self.pinfo("stattag {}, attrs = {}".format(tag, attrs), True)
        if tag != 'source':
            return
        for name, value in attrs:
            # A valueless ``src`` attribute is reported with value None;
            # skip it (and empty values) so mUrls only holds usable URLs.
            if name == 'src' and value:
                self.mUrls.append(value)

    def handle_endtag(self, tag):
        """Handle an end tag such as ``</div>``; it is only logged.

        This class never sets ``mPrepared`` to True, so there is no state
        to reset here.
        """
        self.pinfo("endtag: {}".format(tag), True)


def getNames(url, timeout=30):
    """Fetch the page at *url* and return the links CParserPages finds in it.

    Args:
        url: Address of the index page to download.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the script forever.

    Returns:
        The list of ``href`` values collected by ``CParserPages``.

    Exits the process with status 1 when the response status is not 200.
    """
    print("getNames:url={}".format(url))
    result = requests.get(url, timeout=timeout)
    if result.status_code != 200:
        print("getNames:error {}".format(result))
        sys.exit(1)

    content = result.content.decode("utf-8")
    parsePage = CParserPages()
    parsePage.feed(content)
    # Flush any data still buffered inside the parser.
    parsePage.close()
    return parsePage.mUrls


def _stripHtmlSuffix(name):
    """Return *name* without a trailing ``.html``; other names are unchanged."""
    suffix = '.html'
    if name.endswith(suffix):
        return name[:-len(suffix)]
    return name


async def downOne(url, loop, timeout=30):
    """Download the single audio file referenced by the page at *url*.

    The page is fetched, dumped to ``music.html`` and parsed with
    ``CParserMusic``. When exactly one ``<source src=...>`` URL is found,
    that file is downloaded into the current directory. It is named after
    the page (basename of *url* without ``.html``) with the audio file's
    extension.

    Args:
        url: Address of the page that embeds the audio file.
        loop: Event loop supplied by the caller; not used by this function.
        timeout: Seconds passed to ``requests.get`` for both requests.

    NOTE(review): ``requests.get`` is a blocking call, so the event loop is
    stalled while each request is in flight.
    """
    print("downOne: url = " + url)
    result = requests.get(url, timeout=timeout)
    if result.status_code != 200:
        print("downOne: error url={}".format(url))
        return

    # Keep a copy of the most recently fetched page on disk.
    with open("music.html", "wb") as f:
        f.write(result.content)

    content = result.content.decode("utf-8")
    parseMp3 = CParserMusic()
    await asyncio.sleep(0.01)
    parseMp3.feed(content)
    # Flush any data still buffered inside the parser.
    parseMp3.close()
    if len(parseMp3.mUrls) != 1:
        print("downOne: error {}".format(parseMp3.mUrls))
        return

    musicUrl = parseMp3.mUrls[0]
    fm = requests.get(musicUrl, timeout=timeout)
    if fm.status_code != 200:
        print("downOne: error status={} musicUrl={}".format(
            fm.status_code, musicUrl))
        return

    oldArr = os.path.basename(musicUrl).split(".")
    if len(oldArr) != 2:
        print("downOne: error unexpected file name {}".format(musicUrl))
        return

    # str.strip('.html') would remove any of the characters ".html" from
    # both ends of the name (e.g. "that.html" -> "a"); drop only the suffix.
    newName = _stripHtmlSuffix(os.path.basename(url)) + "." + oldArr[1]
    with open(newName, "wb") as f:
        f.write(fm.content)
    print("musicUrl = {}".format(musicUrl))
    print("url = {}".format(url))


async def deal(loop, url="http://www.ixinmo.com/shu/178.html", limit=1):
    """Download the audio of the pages linked from the index page at *url*.

    Args:
        loop: Event loop, forwarded unchanged to ``downOne``.
        url: Index page whose links are collected with ``getNames``.
        limit: Maximum number of linked pages to process (non-negative).
            The default of 1 handles only the first link; pass ``None`` to
            process every link.
    """
    pages = getNames(url)
    if limit is not None:
        pages = pages[:limit]

    # Schedule one download task per selected page and wait for all of them.
    ts = [asyncio.ensure_future(downOne(p, loop)) for p in pages]
    result = await asyncio.gather(*ts)
    print("deal: result = {}".format(result))


if __name__ == "__main__":
    # asyncio.get_event_loop() is deprecated when no loop is running, so the
    # loop is created explicitly; deal() takes the loop object as argument.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(deal(loop))
    finally:
        loop.close()

    print("main:end")