# 用HTMLParser解析html (parse the HTML with HTMLParser)
# Reference: https://www.cnblogs.com/liuhaidon/archive/2019/12/18/12060184.html
'''
Description:
Author: pdh
Date: 2020-11-09 11:52:39
LastEditors: pdh
LastEditTime: 2020-11-09 15:37:57
FilePath: \html_mp3\down.py
'''
# coding=utf-8
import requests
import os
import asyncio
import sys
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor
gPool = ThreadPoolExecutor(50)  # NOTE(review): this pool is created (and declared `global` in deal) but never actually used — candidate for removal
class CParseHtml(HTMLParser):
    """Base HTML parser shared by the page/music parsers.

    Holds the state the subclasses use:
      - mlog: path of a debug log file, recreated on every construction
      - mPrepared: flag subclasses set when the interesting section starts
      - mUrls: URLs collected by the subclass tag handlers
    """

    def __init__(self):
        super().__init__()
        # Start every run with a fresh log file.
        self.mlog = "log.txt"
        if os.path.isfile(self.mlog):
            os.remove(self.mlog)
        self.mPrepared = False
        self.mUrls = []

    def pinfo(self, text, flag=False):
        """Print *text*; when *flag* is true, also append it to the log file."""
        if flag:
            # Explicit UTF-8: the logged HTML contains non-ASCII text, and the
            # platform default encoding (e.g. GBK on Windows) would raise
            # UnicodeEncodeError here.
            with open(self.mlog, "a", encoding="utf-8") as f:
                f.write(text + "\n")
        print(text)
'''
Description: 解析音乐所在的网页地址
Author: pdh
Date: 2020-11-09 14:34:10
'''
class CParserPages(CParseHtml):
    """Collect the per-episode page links from the index page.

    An ``<h3>`` heading switches collection on; every ``href`` of an
    ``<a>`` tag is then recorded until the enclosing ``</ul>`` closes
    the list again.
    """

    def __init__(self):
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # The first <h3> marks the start of the link list.
        if tag == 'h3':
            self.mPrepared = True
        if not self.mPrepared or tag != 'a':
            return
        # attrs is a list of (name, value) pairs from HTMLParser.
        for name, value in attrs:
            if name == 'href':
                self.mUrls.append(value)

    def handle_endtag(self, tag):
        # Leaving the <ul> that holds the links stops collection.
        if self.mPrepared and tag == 'ul':
            self.mPrepared = False
# def handle_startendtag(self, tag, attrs): # 识别没有结束标签的HTML标签,例如<img />等。
# self.pinfo("startendtag:tag = {}, attrs={}".format(tag, attrs))
# def handle_data(self, data): # 对标签之间的数据的处理方法。<tag>test</tag>,data指的是“test”。
# self.pinfo("data: {}".format(data))
# def handle_comment(self, data): # 对HTML中注释的处理方法。
# self.pinfo("comment:data = {}".format(data))
'''
Description: 从网页中取出mp3文件真实地址并下载
Author: pdh
Date: 2020-11-09 14:35:01
'''
class CParserMusic(CParseHtml):
    """Extract the real media URL (``<source src=...>``) from a player page."""

    def __init__(self):
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # Verbose tracing: every start tag is echoed and written to the log.
        self.pinfo("stattag {}, attrs = {}".format(tag, attrs), True)
        if tag != 'source':
            return
        for name, value in attrs:
            if name == 'src':
                self.mUrls.append(value)

    def handle_endtag(self, tag):
        self.pinfo("endtag: {}".format(tag), True)
        # NOTE(review): mPrepared is never set to True by this class, so this
        # branch is effectively dead; kept to mirror the original behaviour.
        if self.mPrepared and tag == 'source':
            self.mPrepared = False
def getNames(url):
    """Fetch the index page at *url* and return the episode page URLs.

    Parses the response body with CParserPages and returns its collected
    link list. On a non-200 response the process exits with status 1
    (fail fast: without the index there is nothing to do).
    """
    print("getNames:url={}".format(url))
    # Bound the request so a stalled server cannot hang the script forever.
    result = requests.get(url, timeout=30)
    if result.status_code == 200:
        content = result.content.decode("utf-8")
        parsePage = CParserPages()
        parsePage.feed(content)
        return parsePage.mUrls
    print("getNames:error {}".format(result))
    # sys.exit raises SystemExit, so no statement after it can run — the
    # original's trailing `return []` was unreachable and has been removed.
    sys.exit(1)
async def downOne(url, loop):
    """Download one episode: fetch the player page at *url*, pull the real
    media URL out of its <source> tag, and save the file locally.

    The local file is named after the page, keeping the media extension
    (e.g. ``123.html`` -> ``123.m4a``). *loop* is accepted for interface
    compatibility but not used.
    """
    print("downOne: url = " + url)
    result = requests.get(url, timeout=30)
    if result.status_code != 200:
        print("downOne: error url={}".format(url))
        return
    # NOTE(review): every task overwrites the same debug dump, so under
    # concurrency this file only reflects the last page fetched.
    with open("music.html", "wb") as f:
        f.write(result.content)
    content = result.content.decode("utf-8")
    parseMp3 = CParserMusic()
    await asyncio.sleep(0.01)  # yield to the event loop between requests
    parseMp3.feed(content)
    if len(parseMp3.mUrls) != 1:
        print("downOne: error {}".format(parseMp3.mUrls))
        return
    musicUrl = parseMp3.mUrls[0]
    fm = requests.get(musicUrl, timeout=60)
    if fm.status_code == 200:
        # Name the local file after the page and keep the media extension.
        # os.path.splitext replaces the original str.strip('.html'), which
        # strips *characters* (any of '.', 'h', 't', 'm', 'l') from both
        # ends and mangles names such as 'html178.html'; it also handles
        # basenames with more than one dot, which the old split('.')
        # length check silently skipped.
        ext = os.path.splitext(os.path.basename(musicUrl))[1]
        if ext:
            newName = os.path.splitext(os.path.basename(url))[0] + ext
            with open(newName, "wb") as f:
                f.write(fm.content)
        print("musicUrl = {}".format(musicUrl))
        print("url = {}".format(url))
        # http://e1.ixinmo.com/asdasdasd/368/1/1604907060/1727375ca5265526d95a0123d4f51380/0cf5ad54c37623026e1a66976188b6a9.m4a
        # http://e1.ixinmo.com/asdasdasd/368/1/1604906520/b1f1c1b94a295d6b5521946b7d6c5568/2ffef8c6bde7331e04c9ef8691117549.m4a
async def deal(loop):
    """Fetch the episode list and download the episodes concurrently.

    *loop* is passed through to downOne (which ignores it). The dead
    `global gPool` statement was removed: gPool was declared but never
    used in this function.
    """
    url = "http://www.ixinmo.com/shu/178.html"
    ts = []
    pages = getNames(url)
    for p in pages:
        task = asyncio.ensure_future(downOne(p, loop))
        ts.append(task)
        # NOTE(review): this break limits the run to the first episode only —
        # it looks like a debugging leftover; remove it to download them all.
        break
    result = await asyncio.gather(*ts)
    print("deal: result = {}".format(result))
if __name__ == "__main__":
    # Drive the async download pipeline, making sure the event loop is
    # closed even when a download raises.
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(deal(event_loop))
    finally:
        event_loop.close()
    print("main:end")