#小练习 使用HTMLParser获取data时注意事项 分类: pytho...

from HTMLParser import HTMLParser
class myHTMLParser(HTMLParser):
    """Collect ``href`` values of ``<a>`` tags and echo every text chunk.

    Attributes:
        links: ``href`` attribute values of the ``<a>`` start tags seen so
            far, in the order they were encountered.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` attribute(s) of an anchor start tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        # An anchor without attributes simply contributes nothing.
        self.links.extend(value for name, value in attrs if name == 'href')

    def handle_data(self, data):
        """Print one text chunk, framed by two separator lines.

        The chunk is printed exactly as received, so chunks consisting only
        of whitespace are printed too.  To skip those and trim the rest,
        filter on the stripped text instead:

            if data.strip():
                print(data.strip())

        Args:
            data: Text chunk handed over by the parser.
        """
        separator = '-' * 10
        # Single-argument print calls behave the same as a statement and as
        # a function, so the output is "data: <chunk>" either way.
        print(separator)
        print('data: %s' % (data,))
        print(separator)

if __name__=='__main__':
    # Demo input: three anchors whose tag and attribute names differ in
    # letter case and whose link texts carry surrounding spaces.
    sample_markup='''
    <a href="www.google.com"> goolge.com </a>
    <A Href="www.pythonclub.org">PythonClub </a>
    <A HREF='www.sina.com.cn'> sina </a>
    '''
    parser=myHTMLParser()
    parser.feed(sample_markup)
    parser.close()
    # To inspect the collected addresses, print parser.links here.


由于标签之间以及链接文本两端都可能含有空白字符(空格、换行),例如

<a href="www.google.com"> goolge.com </a>
中的链接文本 goolge.com 两端有空格,而且各个 <a> 标签之间的换行和缩进也会作为单独的 data 传给 handle_data,所以要对 data 进行 strip 处理(见代码中被注释掉的写法),否则会把纯空白也当作一个 data 输出。


#coding:utf-8
from sgmllib import SGMLParser

class urlparser(SGMLParser):
    """Collect the addresses and texts of ``<a>`` elements.

    Attributes (all re-created by ``reset``):
        flag: ``'a'`` while the parser is between ``<a>`` and ``</a>``,
            otherwise ``None``.
        l: ``href`` values of the anchors, in document order.
        v: Text chunks seen inside anchors, in document order.
        d: Mapping of link text to link address, filled by ``merge``.

    ``merge`` pairs ``l`` and ``v`` purely by position, so the mapping is
    only meaningful when every anchor contributes exactly one ``href`` and
    exactly one text chunk.
    """

    def reset(self):
        """(Re)initialise the collected state and the underlying parser."""
        # Marker for the current element: 'a' inside an anchor, else None.
        self.flag = None
        # Link addresses.
        self.l = []
        # Link texts.
        self.v = []
        # Link text -> link address.
        self.d = {}
        SGMLParser.reset(self)

    def start_a(self, attrs):
        """Mark the start of an anchor and remember its ``href`` value(s).

        Args:
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        self.flag = 'a'
        self.l.extend(value for name, value in attrs if name == 'href')

    def handle_data(self, data):
        """Keep a text chunk only while inside an ``<a>`` element."""
        if self.flag == 'a':
            self.v.append(data)

    def end_a(self):
        """Clear the marker on ``</a>``.

        Without this, text that follows an anchor (such as the "NEW" and the
        newlines in the sample input) would be collected as link text.
        """
        self.flag = None

    def merge(self):
        """Fill ``d`` by pairing the i-th text with the i-th address.

        Entries beyond the shorter of the two lists are ignored; a text that
        occurs more than once keeps the address of its last occurrence.
        """
        # zip walks both lists in lockstep, which is what the index
        # comparison in a nested loop amounted to, without the O(n*m) scan.
        for address, text in zip(self.l, self.v):
            self.d[text] = address

if __name__ == '__main__':
    # Demo input: a table cell with ten links; note the stray "NEW" text
    # after the first anchor, which must not end up in the result.
    urls='''
    <tr>
<td height="207" colspan="2" align="left" valign="top" class="normal">
<p>Damien Rice - 《0》 </p>
<a href="http://galeki.xy568.net/music/Delicate.mp3">1. Delicate</a><br />NEW
<a href="http://galeki.xy568.net/music/Volcano.mp3">2. Volcano</a><br />
<a href="http://galeki.xy568.net/music/The Blower's Daughter.mp3">3. The Blower's Daughter</a><br />
<a href="http://galeki.xy568.net/music/Cannonball.mp3">4. Cannonball </a><br />
<a href="http://galeki.xy568.net/music/Older Chests.mp3">5. Order Chests</a><br />
<a href="http://galeki.xy568.net/music/Amie.mp3">6. Amie</a><br />
<a href="http://galeki.xy568.net/music/Cheers Darlin'.mp3">7. Cheers Darling</a><br />
<a href="http://galeki.xy568.net/music/Cold Water.mp3">8. Cold water</a><br />
<a href="http://galeki.xy568.net/music/I Remember.mp3">9. I remember</a><br />
<a href="http://galeki.xy568.net/music/Eskimo.mp3">10. Eskimo</a></p>
</td>
</tr>
    '''
    upr=urlparser()
    upr.feed(urls)
    # Finish parsing before reading the results, so that anything the
    # parser is still holding back is processed first.
    upr.close()
    # To list the raw addresses, iterate over upr.l here.
    upr.merge()
    # Single-argument print calls give the same output whether print is a
    # statement or a function.
    for item in upr.d.items():
        print(item)
    print(len(upr.d))


结果:


('4. Cannonball ', 'http://galeki.xy568.net/music/Cannonball.mp3')
('1. Delicate', 'http://galeki.xy568.net/music/Delicate.mp3')
('6. Amie', 'http://galeki.xy568.net/music/Amie.mp3')
('8. Cold water', 'http://galeki.xy568.net/music/Cold Water.mp3')
('10. Eskimo', 'http://galeki.xy568.net/music/Eskimo.mp3')
('9. I remember', 'http://galeki.xy568.net/music/I Remember.mp3')
('5. Order Chests', 'http://galeki.xy568.net/music/Older Chests.mp3')
("3. The Blower's Daughter", "http://galeki.xy568.net/music/The Blower's Daughter.mp3")
('2. Volcano', 'http://galeki.xy568.net/music/Volcano.mp3')
('7. Cheers Darling', "http://galeki.xy568.net/music/Cheers Darlin'.mp3")
10

版权声明:本文为博主原创文章,未经博主允许不得转载。

转载于:https://www.cnblogs.com/think1988/p/4628022.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
import http.client
from html.parser import HTMLParser
import argparse
from concurrent.futures import ThreadPoolExecutor
import threading

prefix = "save/"
# Link addresses that have already been recorded once.
readed_path = set()
cur_path = []
# Link addresses discovered but not yet processed; appended under `lock`.
new_path = []
lock = threading.Lock()
cond = threading.Condition()


class MyHttpParser(HTMLParser):
    """Record links whose ``<a>`` element sits directly inside a ``<div>``.

    Attributes:
        tag: Stack of the tags that are currently open.
        href: ``href`` value of the most recent ``<a>`` start tag that had one.
        txt: Most recent text chunk seen while ``<a>`` was the innermost tag.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.tag = []
        self.href = ""
        self.txt = ""

    def handle_starttag(self, tag, attrs):
        """Push *tag* on the stack and remember the ``href`` of an anchor."""
        self.tag.append(tag)
        if tag == "a":
            for name, value in attrs:
                if name == "href":
                    self.href = value

    def handle_endtag(self, tag):
        """Report and queue the link when an anchor inside a div closes."""
        # NOTE(review): `> 2` requires at least one more open tag around the
        # div; confirm whether `>= 2` was intended.
        if tag == "a" and len(self.tag) > 2 and self.tag[-2] == "div":
            print("in div, link txt is %s ." % self.txt)
            print("in div, link url is %s ." % self.href)
            if self.href not in readed_path:
                readed_path.add(self.href)
                with lock:
                    new_path.append(self.href)
        # A stray closing tag can arrive with nothing open; popping an empty
        # stack would raise IndexError.
        if self.tag:
            self.tag.pop()

    def handle_data(self, data):
        """Keep the text chunk when the innermost open tag is ``<a>``."""
        if self.tag and self.tag[-1] == "a":
            self.txt = data


def LoadHtml(path, file_path):
    """Fetch ``file_path`` from host ``path`` over HTTP and return the body.

    Args:
        path: Host passed to ``http.client.HTTPConnection``.
        file_path: Request target; an empty value is replaced by ``"/"``.

    Returns:
        The response body decoded as UTF-8.  For a 301 response the
        ``Location`` header is appended to ``new_path`` and ``""`` is
        returned.  ``None`` is returned when the request fails; the
        exception's ``args`` are printed in that case.
    """
    if not file_path:
        file_path = "/"
    conn = http.client.HTTPConnection(path)
    try:
        conn.request("GET", file_path)
        response = conn.getresponse()
        print(response.status, response.reason, response.version)
        data = response.read().decode("utf-8")
        if response.status == 301:
            # Queue the redirect target instead of returning the body.
            with lock:
                new_path.append(response.getheader("Location"))
            data = ""
        return data
    except Exception as e:
        # Best effort: report the failure and let the caller see None.
        print(e.args)
        return None
    finally:
        # Close the connection on the failure path as well.
        conn.close()

# Caption that followed this snippet on the source page:
# "怎么设置文件的保存路径" ("How do I set the path files are saved to?")
最新发布
06-01

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值