HTMLParser

最新推荐文章于 2021-06-03 03:14:37 发布

百里行者

最新推荐文章于 2021-06-03 03:14:37 发布

阅读量461

点赞数

分类专栏： python 文章标签： url html import hp class

本文链接：https://blog.csdn.net/kevin6216/article/details/7020778

版权

python 专栏收录该内容

32 篇文章 0 订阅

订阅专栏

#coding:utf-8

import urllib2
from HTMLParser import HTMLParser
 
class MyHTMLParser(HTMLParser):
    """Collect one attribute's value from every start tag matching a filter.

    A start tag is selected when its name equals ``get_tag`` and its
    ``key_word`` attribute equals ``key_value``.  For every selected tag that
    also carries the ``key_properby`` attribute, that attribute's value is
    appended to ``self.links`` in the order the tags are encountered.
    """

    def __init__(self, get_tag, key_word, key_value, key_properby):
        """Store the filter settings and start with an empty result list.

        Args:
            get_tag: Tag name to inspect, e.g. ``'a'``.
            key_word: Name of the attribute used as the filter.
            key_value: Value the ``key_word`` attribute must have.
            key_properby: Name of the attribute whose value is collected
                (spelling kept as-is so keyword callers keep working).
        """
        # Direct base-class call: under Python 2 HTMLParser is an old-style
        # class, so super() cannot be used here.
        HTMLParser.__init__(self)
        self.get_tag = get_tag
        self.key_word = key_word
        self.key_value = key_value
        self.key_properby = key_properby
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the wanted attribute when a start tag passes the filter.

        Args:
            tag: Name of the start tag being reported by the parser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != self.get_tag or not attrs:
            return
        # When an attribute name is repeated, dict() keeps the last value.
        attr_map = dict(attrs)
        if attr_map.get(self.key_word) != self.key_value:
            return
        # A direct lookup replaces scanning every attribute for the one name.
        if self.key_properby in attr_map:
            self.links.append(attr_map[self.key_properby])

def _fetch_html(url):
    """Download ``url`` and return the response body as read by urllib2."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() raises.
        response.close()


def source_url_get():
    """Print the links flvcd.com reports for each video on a Youku folder page.

    Steps:
      1. Download the hard-coded Youku folder page and collect the ``href``
         of every ``<a>`` tag whose ``charset`` attribute is ``'5-1'``.
      2. For each collected link, request the flvcd.com parse page with the
         link as the ``kw`` query value.
      3. From that page, collect the ``href`` of every ``<a>`` tag whose
         ``onclick`` is ``'_alert();return false;'`` and print them, one per
         line.  An empty line is printed when nothing was found.

    Returns:
        None.  Results are written to standard output.
    """
    folder_url = 'http://v.youku.com/v_vpfoldervideolist/page_1_id_55814819_f_5316052_o_1_p_9.html?__rt=1&__ro=vpfoldervideolist'

    folder_parser = MyHTMLParser('a', 'charset', '5-1', 'href')
    folder_parser.feed(_fetch_html(folder_url))
    folder_parser.close()

    for link in folder_parser.links:
        # NOTE(review): link is placed in the query string unescaped, exactly
        # as before; confirm whether the service expects a URL-encoded value.
        parse_url = "http://www.flvcd.com/parse.php?flag=&format=&kw=%s" % link
        # A fresh parser per page so results from different videos don't mix.
        result_parser = MyHTMLParser('a', 'onclick', '_alert();return false;', 'href')
        result_parser.feed(_fetch_html(parse_url))
        result_parser.close()
        # Single parenthesised argument: same output as the print statement.
        print('\n'.join(result_parser.links))
    
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    source_url_get()