#coding:utf-8
import urllib2
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect one attribute's value from start tags that pass a filter.

    A start tag matches when its name equals ``get_tag`` and its
    ``key_word`` attribute equals ``key_value``.  For every matching tag
    that also carries the ``key_properby`` attribute, that attribute's
    value is appended to ``self.links`` in the order the tags are parsed.
    """

    def __init__(self, get_tag, key_word, key_value, key_properby):
        """Store the filter configuration and start with no results.

        Args:
            get_tag: Tag name to inspect (e.g. ``'a'``).
            key_word: Name of the attribute used to filter tags.
            key_value: Value ``key_word`` must have for a tag to match.
            key_properby: Name of the attribute whose value is collected.
        """
        # Explicit base-class call kept on purpose: the Python 2 HTMLParser
        # is an old-style class, so super() cannot be used there.
        HTMLParser.__init__(self)
        self.get_tag = get_tag
        self.key_word = key_word
        self.key_value = key_value
        self.key_properby = key_properby
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``key_properby`` value of a matching start tag.

        Args:
            tag: Tag name reported by the parser.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        # Tags of another kind, or without any attributes, never match.
        if tag != self.get_tag or not attrs:
            return
        # When an attribute name repeats, dict() keeps the last occurrence.
        attr_map = dict(attrs)
        if attr_map.get(self.key_word) != self.key_value:
            return
        # Direct lookup instead of scanning every attribute for one key.
        if self.key_properby in attr_map:
            self.links.append(attr_map[self.key_properby])
def _fetch_html(url):
    """Return the response body of ``url``, always closing the response."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # The urllib2 response is not a context manager; close it explicitly.
        response.close()


def source_url_get(url='http://v.youku.com/v_vpfoldervideolist/page_1_id_55814819_f_5316052_o_1_p_9.html?__rt=1&__ro=vpfoldervideolist'):
    """Print the links flvcd.com returns for each video link on a page.

    The page at ``url`` is scanned for ``<a>`` tags whose ``charset``
    attribute is ``5-1``; each collected ``href`` is submitted to the
    flvcd.com parse page.  From every reply, the ``href`` of each ``<a>``
    tag whose ``onclick`` is ``_alert();return false;`` is printed, one
    per line.

    Args:
        url: Listing page to scan.  Defaults to the page that was
            previously hard-coded, so existing calls behave the same.

    Raises:
        urllib2.URLError: Propagated from ``urlopen`` on network failure.
    """
    listing_parser = MyHTMLParser('a', 'charset', '5-1', 'href')
    listing_parser.feed(_fetch_html(url))
    listing_parser.close()

    for link in listing_parser.links:
        parse_url = "http://www.flvcd.com/parse.php?flag=&format=&kw=%s" % (link)
        # A separate parser per reply; a distinct name avoids rebinding the
        # parser whose ``links`` list is being iterated.
        result_parser = MyHTMLParser('a', 'onclick', '_alert();return false;', 'href')
        result_parser.feed(_fetch_html(parse_url))
        result_parser.close()
        # Parenthesised so the line parses as a print statement or a call.
        print('\n'.join(result_parser.links))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    source_url_get()
# HTMLParser
# Latest recommended article, published 2021-06-03 03:14:37