# HTMLParser

#coding:utf-8

import urllib2
from HTMLParser import HTMLParser
 
class MyHTMLParser(HTMLParser):
    """Collect one attribute's value from start tags that pass a filter.

    While HTML is fed to the parser, every start tag named ``get_tag`` whose
    ``key_word`` attribute equals ``key_value`` contributes the value of its
    ``key_properby`` attribute to ``self.links``, in the order the tags are
    encountered.
    """

    def __init__(self, get_tag, key_word, key_value, key_properby):
        """Store the filter criteria and start with an empty result list.

        Args:
            get_tag: Tag name a start tag must have to be inspected.
            key_word: Name of the attribute used as the filter.
            key_value: Value ``key_word`` must have for the tag to match.
            key_properby: Name of the attribute whose value is collected.
        """
        HTMLParser.__init__(self)
        self.get_tag = get_tag
        self.key_word = key_word
        self.key_value = key_value
        self.key_properby = key_properby
        # Collected attribute values, appended by handle_starttag.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the wanted attribute of a matching start tag.

        Args:
            tag: Name of the start tag being reported.
            attrs: Sequence of ``(name, value)`` pairs for that tag.
        """
        if tag != self.get_tag or not attrs:
            return

        # If an attribute name is repeated, the last occurrence wins.
        attributes = dict(attrs)
        if attributes.get(self.key_word) != self.key_value:
            return

        # A single lookup replaces scanning every attribute; the filter
        # check above does not depend on which attribute is being visited.
        if self.key_properby in attributes:
            self.links.append(attributes[self.key_properby])

def _read_url(url):
    """Return the response body for ``url`` and always close the response."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def source_url_get():
    """Fetch a hard-coded youku.com list page and print links found via flvcd.com.

    The list page is scanned for ``<a>`` tags whose ``charset`` attribute is
    ``5-1``; each such tag's ``href`` is passed as the ``kw`` query value to
    flvcd.com's ``parse.php``.  From every response, the ``href`` values of
    ``<a>`` tags whose ``onclick`` is ``_alert();return false;`` are printed,
    one per line.

    Raises:
        Whatever ``urllib2.urlopen`` or ``read`` raises on a failed request;
        no errors are handled here.
    """
    list_page_url = 'http://v.youku.com/v_vpfoldervideolist/page_1_id_55814819_f_5316052_o_1_p_9.html?__rt=1&__ro=vpfoldervideolist'

    list_parser = MyHTMLParser('a', 'charset', '5-1', 'href')
    list_parser.feed(_read_url(list_page_url))
    list_parser.close()

    for link in list_parser.links:
        # An href with no value would otherwise be sent as "kw=None" / "kw=".
        if not link:
            continue
        # NOTE(review): link is interpolated without percent-encoding, as in
        # the original -- confirm whether parse.php expects an encoded kw.
        lookup_url = "http://www.flvcd.com/parse.php?flag=&format=&kw=%s" % (link)
        result_parser = MyHTMLParser('a', 'onclick', '_alert();return false;', 'href')
        result_parser.feed(_read_url(lookup_url))
        result_parser.close()
        # Single parenthesised argument prints identically on Python 2 and 3.
        print('\n'.join(result_parser.links))
    
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    source_url_get()
    
    
    


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值