#coding=utf-8
from HTMLParser import HTMLParser
import urllib
import sys
import os
class MyHTMLParser(HTMLParser):
    """HTML parser that collects the ``src`` attribute of every ``<img>`` tag.

    Feed markup with ``feed()``; the collected values are then available in
    ``links``, in the order the tags were encountered.
    """

    def __init__(self):
        # Call the base initializer explicitly instead of via super(): the
        # Python 2 HTMLParser base is an old-style class, which super()
        # does not accept.
        HTMLParser.__init__(self)
        # ``src`` values of the <img> tags seen so far.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` of an ``<img>`` start tag.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` pairs for the tag; ``value`` is
                ``None`` when the attribute was written without a value.
        """
        if tag != "img":
            return
        for name, value in attrs:
            # Skip a bare (``<img src>``) or empty ``src``: it is not a
            # usable link, and a None entry would break consumers that
            # treat every element of ``links`` as a string.
            if name == "src" and value:
                self.links.append(value)
if __name__ == "__main__":
    # Fetch the raw page bytes. The response is closed explicitly so the
    # underlying connection is released even if read() raises.
    response = urllib.urlopen('http://www.baidu.com')
    try:
        html = response.read()
    finally:
        response.close()

    hp = MyHTMLParser()
    # The bytes are decoded as gb2312 before parsing; the original note says
    # NetEase pages use that encoding and that feeding the undecoded bytes
    # fails. Undecodable bytes are dropped ('ignore') rather than raising.
    # NOTE(review): the URL above is Baidu, not NetEase -- confirm gb2312 is
    # the intended codec for this page.
    hp.feed(html.decode('gb2312', 'ignore'))
    hp.close()

    # All <img> src values found on the page.
    imgs = hp.links
    # Debug aid: print imgs
'''
获
from HTMLParser import HTMLParser
import urllib
import sys
import os
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.links = []
def handle_starttag(self, tag, attrs):
if tag == "img":
for (variable, value) in attrs:
if (variable == "src"):
self.links.append(value)
if __name__ == "__main__":
html = urllib.urlopen('http://www.baidu.com').read()
hp = MyHTMLParser()
# hp.feed(html)
# 网易的网页编码格式为gb2312需要修改成这样否则出错
hp.feed(html.decode('gb2312','ignore'))
hp.close()
imgs = hp.links
# print imgs
'''
获