1. 用 urllib2/sgmllib 包,将目标网页的所有 URL 列出。
import urllib2
from sgmllib import SGMLParser
class URLLister(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.urls = []
def start_a(self, attrs):
href = [v for k, v in attrs if k == ' href ' ]
if href:
self.urls.extend(href)
# Fetch the page and print the href of every <a> tag found in it.
# The parser is created up front so that ``parser.urls`` exists (empty)
# even when the response status is not 200.
parser = URLLister()
f = urllib2.urlopen("http://icode.csdn.net")
try:
    if f.code == 200:
        parser.feed(f.read())
        # Flush any data still buffered inside the parser.
        parser.close()
finally:
    # Always release the connection, whatever the status code was and
    # even if parsing raised.
    f.close()
for url in parser.urls:
    # Single-argument parenthesised form prints the same under Python 2.
    print(url)
from sgmllib import SGMLParser
class URLLister(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.urls = []
def start_a(self, attrs):
href = [v for k, v in attrs if k == ' href ' ]
if href:
self.urls.extend(href)
# Fetch the page and print the href of every <a> tag found in it.
# The parser is created up front so that ``parser.urls`` exists (empty)
# even when the response status is not 200.
parser = URLLister()
f = urllib2.urlopen("http://icode.csdn.net")
try:
    if f.code == 200:
        parser.feed(f.read())
        # Flush any data still buffered inside the parser.
        parser.close()
finally:
    # Always release the connection, whatever the status code was and
    # even if parsing raised.
    f.close()
for url in parser.urls:
    # Single-argument parenthesised form prints the same under Python 2.
    print(url)
2. 使用BeautifulSoup分析数据