主要是用了SGMLParser和urllib模块
#!/usr/bin/python
# getimg.py
import sys,os
from sgmllib import SGMLParser
# Encoding used to re-encode scraped text for local output.
# On Python 2, sys.getfilesystemencoding() may return None, which
# str.encode() rejects, so fall back to the interpreter default encoding.
# NOTE: this name shadows the builtin ``type``; it is kept because
# URLLister.handle_data reads it under this name.
type = sys.getfilesystemencoding() or sys.getdefaultencoding()
class URLLister(SGMLParser):
    """Collect post titles and image URLs from an HTML page.

    After ``feed()`` / ``close()``:

    * ``titles`` holds every text chunk seen inside a
      ``<div class="posttitle">``, decoded as UTF-8 and re-encoded with the
      module-level ``type`` encoding.
    * ``imgs`` holds the ``src`` value of every ``<img>`` tag, in the order
      the tags were parsed.
    """

    def reset(self):
        """Reset the parser and clear everything collected so far."""
        SGMLParser.reset(self)
        self.is_Contant = False  # True while inside a "posttitle" <div>
        self.titles = []
        self.imgs = []

    def start_div(self, attrs):
        """Start capturing text when a ``<div class="posttitle">`` opens.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        classes = [value for name, value in attrs if name == 'class']
        # Only the first ``class`` attribute is inspected.
        if classes and classes[0] == 'posttitle':
            self.is_Contant = True

    def end_div(self):
        """Stop capturing text when a ``</div>`` is seen."""
        self.is_Contant = False

    def start_img(self, attrs):
        """Record the ``src`` attribute value(s) of an ``<img>`` tag."""
        self.imgs.extend(value for name, value in attrs if name == 'src')

    def handle_data(self, text):
        """Store ``text`` as a title if it appears inside a post-title div.

        The text is decoded as UTF-8 and re-encoded with the module-level
        ``type`` encoding before being appended to ``titles``.
        """
        if not self.is_Contant:
            return
        self.titles.append(text.decode('UTF-8').encode(type))
if __name__ == "__main__":
    # Script entry point (Python 2): fetch the cnblogs front page, print the
    # post titles and save them to result.txt, then download every image
    # referenced by the page.
    import urllib
    import urlparse

    u = 'http://www.cnblogs.com'
    result_path = 'result.txt'
    img_dir = 'd:/tmp/'  # must already exist; urlretrieve does not create it

    usock = urllib.urlopen(u)
    try:
        page = usock.read()
    finally:
        # Release the connection even if reading fails.
        usock.close()

    parser = URLLister()
    parser.feed(page)
    parser.close()

    # Binary mode writes the explicit '\r\n' verbatim; text mode on Windows
    # would expand it to '\r\r\n'.
    f = open(result_path, 'wb')
    try:
        for title in parser.titles:
            print(title)
            f.write(title + '\r\n')
    finally:
        f.close()

    for img in parser.imgs:
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # (with or without a leading slash) against the page URL.
        img_url = urlparse.urljoin(u, img)
        # Name the local file after the last path segment, without any
        # query string or fragment.
        file_name = urlparse.urlsplit(img_url).path.split('/')[-1]
        if not file_name:
            continue  # URL has no usable file name (e.g. ends with '/')
        urllib.urlretrieve(img_url, img_dir + file_name)
# getimg.py
import sys,os
from sgmllib import SGMLParser
# Encoding used to re-encode scraped text for local output.
# On Python 2, sys.getfilesystemencoding() may return None, which
# str.encode() rejects, so fall back to the interpreter default encoding.
# NOTE: this name shadows the builtin ``type``; it is kept because
# URLLister.handle_data reads it under this name.
type = sys.getfilesystemencoding() or sys.getdefaultencoding()
class URLLister(SGMLParser):
    """Collect post titles and image URLs from an HTML page.

    After ``feed()`` / ``close()``:

    * ``titles`` holds every text chunk seen inside a
      ``<div class="posttitle">``, decoded as UTF-8 and re-encoded with the
      module-level ``type`` encoding.
    * ``imgs`` holds the ``src`` value of every ``<img>`` tag, in the order
      the tags were parsed.
    """

    def reset(self):
        """Reset the parser and clear everything collected so far."""
        SGMLParser.reset(self)
        self.is_Contant = False  # True while inside a "posttitle" <div>
        self.titles = []
        self.imgs = []

    def start_div(self, attrs):
        """Start capturing text when a ``<div class="posttitle">`` opens.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        classes = [value for name, value in attrs if name == 'class']
        # Only the first ``class`` attribute is inspected.
        if classes and classes[0] == 'posttitle':
            self.is_Contant = True

    def end_div(self):
        """Stop capturing text when a ``</div>`` is seen."""
        self.is_Contant = False

    def start_img(self, attrs):
        """Record the ``src`` attribute value(s) of an ``<img>`` tag."""
        self.imgs.extend(value for name, value in attrs if name == 'src')

    def handle_data(self, text):
        """Store ``text`` as a title if it appears inside a post-title div.

        The text is decoded as UTF-8 and re-encoded with the module-level
        ``type`` encoding before being appended to ``titles``.
        """
        if not self.is_Contant:
            return
        self.titles.append(text.decode('UTF-8').encode(type))
if __name__ == "__main__":
    # Script entry point (Python 2): fetch the cnblogs front page, print the
    # post titles and save them to result.txt, then download every image
    # referenced by the page.
    import urllib
    import urlparse

    u = 'http://www.cnblogs.com'
    result_path = 'result.txt'
    img_dir = 'd:/tmp/'  # must already exist; urlretrieve does not create it

    usock = urllib.urlopen(u)
    try:
        page = usock.read()
    finally:
        # Release the connection even if reading fails.
        usock.close()

    parser = URLLister()
    parser.feed(page)
    parser.close()

    # Binary mode writes the explicit '\r\n' verbatim; text mode on Windows
    # would expand it to '\r\r\n'.
    f = open(result_path, 'wb')
    try:
        for title in parser.titles:
            print(title)
            f.write(title + '\r\n')
    finally:
        f.close()

    for img in parser.imgs:
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # (with or without a leading slash) against the page URL.
        img_url = urlparse.urljoin(u, img)
        # Name the local file after the last path segment, without any
        # query string or fragment.
        file_name = urlparse.urlsplit(img_url).path.split('/')[-1]
        if not file_name:
            continue  # URL has no usable file name (e.g. ends with '/')
        urllib.urlretrieve(img_url, img_dir + file_name)
上面的代码将主题保存到了当前目录的 result.txt 文件里，
所有的图片则保存到了 d:/tmp/ 目录（该目录需要事先存在）。