from sgmllib import SGMLParser from string import find, replace, rjust from threading import Thread import urllib __author__ = " Chen Peng (peng.ch@hotmail.com) " __version__ = " $Revision: 1.0 $ " __date__ = " $Date: 2006/03/03 $ " __copyright__ = " Copyright (c) 2006 Chen Peng " __license__ = " Python " __all__ = [ " Gif_163_Parser " ] class PDownloadThread( Thread ): """ Download the files in the dict and save them to local files with the given name """ def __init__ ( self, DictList,i ): Thread. __init__ ( self ) self.DictList = DictList self.pageno = str(i) def run( self ): for k in self.DictList.keys(): try : print ' Download ' + self.DictList[k] + ' ...... ' uFile = urllib.urlretrieve( self.DictList[k], ' ./files/ ' + k + ' . ' + self.DictList[k].split( ' . ' )[self.DictList[k].split( ' . ' ). __len__ () - 1 ]) except : logfile = open( ' error.log ' , ' a ' ) logfile.write(self.pageno + ' ' + self.DictList[k] + ' ' + k + ' ' ) logfile.close() print ' Save to file ' + k class Gif_163_Parser( SGMLParser ): """ 任务:下载163彩图 原理:http://mms.163.com/new_web/cm_lv2_pic.jsp?catID=&ord=dDate&page=2&type=1&key= 从1到415页(共6637)分析得到如下路径:“/fgwx/hhsj/1_060302175613_186/128x128.gif” eg:<script>showPic('22930','1','/fgwx/hhsj/1_060302175613_186/128x128.gif', '1','编号:22930 名字: 因为有你 人气:100');</script> 下载路径:http://mmsimg.163.com/new_web/loaditem.jsp/type=1/path=/fgwx/llfj/1_060302175612_995/176x176.gif """ def reset( self ): SGMLParser.reset( self ) self.headURL = ' http://mmsimg.163.com/new_web/loaditem.jsp/type=1/path= ' self.SubURL = [] self.Links = {} def start_script( self, attrs ): # self.SubURL.extend( [' %s="%s"' % ( key, value ) for key, value in attrs] ) pass def end_script( self ): pass def handle_data( self, text ): if find( text, ' showPic ' ) !=- 1 : self.Links[replace( text.split( ' /n ' )[ 1 ], ' Ãû×Ö: ' , '' )] = self.headURL + replace ( text.split( ' , ' )[ 2 ], ' ' ' , '' ); def Execute( self ): for i in range( 1 , 415 ): self.Links.clear try : usock = urllib.urlopen( " http://mms.163.com/new_web/cm_lv2_pic.jsp?catID=&ord=dDate&page= " + str(i) + " &type=1&key= " ) self.feed( usock.read() ) usock.close() TestThread = PDownloadThread( self.Links ,i ) TestThread.start() self.close() except IOError: pass # print ( ["%s=%sn"% ( k, self.Links[k] ) for k in self.Links.keys()] ) # print self.Links if __name__ == ' __main__ ' : # Gif_163_Parser().Execute() testtask = Gif_163_Parser() testtask.Execute()