网页分析实现批量下载

最新推荐文章于 2024-08-18 23:05:27 发布

rehung

最新推荐文章于 2024-08-18 23:05:27 发布

阅读量1.7k

点赞数

分类专栏： python 文章标签： import thread download date class python

python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

from sgmllib import SGMLParser

from string import find, replace, rjust

from threading import Thread

import urllib

# Module metadata.
__author__ = " Chen Peng (peng.ch@hotmail.com) "
__version__ = " $Revision: 1.0 $ "
__date__ = " $Date: 2006/03/03 $ "
__license__ = " Python "

# Names exported by ``from <module> import *``.  Each entry must equal the
# attribute name exactly: a space-padded entry matches no attribute and makes
# the star-import fail with AttributeError.
__all__ = ["Gif_163_Parser"]

class PDownloadThread(Thread):
    """Worker thread that downloads every URL of one listing page.

    ``DictList`` maps a local base name to the URL to fetch.  Each file is
    stored as ``./files/<name>.<ext>``, where ``<ext>`` is the text after the
    last ``.`` of the URL.  The ``./files`` directory is not created here.
    A failed download is appended to ``error.log`` (one record per line) and
    the remaining entries are still processed.
    """

    def __init__(self, DictList, i):
        """Store the work list for a later ``start()``.

        DictList -- dict mapping local base name -> download URL.
        i        -- page number the links came from; used only in error.log.
        """
        Thread.__init__(self)
        self.DictList = DictList
        # Kept as text because it is only ever concatenated into a log line.
        self.pageno = str(i)

    def run(self):
        """Download each entry of ``DictList``, logging failures."""
        # Iterate over a snapshot so that a caller that keeps mutating the
        # dict it handed over cannot break the loop mid-iteration.
        for name, url in list(self.DictList.items()):
            # Extension = everything after the last dot of the URL.
            target = './files/' + name + '.' + url.rsplit('.', 1)[-1]
            print('Download ' + url + ' ......')
            try:
                urllib.urlretrieve(url, target)
            except Exception:
                # Best effort: record the failure and go on with the next
                # file.  ``Exception`` (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                logfile = open('error.log', 'a')
                try:
                    logfile.write(
                        self.pageno + ' ' + url + ' ' + name + '\n')
                finally:
                    logfile.close()
            else:
                # Only report success when the download really succeeded.
                print('Save to file ' + name)

class Gif_163_Parser( SGMLParser ):

"""

任务:下载163彩图

原理:http://mms.163.com/new_web/cm_lv2_pic.jsp?catID=&ord=dDate&page=2&type=1&key=

从1到415页（共6637）分析得到如下路径：“/fgwx/hhsj/1_060302175613_186/128x128.gif”

eg:<script>showPic('22930','1','/fgwx/hhsj/1_060302175613_186/128x128.gif', '1','编号：22930 名字: 因为有你人气:100');</script>

下载路径:http://mmsimg.163.com/new_web/loaditem.jsp/type=1/path=/fgwx/llfj/1_060302175612_995/176x176.gif

"""

def reset( self ):

SGMLParser.reset( self )

self.headURL = ' http://mmsimg.163.com/new_web/loaditem.jsp/type=1/path= '

self.SubURL = []

self.Links = {}

def start_script( self, attrs ):

# self.SubURL.extend( [' %s="%s"' % ( key, value ) for key, value in attrs] )

pass

def end_script( self ):

pass

def handle_data( self, text ):

if find( text, ' showPic ' ) !=- 1 :

self.Links[replace( text.split( ' /n ' )[ 1 ], ' Ãû×Ö: ' , '' )] = self.headURL + replace ( text.split( ' , ' )[ 2 ], ' ' ' , '' );

def Execute( self ):

for i in range( 1 , 415 ):

self.Links.clear

try :

usock = urllib.urlopen( " http://mms.163.com/new_web/cm_lv2_pic.jsp?catID=&ord=dDate&page= " + str(i) + " &type=1&key= " )

self.feed( usock.read() )

usock.close()

TestThread = PDownloadThread( self.Links ,i )

TestThread.start()

self.close()

except IOError:

pass

# print ( ["%s=%sn"% ( k, self.Links[k] ) for k in self.Links.keys()] )

# print self.Links

# Script entry point.  The comparison string must be exactly '__main__';
# with padding spaces the guard is never true and nothing runs.
if __name__ == '__main__':
    testtask = Gif_163_Parser()
    testtask.Execute()

rehung

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录