python抓取网页图片显示不出来_python抓取网页图片

#-*- coding: utf-8 -*-

import re
import sys
import urllib

try:  # Python 2 module names, which the rest of this file refers to
    import httplib
    import urlparse
except ImportError:  # Python 3 renamed them; bind the old names so callers keep working
    import http.client as httplib
    import urllib.parse as urlparse


def httpExists(url, _redirects_left=5):
    """Return True if a HEAD request for ``url`` ends in a 200 response.

    Redirect responses (301, 302, 303, 307, 308) are followed through their
    ``Location`` header, at most ``_redirects_left`` times, so a redirect
    loop terminates instead of recursing forever.  Any other status, an
    invalid port in the URL, or any exception raised while talking to the
    server is reported on stdout and yields False.

    Args:
        url: Absolute http:// or https:// URL to probe.
        _redirects_left: Internal redirect budget; callers should not pass it.

    Returns:
        True when the resource answered 200, otherwise False.
    """
    parts = urlparse.urlsplit(url)
    scheme, host = parts[0], parts[1]
    # An empty path is not a valid request target; ask for the root instead.
    path = parts[2] or '/'
    if parts[3]:
        # Keep the query string, it is part of what identifies the resource.
        path = path + '?' + parts[3]

    port = None  # None lets the connection class pick the scheme default
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False

    if scheme == 'https':
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    connection = None
    try:
        connection = connection_class(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status in (301, 302, 303, 307, 308) and _redirects_left > 0:
            # recurse on redirect, resolving a relative Location against url
            target = urlparse.urljoin(url, resp.getheader('location', ''))
            found = httpExists(target, _redirects_left - 1)
        else:
            # everything else -> not found
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
            found = False
    except Exception as e:
        # Best-effort probe: any failure simply means "not reachable".
        print('%s %s %s' % (e.__class__, e, url))
        found = False
    finally:
        if connection is not None:
            connection.close()
    return found


# Derive a file name from a URL.

def gGetFileName(url):
    """Return the text after the last '/' in ``url``.

    Args:
        url: URL string, or None.

    Returns:
        None when ``url`` is None, "" when it is empty, otherwise the final
        '/'-separated segment (which is "" when ``url`` ends with '/').
    """
    if url is None:
        return None
    if url == "":
        return ""
    return url.split("/")[-1]


# Download the file at a URL and save it under the given file name.

def gDownloadWithFilename(url, savePath, file):
    """Fetch ``url`` and write its bytes to ``savePath + file``.

    Parameter validation is intentionally skipped for now.  ``savePath`` is
    concatenated with ``file`` as-is, so it must already end with a path
    separator.  Failures are reported on stdout instead of being raised.

    Args:
        url: Absolute URL of the resource to download.
        savePath: Destination directory, including the trailing separator.
        file: Name of the file to create inside ``savePath``.
    """
    try:
        from urllib.request import urlopen as open_url  # Python 3
    except ImportError:
        # Python 2: URLopener raises IOError on HTTP errors, as the original did.
        open_url = urllib.URLopener().open

    try:
        fp = open_url(url)
        try:
            data = fp.read()
        finally:
            fp.close()
        print('download file url : %s' % url)
        # Do not rebind the ``file`` parameter to the handle.
        with open(savePath + file, 'wb') as out:
            out.write(data)
    except (IOError, ValueError):
        # ValueError is what Python 3 raises for a URL without a usable scheme.
        print("download error!" + url)


def gDownload(url, savePath):
    """Download ``url`` into ``savePath`` under the name gGetFileName derives."""
    gDownloadWithFilename(url, savePath, gGetFileName(url))


def getRexgList(lines, regx, searchRegx):
    """Collect the distinct regex captures found in the matching lines.

    A line is considered only if ``regx`` matches it somewhere.  Every match
    of ``searchRegx`` on such a line then contributes its capture groups, so
    a line holding several references yields all of them.  Both patterns are
    applied case-insensitively.

    Args:
        lines: Iterable of text lines, or None.
        regx: Pattern deciding whether a line is of interest.
        searchRegx: Pattern whose capture groups are collected.

    Returns:
        None when ``lines`` is None, otherwise the captured strings in
        first-seen order without duplicates.
    """
    if lines is None:
        return None
    line_filter = re.compile(regx, re.IGNORECASE)
    extractor = re.compile(searchRegx, re.IGNORECASE)
    found = []
    for line in lines:
        if not line_filter.search(line):
            continue
        for match in extractor.finditer(line):
            for group in match.groups():
                if group not in found:
                    found.append(group)
    return found


def checkLine(lines):
    """Print the capture groups of the first ``url(...)`` found on each line."""
    for line in lines:
        # The subject string must be the second argument; flags come third.
        matchs = re.search(r'url\((\S+)\)', line, re.IGNORECASE)
        if matchs is not None:
            print(matchs.groups())


def getPageLines(url):
    """Return the lines of the page at ``url`` as text.

    Args:
        url: Absolute URL of the page, or None.

    Returns:
        A list of text lines, or None when ``url`` is None, when httpExists
        reports the URL as unreachable, or when reading the page fails.
    """
    if url is None:
        return None
    if not httpExists(url):
        return None

    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        urlopen = urllib.urlopen  # Python 2

    try:
        page = urlopen(url)
        try:
            raw_lines = page.readlines()
        finally:
            page.close()
    except Exception:
        print("getPageLines() error!")
        return None
    # Python 3 yields bytes; decode them so the str regexes in this file can
    # search the lines.  Lines that are already str are passed through.
    return [ln if isinstance(ln, str) else ln.decode('utf-8', 'replace')
            for ln in raw_lines]

def getCurrentPageImage(url, savePath):
    """Download the images referenced by ``src="images..."`` on the page at ``url``.

    Args:
        url: Absolute URL of the page; image references are resolved against it.
        savePath: Destination directory, including the trailing separator.
    """
    lines = getPageLines(url)
    if lines is None:
        # The page could not be fetched; getPageLines already reported why.
        return
    print('lines.length %d' % len(lines))

    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if regxlists is None:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))

    for jpg in regxlists:
        # urljoin copes with absolute and root-relative references, which
        # plain string concatenation would turn into a malformed URL.
        gDownload(urlparse.urljoin(url, jpg), savePath)


def getCSSImages(link, savePath, url):
    """Download the images referenced by ``url(...)`` in the stylesheet at ``link``.

    Args:
        link: Absolute URL of the CSS file.
        savePath: Destination directory, including the trailing separator.
        url: URL of the page that linked the stylesheet.  Kept for interface
            compatibility; references are resolved against ``link`` because
            relative URLs in CSS are relative to the stylesheet itself.
    """
    lines = getPageLines(link)
    if lines is None:
        # The stylesheet could not be fetched; nothing to scan.
        return
    print('lines.length %d' % len(lines))

    regxlists = getRexgList(lines, r'url\((\S+)\)', r'url\((\S+)\)')
    if regxlists is None:
        return
    print('getCSSImages() images.length %d' % len(regxlists))

    for jpg in regxlists:
        # CSS allows url('x') and url("x"); the quotes are not part of the URL.
        reference = jpg.strip('\'"')
        gDownload(urlparse.urljoin(link, reference), savePath)


# Collect the .htm/.html links found on the page at a URL and return them as a list.

def gGetHtmlLink(url):
    """Return the absolute URLs of the .htm/.html links on the page at ``url``.

    Parameter validation is intentionally skipped for now.  Each new link is
    also printed to stdout.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        A list of distinct absolute URLs in first-seen order; empty when the
        page could not be fetched or holds no such links.
    """
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.htm"""
    # getRexgList returns None when the page could not be fetched.
    for link in getRexgList(lines, regx, r'href="(\S+)"') or []:
        # Resolve relative, root-relative and absolute hrefs alike.
        link = urlparse.urljoin(url, link)
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList


# Collect the .css links found on the page at a URL and return them as a list.

def gGetCSSLink(url):
    """Return the absolute URLs of the .css links on the page at ``url``.

    Parameter validation is intentionally skipped for now.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        A list of distinct absolute URLs in first-seen order; empty when the
        page could not be fetched or links no stylesheet.
    """
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.css"""
    # getRexgList returns None when the page could not be fetched.
    for link in getRexgList(lines, regx, r'href="(\S+)"') or []:
        # Resolve relative, root-relative and absolute hrefs alike.
        link = urlparse.urljoin(url, link)
        if link not in rtnList:
            rtnList.append(link)
    return rtnList


def getPageImage(url, savePath):
    """Download the images referenced by the stylesheets linked from ``url``.

    Args:
        url: Absolute URL of the page whose stylesheets are scanned.
        savePath: Destination directory, including the trailing separator.
    """
    # NOTE: the original author left two steps disabled here and they stay
    # disabled: a direct getCurrentPageImage(url, savePath) call, and a loop
    # over gGetHtmlLink(url) that ran getCurrentPageImage on every linked
    # html page.  Only the CSS pass below is active.
    links = gGetCSSLink(url)
    for link in links:
        print('get images on link: %s' % link)
        getCSSImages(link, savePath, url)


if __name__ == '__main__':
    # Optional command-line overrides: <script> [url] [savePath].
    # Without arguments the original hard-coded values are used.
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    if len(sys.argv) > 2:
        savePath = sys.argv[2]
    else:
        savePath = 'd:/tmp/'
    # The download helpers concatenate savePath and the file name directly,
    # so make sure the directory ends with a separator.
    if not savePath.endswith(('/', '\\')):
        savePath = savePath + '/'

    print('download pic from [' + url + ']')
    print('save to [' + savePath + '] ...')
    getPageImage(url, savePath)
    print("download finished")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值