python抓取网页图片显示不出来_python抓取网页图片

最新推荐文章于 2021-02-04 03:58:17 发布

weixin_39955351

最新推荐文章于 2021-02-04 03:58:17 发布

阅读量262

点赞数

文章标签： python抓取网页图片显示不出来

#-*- coding: utf-8 -*-

# Python 2 standard-library module names (httplib, urlparse).
import urllib
import httplib
import urlparse
import sys
import re


def httpExists(url, max_redirects=5):
    """Return True when a HEAD request for ``url`` is answered with 200.

    The request is sent with ``httplib.HTTPConnection`` to the host (and
    optional port) taken from ``url``.

    Args:
        url: Address to probe.
        max_redirects: How many 302 responses may be followed through their
            ``Location`` header before giving up. This bound keeps a
            redirect loop from recursing without end.

    Returns:
        True for a 200 response (possibly after following 302 redirects).
        False for an invalid port, any other status, or any exception
        raised while talking to the server; the reason is printed.
    """
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # Port specified, try to use it.
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # No port specified, use the default port.
        port = None

    connection = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        # A URL without a path would otherwise yield a malformed request line.
        connection.request("HEAD", path or "/")
        resp = connection.getresponse()
        if resp.status == 200:
            # Normal 'found' status.
            found = True
        elif resp.status == 302 and max_redirects > 0:
            # Recurse on temporary redirect.
            target = urlparse.urljoin(url, resp.getheader('location', ''))
            found = httpExists(target, max_redirects - 1)
        else:
            # Everything else -> not found.
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
            found = False
    except Exception as e:
        # Best-effort probe: report the failure and treat it as "not found".
        print('%s %s %s' % (e.__class__, e, url))
        found = False
    finally:
        if connection is not None:
            connection.close()
    return found


# The definition that follows derives a file name from a URL.

def gGetFileName(url):
    """Return the text after the last ``/`` in ``url``.

    ``None`` is passed through as ``None`` and an empty string as ``""``.
    A URL that ends with ``/`` gives ``""``; a URL without any ``/`` is
    returned unchanged.
    """
    if url is None:
        return None
    return url.rsplit("/", 1)[-1]


# The definition that follows downloads a URL into a file whose name is
# given by the caller.

def gDownloadWithFilename(url, savePath, file):
    """Download ``url`` and write its bytes to ``savePath + file``.

    ``savePath`` and ``file`` are concatenated as-is, so ``savePath`` has to
    end with a path separator already. An IOError raised while fetching or
    writing is reported on stdout and swallowed.
    """
    # Argument validation is skipped for now.
    try:
        opener = urllib.URLopener()
        fp = opener.open(url)
        try:
            data = fp.read()
        finally:
            # Release the connection even when reading fails.
            fp.close()
        print('download file url : %s' % url)
        with open(savePath + file, 'w+b') as out:
            out.write(data)
    except IOError:
        print("download error!" + url)


def gDownload(url, savePath):
    """Download ``url`` into ``savePath`` under the name taken from the URL."""
    gDownloadWithFilename(url, savePath, gGetFileName(url))


def getRexgList(lines, regx, searchRegx):
    """Collect the unique groups captured by ``searchRegx`` on matching lines.

    Only lines on which ``regx`` matches are examined. On such a line every
    match of ``searchRegx`` (not just the first) contributes its captured
    groups. Groups are kept in order of first appearance, without
    duplicates. Both patterns are applied case-insensitively.

    Returns:
        None when ``lines`` is None, otherwise a list (possibly empty).
    """
    if lines is None:
        return None
    found = []
    for line in lines:
        if not re.search(regx, line, re.IGNORECASE):
            continue
        for match in re.finditer(searchRegx, line, re.IGNORECASE):
            for group in match.groups():
                if group not in found:
                    found.append(group)
    return found


def checkLine(lines):
    """Print the groups of the first ``url(...)`` occurrence on each line."""
    for line in lines:
        # The line must be the second argument; the flag goes third.
        matchs = re.search(r'url\((\S+)\)', line, re.IGNORECASE)
        if matchs is not None:
            print(matchs.groups())


def getPageLines(url):
    """Return the lines of the page at ``url``, or None when unavailable.

    None is returned when ``url`` is None, when ``httpExists(url)`` is false,
    or when fetching fails (a message is printed in that last case).
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urllib.urlopen(url)
        try:
            return page.readlines()
        finally:
            # Close the page even when reading fails.
            page.close()
    except Exception:
        # Best-effort fetch: report and signal "no content" to the caller.
        print("getPageLines() error!")
        return None

def getCurrentPageImage(url, savePath):
    """Download the ``src="images..."`` resources referenced by a page.

    The page at ``url`` is fetched with ``getPageLines``. Each captured
    ``src`` value is appended to ``url`` and handed to ``gDownload``.
    Nothing is downloaded when the page cannot be fetched.
    """
    lines = getPageLines(url)
    if lines is None:
        # The page could not be fetched, so there is nothing to scan.
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if regxlists is None:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))
    for jpg in regxlists:
        gDownload(url + jpg, savePath)


def getCSSImages(link, savePath, url):
    """Download the ``url(...)`` resources referenced by a stylesheet.

    The stylesheet at ``link`` is fetched with ``getPageLines``. Each
    captured value is appended to ``url`` and handed to ``gDownload``.
    Nothing is downloaded when the stylesheet cannot be fetched.
    """
    lines = getPageLines(link)
    if lines is None:
        # The stylesheet could not be fetched, so there is nothing to scan.
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'url\((\S+)\)', r'url\((\S+)\)')
    if regxlists is None:
        return
    print('getCSSImages() images.length %d' % len(regxlists))
    for jpg in regxlists:
        # CSS permits url("...") and url('...'); the quotes are not part of
        # the path and would corrupt the download address.
        gDownload(url + jpg.strip('\'"'), savePath)


# The definition that follows collects the .htm/.html links found on a page
# and returns them as a list.

def gGetHtmlLink(url):
    """Return the ``.htm``/``.html`` links found on the page at ``url``.

    Each captured ``href`` value is appended to ``url``. New links are
    printed as they are found, and duplicates are dropped. An empty list is
    returned when the page cannot be fetched.
    """
    # Argument validation is skipped for now.
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.htm"""
    # getRexgList yields None when the page could not be fetched.
    for link in getRexgList(lines, regx, r'href="(\S+)"') or []:
        link = url + link
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList


# The definition that follows collects the .css links found on a page and
# returns them as a list.

def gGetCSSLink(url):
    """Return the ``.css`` links found on the page at ``url``.

    Each captured ``href`` value is appended to ``url``, and duplicates are
    dropped. An empty list is returned when the page cannot be fetched.
    """
    # Argument validation is skipped for now.
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.css"""
    # getRexgList yields None when the page could not be fetched.
    for link in getRexgList(lines, regx, r'href="(\S+)"') or []:
        link = url + link
        if link not in rtnList:
            rtnList.append(link)
    return rtnList


def getPageImage(url, savePath):
    """Download the images referenced by the stylesheets linked from ``url``.

    Every stylesheet returned by ``gGetCSSLink(url)`` is passed to
    ``getCSSImages`` together with ``savePath`` and the page ``url``.
    """
    # Disabled in the original script: getCurrentPageImage(url, savePath)
    # for the page itself, and a loop over gGetHtmlLink(url) that called
    # getCurrentPageImage(link, savePath) for every linked HTML page.
    links = gGetCSSLink(url)
    for link in links:
        print('get images on link: %s' % link)
        getCSSImages(link, savePath, url)


if __name__ == '__main__':
    url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    savePath = 'd:/tmp/'

    print('download pic from [' + url + ']')
    print('save to [' + savePath + '] ...')
    getPageImage(url, savePath)
    print("download finished")

weixin_39955351

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python抓取网页图片显示不出来_python抓取网页图片

#-*- coding: utf-8 -*-importurllib, httplib, urlparseimportsysimportredefhttpExists(url):host, path= urlparse.urlsplit(url)[1:3]if ':' inhost:#port specified, try to use ithost, port = host.split(':',...
复制链接

扫一扫