用python实现的一个抓取图片的爬虫

最新推荐文章于 2023-10-06 22:07:56 发布

1994hb

最新推荐文章于 2023-10-06 22:07:56 发布

阅读量658

点赞数

分类专栏： python 文章标签： python 正则表达式爬虫

本文链接：https://blog.csdn.net/u012841873/article/details/43117941

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

最近学习 Python 时学到了正则表达式，于是心血来潮用 Python 写了一个抓取网站图片的简单小爬虫。在此贴上代码，记录自己学习 Python 的过程，同时也希望大家多提改进意见：目前这个爬虫只能抓取以 http:// 开头的图片地址，而像百度图库中的图片地址大多数都不是以 http:// 开头的，希望大家能给出建议。

废话不多说，上代码：

version：1.0

#!/usr/bin/python

import re
import urllib


def getHtml(url=None):
    """Fetch a page and return its content as text.

    Args:
        url: Address of the page to fetch. When omitted (the original
            calling convention) the user is prompted with ``Enter url:``.

    Returns:
        The page body as a native ``str``. On Python 3 the downloaded bytes
        are decoded as UTF-8, with undecodable bytes replaced.
    """
    # Function-scope imports keep this block usable on Python 2 and 3
    # without touching the file-level ``import urllib``.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    if url is None:
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        url = read_line('Enter url:')

    page = urlopen(url)
    try:
        html = page.read()
    finally:
        # The original never closed the response object.
        page.close()

    if not isinstance(html, str):
        # Python 3 hands back bytes; decode so that str regex patterns can be
        # applied to the result. 'replace' keeps pages in other encodings
        # from raising instead of returning.
        html = html.decode('utf-8', 'replace')
    return html

def getImgUrl(html, imgForm=None):
    """Return every ``http://`` image URL found in ``html``.

    Args:
        html: Page source as text.
        imgForm: Integer code selecting the image formats to look for:
            1 = jpg, 2 = png, 4 = gif, 3 = jpg+png, 5 = jpg+gif,
            6 = png+gif. Any other value (including 7) selects all three.
            When omitted (the original calling convention) the menu is
            printed and the code is read from the user.

    Returns:
        A list of matched URL strings in document order. Matching is
        case-insensitive, so ``.JPG`` is found as well as ``.jpg``.

    Raises:
        ValueError: If the value typed at the prompt is not an integer.
    """
    if imgForm is None:
        # Same menu text as before, spelled with explicit escapes so the
        # embedded tabs survive editors that rewrite whitespace.
        print('\n'
              '\t         1 represents .jpg\n'
              '\t         2 represents .png\n'
              '\t\t 4 represents .gif\n'
              '                 3 represents .jpg+.png\n'
              '\t\t 5 represents .jpg+.gif\n'
              '\t\t 6 represents .png+.gif\n'
              '\t\t 7 represents .jpg+.png+.gif\n'
              '\t\t ')
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        imgForm = int(read_line('Enter the Form of Img:'))

    all_formats = ('jpg', 'png', 'gif')
    formats_by_code = {
        1: ('jpg',),
        2: ('png',),
        3: ('jpg', 'png'),
        4: ('gif',),
        5: ('jpg', 'gif'),
        6: ('png', 'gif'),
    }
    # Unknown codes fall back to "everything", as the original else-branch
    # did. That branch searched for (png|jpg|jpg), so gif was never returned.
    extensions = formats_by_code.get(imgForm, all_formats)

    # A non-capturing group makes findall() return the whole URL directly.
    # [^\s] keeps a match from running across whitespace into unrelated text.
    pattern = re.compile(
        r'http://[^\s]+?\.(?:%s)' % '|'.join(extensions), re.I)
    return pattern.findall(html)

def downloadImg(url, dest_dir=''):
    """Download each image URL to a numbered local file.

    Files are named ``0.jpg``, ``1.jpg``, ... with a separate counter for
    each of jpg, png and gif. A URL that contains none of those extensions
    is reported with a message and skipped.

    Args:
        url: Iterable of image URL strings (despite the singular name).
        dest_dir: Directory the files are written to. The default of ``''``
            writes into the current working directory, as before.

    Returns:
        None.
    """
    import os
    # Function-scope imports keep this block usable on Python 2 and 3
    # without touching the file-level ``import urllib``.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Checked in this order, so a URL containing several extensions is
    # classified the same way the original if/elif chain classified it.
    # re.I fixes the mismatch where upper-case extensions were reported as
    # "not picture captured".
    matchers = [(ext, re.compile(r'.+\.' + ext, re.I))
                for ext in ('jpg', 'png', 'gif')]
    counters = {'jpg': 0, 'png': 0, 'gif': 0}

    for imgurl in url:
        for ext, matcher in matchers:
            if matcher.search(imgurl):
                filename = "%s.%s" % (counters[ext], ext)
                urlretrieve(imgurl, os.path.join(dest_dir, filename))
                counters[ext] += 1
                break
        else:
            print("not picture captured")
	
# Script entry point: fetch a page, extract its image URLs, download them,
# then print the list of URLs that were found. The guard keeps the prompt and
# the downloads from running when this file is merely imported.
if __name__ == '__main__':
    html = getHtml()
    imgurl = getImgUrl(html)
    downloadImg(imgurl)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(imgurl)