Site link: http://www.plantarium.ru/page/samples/taxon/41302.html
On this site you have to click down level by level to reach the images, and the pages load slowly and often error out, so I decided to download the images locally to make them easier to browse. Hence this little crawler.
# -*- coding: utf-8 -*-
import re,os,requests,urllib2,chardet,time,sys  # the requests and chardet modules need to be installed separately
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
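# Explanatory note on the four lines above: reload(sys) restores the setdefaultencoding()
# attribute that Python 2 removes at startup, so the implicit str/unicode conversions used
# later in this script assume UTF-8 instead of ASCII. Because reload(sys) can also reset
# sys.stdin/stdout/stderr (which breaks output in some IDE consoles), the three streams are
# saved before the reload and restored right after it. This hack applies to Python 2 only.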
# Fetch a page and return its raw source only
def only_content(url):
    headers = {'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    request = requests.get(url,timeout=20,headers = headers)
    content = request.text
    return content
# Fetch the page source and pull out the wanted parts with a regex, retrying on failure
def get_content(url,reg):
    i=0
    p=True
    want=[]  # so an empty list is returned if every attempt fails
    headers = {'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p and i<=10:
        try:
            request = requests.get(url,timeout=20,headers = headers)
            content = request.text
            want=reg.findall(content)
            if want==[]:
                i+=1
                print 'get none,I will try again'
                # time.sleep(1)
            else:
                print 'get success!'
                p=False
        except:
            i+=1
            print 'get wrong,please wait 2 seconds!'
            time.sleep(2)
    return want
# Fetch the page source with explicit transcoding, to avoid garbled text when the
# occasional page is not served as UTF-8
def for_change(url,reg):
    p=True
    headers={'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p:
        try:
            request=urllib2.Request(url,headers=headers)
            req=urllib2.urlopen(request,timeout=20)
            res=req.read()
            enc=chardet.detect(res)['encoding']
            print 'this page uses '+enc+' encoding'
            content=res.decode(enc).encode('utf-8')
            want=reg.findall(content)
            print 'get success!'
            p=False
        except:
            print 'get wrong,please wait 10 seconds!'
            time.sleep(10)
    return want
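# Note: for_change() is defined here but never called by the main program below; it has the
# same signature as get_content() and could stand in for it on pages that are not UTF-8.
# A rough usage sketch (the URL and regex are simply the ones already used in this script):
#   reg = re.compile(r'href="(/page/samples/taxon.+?.html)', re.I)
#   links = for_change('http://www.plantarium.ru/page/samples/taxon/41302.html', reg)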
# Create a folder if it does not exist yet
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
# Download and save one image, retrying on failure
def download_image(imageurl,imagename):
    i=0
    p=True
    while p and i<=10:
        try:
            data=requests.get(imageurl,timeout=20).content
            with open(imagename,'wb') as f:
                f.write(data)
            p=False
        except:
            i+=1
            print 'save picture wrong,please wait 2 seconds'
            time.sleep(2)
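# A possible refinement (not part of the original script): stream the response body in
# chunks so large images are not held in memory all at once:
#   r = requests.get(imageurl, timeout=20, stream=True)
#   with open(imagename, 'wb') as f:
#       for chunk in r.iter_content(8192):
#           f.write(chunk)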
# Main program
if __name__ == '__main__':
    path='D:\\Russian_pictures\\'
    create_folder(path)
    n=0  # image counter
    order=[]  # order-level URLs
    family=[]  # family-level URLs
    genus=[]  # genus-level URLs
    # Extract the monocot and dicot (class-level) pages
    url="http://www.plantarium.ru"
    url1=url+'/page/samples/taxon/41302.html'
    a1=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)  # the () is a plain capturing group, so only the text inside it is returned
    u1=get_content(url1,a1)
    print u1
    # Extract the orders
    for u11 in u1:
        url2=url+u11
        a2=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u2=get_content(url2,a2)
        u2.pop(0)  # drop the first link, which points back to the parent catalogue page
        order.extend(u2)
    print 'It has '+str(len(order))+' orders'
    # Extract the families
    for u22 in order:
        url3=url+u22
        a3=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u3=get_content(url3,a3)
        u3.pop(0)  # drop the two links back to the parent levels
        u3.pop(0)
        family.extend(u3)
    print 'It has '+str(len(family))+' families'
    # Extract the genera
    for u33 in family:
        url4=url+u33
        a4=re.compile(r'href="(/page/samples/taxon.+?.html)',re.I)
        u4=get_content(url4,a4)
        u4.pop(0)  # drop the three links back to the parent levels
        u4.pop(0)
        u4.pop(0)
        genus.extend(u4)
    print 'It has '+str(len(genus))+' genera'
    # Download the species images (pulled straight from each genus page)
    for u44 in genus:
        url5=url+u44
        print url5
        a5=re.compile(r'href="(/page/view/item/.+?.html)',re.I)
        b5=re.compile(r'this,event.+?">(.+?)</a>',re.I)
        u5=get_content(url5,a5)
        n5=get_content(url5,b5)  # breadcrumb names used to build the folder path for this genus
        pat=path
        for pa in n5:
            pat=pat+pa+'\\'
            create_folder(pat)  # create each level of the nested path in turn
        u5=set(u5)  # de-duplicated set of image-listing links for this genus
        # Get the number of image pages for this genus
        for u55 in u5:
            pp=True
            num=0  # give up after too many errors
            url6=url+u55
            # The Russian text here could not be matched with the regex below and I don't
            # know why; if any expert understands this, please advise!!
            '''
            a6=re.compile(r'из (.+?) найденных изображений')
            page=int(get_content(url6,a6)[0])/30+1
            '''
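            # A likely explanation (an educated guess, not verified against the site): under
            # Python 2 the pattern above is compiled from a *byte* string, while get_content()
            # matches it against the *unicode* text returned by requests, so the UTF-8 bytes of
            # the Cyrillic words never line up with the decoded code points and nothing matches.
            # Compiling the pattern as a unicode literal would probably fix it, e.g.:
            #   a6=re.compile(u'из (.+?) найденных изображений')
            # The split() workaround below gets away with byte-string separators because
            # sys.setdefaultencoding('utf-8') lets them be decoded implicitly.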
            # Use the split() function here as a workaround instead
            while pp and num<=10:
                try:
                    number=only_content(url6).split('найденных изображений')[0].split('Показаны')[1].split('из ')[1]
                    print number
                    page=int(number)/30+1  # 30 thumbnails per listing page (integer division under Python 2)
                    pp=False
                    for i in range(0,page):
                        url7=url6.replace('view/item','view/part/'+str(i)+'/item')
                        a7=re.compile(r'href="(/page/image/id/.+?.html)',re.I)
                        u7=get_content(url7,a7)
                        # Extract each individual image
                        for u77 in u7:
                            n+=1
                            url_every=url+u77
                            name_a=re.compile(r'<title>.+?([a-zA-Z]+ +[a-zA-Z]*).+?</title>',re.I)
                            image_a=re.compile(r'src="(.+?.jpg)" width=',re.I)
                            name=get_content(url_every,name_a)[0].strip()+'-'+str(n)+'.jpg'
                            print name
                            image_name=pat+name
                            image_url=url+get_content(url_every,image_a)[0]
                            download_image(image_url,image_name)
                            print str(n)+' now'
                except:
                    num+=1
                    print 'page is not get,please wait 2 seconds'
                    time.sleep(2)
    print 'all '+str(n)+' download over'