# 该程序为下载 http://www.orchidspecies.com/ 兰花网站图片,并以名字命名图片的小爬虫。
# requests,chardet 第三方模块需要自己下载。
# -*- coding: utf-8 -*-
import re,os,requests,urllib2,chardet,time,sys
# Python 2 hack to re-expose sys.setdefaultencoding(): the symbol is deleted
# during startup, and reload(sys) brings it back.  reload(sys) also replaces
# the standard streams (NOTE(review): presumably to keep IDLE/redirected
# streams working — the code saves and restores them around the reload).
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# Make UTF-8 the implicit str<->unicode conversion codec for this script.
sys.setdefaultencoding('utf-8')
# Fetch a page and extract the wanted fragments with a compiled regex.
def get_content(url, reg):
    """Download *url* and return every match of the compiled pattern *reg*."""
    # A desktop Chrome User-Agent so the site serves the normal page.
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    response = requests.get(url, timeout=20, headers=headers)
    return reg.findall(response.text)
#获取网页源代码(用于转码)-为了解决http://www.orchidspecies.com/indexcattleyo.htm乱码
def for_change(url,reg):
headers={'User-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
request=urllib2.Request(url,headers=headers)
req=urllib2.urlopen(request,timeout=20)
res=req.read()
enc=chardet.detect(res)['encoding']
print u'该网页使用'+enc+u'编码'
content=res.decode(enc).encode('utf-8')
want=reg.findall(content)
return want
# Create a local folder for the downloads.
def create_folder(path):
    """Create directory *path* unless it already exists."""
    if os.path.exists(path):
        return
    os.mkdir(path)
# Save one image to disk.
def download_image(imageurl, imagename):
    """Fetch the image at *imageurl* and write its bytes to *imagename*."""
    payload = requests.get(imageurl, timeout=20).content
    with open(imagename, 'wb') as out:
        out.write(payload)
# Append a record to a plain-text backup/log file.
def create_txt(txtname, data):
    """Append *data* to *txtname*, creating the file on first use."""
    with open(txtname, 'a') as log:
        log.write(data)
#下载每个种
def load_picture(everyurl,url,path,n):
p3=True
x=1
a3=re.compile(r'src="(.+?\.\w{3})"',re.I)
#获取每个种的网址和名字
if everyurl.find('">')!=-1:
picurl=everyurl.split('">')[0]
name=' '.join(everyurl.split('">')[1].strip().split())
name=name.replace(' x ',u' × ').replace('<P>','').replace("?","").replace("!","")
if name.find(u' × ')!=-1:
name=name.split()[0]+' '+name.split()[1]+' '+name.split()[2]
else:
name=name.split()[0]+' '+name.split()[1]
#创建种的文件夹
if not os.path.exists(path+name):
os.mkdir(path+name)
print name
#获取图片网址并下载
while p3:
try:
u4=get_content(url+'/'+picurl,a3)
p3=False
for u5 in u4:
p4=True
if u5 not in('orphotdir/scent.jpg',
'orphotdir/deepshade.jpg',
'orphotdir/partialshade.jpg',
'orphotdir/partialsun.jpg',
'orphotdir/sun.jpg',
'orphotdir/tempcold.jpg',
'orphotdir/tempcool.jpg',
'orphotdir/tempint.jpg',
'orphotdir/temphot.jpg',
'orphotdir/spring.jpg',
'orphotdir/summer.jpg',
'orphotdir/fall.jpg',
'orphotdir/winter.jpg'):
while p4:
try:
imageurl=url+'/'+u5
imagename=path+name+"\\%s %s-%s.jpg" % (name,str(n),str(x))
download_image(imageurl,imagename)
print str(n)+'-'+str(x)
x+=1
p4=False
except:
print str(n)+'-'+str(x)+' is not download,please wait 10 second!'
time.sleep(10)
p3=False
except:
txtname=u'出错.txt'
data=url+'/'+picurl+' '+name+' '+time.strftime('%Y-%m-%d %X', time.localtime())+'\n'
with open(txtname,'a') as f:
f.write(data)
print u'第'+str(n)+u'个种网页获取失败,请稍候10秒'
time.sleep(10)
if __name__ == '__main__':
path='D:\\orchid_only\\'
create_folder(path)
n=0 #计数
alll=[] #存放所有种网址
#提取一级网址
url="http://www.orchidspecies.com"
a1=re.compile(r'SIZE=2><A href="(index\w.+?)">',re.I)
p1=True
print url
while p1:
try:
u1=content1=get_content(url,a1)
u1=list(set(u1))
print u'获取一级网址成功,开始提取二级网址'
p1=False
except:
print u'获取一级网址失败,10秒后重新连接'
time.sleep(10)
#提取二级网址
a2=re.compile(r'<P><LI><a href="(.+?)</A>',re.I)
for u2 in u1:
u2=url+'/'+u2
p2=True
print u2
while p2:
try:
u3=get_content(u2,a2)
print len(u3)
if len(u3)==0:
u3=for_change(u2,a2)
print len(u3)
alll.extend(u3)
print u'获取二级网址成功,存放成功'
p2=False
except:
print u'获取二级网址失败,10秒后重新连接'
time.sleep(10)
#提取每个种的图片
for everyurl in alll:
n+=1
print u'正在下载第'+str(n)+u'个种'
load_picture(everyurl,url,path,n)
print 'over,共下载兰花'+str(n)+'种'