__author__ = 'AllenMinD'
import requests, urllib, os
from bs4 import BeautifulSoup

ans = 1  # running count of downloaded pictures
for page in range(0, 43):
    flag = 1  # 1 if this slide page has pictures, 0 if it is empty
    # the page number is zero-padded to two digits in the URL
    if page < 10:
        url = 'http://bbs.fengniao.com/forum/pic/slide_101_8903443_8017670' + str(page) + '.html'
    else:
        url = 'http://bbs.fengniao.com/forum/pic/slide_101_8903443_801767' + str(page) + '.html'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml')
    file_name = ''
    download_link = []
    for pic_tag in soup.find_all('a'):
        if pic_tag.get('href') == '/forum/8903443.html':
            file_name = pic_tag.get('title')  # the photo set's title names the folder
        if pic_tag.get('class') == ['pictureDownload']:
            if pic_tag.get('href') == '':  # no picture behind this link
                flag = 0
                break
            else:
                download_link.append(pic_tag.get('href'))
    if flag == 0:  # empty page: move on to the next one
        continue
    folder_path = 'D:/spider_things/2016.4.8/' + file_name + '/'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    for item in download_link:
        try:
            urllib.urlretrieve(item, folder_path + str(ans) + '.jpg')
            print 'you have downloaded', ans, 'pic(s)'
            ans = ans + 1
        except urllib.ContentTooShortError, e:  # the download came up short: skip this picture
            continue
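As a side note, the two-branch URL construction only exists to zero-pad the page number to two digits; str(page).zfill(2) does the same thing in one expression. A minimal sketch of that simplification (it builds the same slide URLs, just differently):

for page in range(0, 43):
    # zfill(2) turns 0..9 into '00'..'09', so one string covers every page
    url = 'http://bbs.fengniao.com/forum/pic/slide_101_8903443_801767' + str(page).zfill(2) + '.html'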
This time I again scraped pictures from Fengniao (蜂鸟网), but ran into two new problems along the way:
1. Empty pages:
In some of Fengniao's photo sets the picture links are not consecutively numbered, so an if statement is needed to skip the links that have no picture behind them (a flag-free variant is sketched after this excerpt):
if pic_tag.get('href') == '':  # no picture behind this link
    flag = 0
    break
.....
if flag == 0:  # empty page: move on to the next one
    continue
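Incidentally, the flag variable is not strictly necessary: Python's for...else runs the else block only when the loop finishes without hitting break, which expresses the same skip. A sketch of that variant (save_pictures is a hypothetical helper standing in for the folder-creation and download steps, not something in the original script):

for pic_tag in soup.find_all('a'):
    if pic_tag.get('class') == ['pictureDownload']:
        if pic_tag.get('href') == '':
            break  # empty page: abandon the scan
        download_link.append(pic_tag.get('href'))
else:
    # only reached when the loop never hit break, i.e. the page has pictures
    save_pictures(download_link)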
2. Error handling:
While scraping this batch I found that some pictures made urllib.urlretrieve raise urllib.ContentTooShortError — the error urlretrieve raises when the data it receives is shorter than what the Content-Length header promised, for example when a large download gets cut off.
In that case, try...except is used to handle it.
The general shape of try...except (in Python 2 syntax) is:
try:
    ......
except ErrorType, e:  # ErrorType is e.g. urllib.ContentTooShortError
    ......
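For reference, the script above is Python 2 (print statement, "except Type, e" syntax, urllib.urlretrieve). In Python 3, urlretrieve moved to urllib.request and ContentTooShortError to urllib.error, so the same download-and-skip pattern looks roughly like this (a minimal self-contained sketch, not a tested port of the whole spider):

import urllib.request
import urllib.error

def fetch_picture(link, dest):
    # download one picture; report whether it arrived in full
    try:
        urllib.request.urlretrieve(link, dest)
        return True
    except urllib.error.ContentTooShortError:
        # fewer bytes arrived than the Content-Length header promised
        return False

Calling fetch_picture(item, folder_path + str(ans) + '.jpg') inside the download loop and continuing whenever it returns False reproduces the original behaviour.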