Sometimes you need to download images automatically, and when there are a lot of them, saving them one by one is tedious. The script below crawls a site, follows its links, and saves every image it finds. It targets Python 2 and shells out to curl, writing the downloaded files into ./img/.
Take it and use it if you need it.
# -*- coding: utf-8 -*-
'''
author: Derry
date: 2015.1.19
'''
from HTMLParser import HTMLParser
import urllib
import random
import os
import re
def saveImage(host, url):
    '''Shell out to curl to download one image into ./img/.'''
    f_name = ''
    try:
        # Prefix a random number so images with the same basename don't collide
        splitPath = url.split('/')
        f_name = "%d_" % random.randint(1, 99999) + splitPath.pop()
        # Relative src attributes need the host prepended to form a full URL
        if re.match('^http', url) is None:
            url = 'http://' + host + "/" + url
            print 'fixed image url=', url
        cmd = 'curl -o ./img/%s %s' % (f_name, url)
        os.system(cmd)
    except Exception, e:
        print "[Error]couldn't download %s: %s" % (f_name, e)
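# If curl is not installed, the standard library can fetch the image
# directly.  A minimal curl-free sketch (saveImageUrllib is a hypothetical
# name, not part of the original script); it applies the same host-fixing
# rule as saveImage above:
def saveImageUrllib(host, url):
    f_name = "%d_" % random.randint(1, 99999) + url.split('/')[-1]
    if re.match('^http', url) is None:
        url = 'http://' + host + "/" + url
    try:
        urllib.urlretrieve(url, './img/' + f_name)
    except Exception, e:
        print "[Error]couldn't download %s: %s" % (f_name, e)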
def getHost(url):
    '''Extract the host part from an absolute http URL.'''
    if re.match('^http', url) is None:
        return ""
    # Strip the 'http://' prefix (7 characters); segs[0] is then the host
    segs = url[7:].split('/')
    return segs[0]
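# Note: the slice above assumes the URL begins with exactly 'http://'
# (7 characters), so an https URL would come back mangled.  A sturdier
# sketch using the Python 2 stdlib:
#   from urlparse import urlparse
#   host = urlparse(url).netloc   # handles https, ports, etc.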
class MyParser(HTMLParser):
    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url_list = []
        self.url = url
        self.host = getHost(url)

    def handle_starttag(self, tag, attrs):
        # Collect candidate links to crawl next
        if tag == 'a' and attrs:
            for key, value in attrs:
                if key == 'href':
                    # Skip very short hrefs (anchors etc.) and javascript: links
                    if len(value) < 10 or value.find('javascript') != -1:
                        continue
                    self.url_list.append(value)
        # Download every image the page references
        if tag == 'img' and attrs:
            for key, value in attrs:
                if key == 'src':
                    print 'img url=', value
                    saveImage(self.host, value)
    def getUrlList(self):
        '''Return the collected hrefs as absolute URLs, skipping self-links.'''
        v_list = []
        for url in self.url_list:
            if url.find(self.url) == -1:
                # Relative links need the host prepended
                if re.match('^http', url) is None:
                    url = 'http://' + self.host + "/" + url
                    print 'fixed url=', url
                v_list.append(url)
            else:
                print 'skipping own url', url
        return v_list
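# A quick way to sanity-check the parser (sketch; example.com is only an
# illustration):
#   p = MyParser('http://example.com')
#   p.feed('<a href="/some/long/path.html">x</a>')
#   print p.getUrlList()  # -> ['http://example.com//some/long/path.html']
# (Note the double slash: relative hrefs that already start with '/' keep it.)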
all_urls = ['http://www.hao123.com']  # seed URL to start crawling from
history_urls = []

# curl writes into ./img/, so make sure the directory exists first
if not os.path.exists('./img'):
    os.makedirs('./img')

while len(all_urls) > 0:
    # Take the next queued URL, skipping any we have already visited
    cur_url = all_urls.pop(0)
    if cur_url in history_urls:
        continue
    history_urls.append(cur_url)
    print 'visiting url [%s]' % (cur_url)
    try:
        page = urllib.urlopen(cur_url).read()
    except Exception, e:
        print 'url open error', e
        continue
    parser = MyParser(cur_url)
    try:
        parser.feed(page)  # images on the page are downloaded as a side effect
    except Exception, e:
        print 'feed error', e
        continue
    # Queue the newly discovered links for the next rounds
    all_urls.extend(parser.getUrlList())
    parser.close()
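To run it, save the script, make sure curl is on the PATH, and launch it with Python 2; downloads land in ./img/. One caveat: the loop only stops once the frontier of unvisited links empties, which on a large site effectively never happens. A minimal stopping condition (a sketch; MAX_PAGES is a hypothetical constant, not in the original script) is to cap the visit count at the top of the while loop:

    MAX_PAGES = 100  # hypothetical limit; pick whatever suits your site
    if len(history_urls) >= MAX_PAGES:
        break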