# 经过一个晚上的不懈努力(查书,百度,知乎,开源中国,……),终于完成了我的第一只超简单图片爬虫程序。这只小东西是基于Python 3.x实现的,能够下载指定网页下大于50KB的图片到本地。但它的行动力相当之弱,竟不能在不同的网页间流窜。虽然它只是一只刚刚出生的幼虫,却还是有一些妙用的,同时我还是期待它的成长!
#coding=utf-8
# James Yin <ywhjames@hotmail.com>
# Image crawler: fetch the jpg/png/gif images on a given web page, skipping images smaller than 50KB
import re
import urllib.request
import time
def get_file_size(url, proxy=None):
    """Return the size of a remote file via its Content-Length header.

    Issues an HTTP HEAD request so no body bytes are transferred.

    url   - target file URL
    proxy - optional proxy address; registered for the scheme (http/https)
            matching the target URL

    Returns the Content-Length header value on success, or 0 when the
    request fails or the header is absent (so callers can int() the result).
    """
    opener = urllib.request.build_opener()
    if proxy:
        # Register the proxy under the scheme that matches the target URL.
        if url.lower().startswith('https://'):
            opener.add_handler(urllib.request.ProxyHandler({'https': proxy}))
        else:
            opener.add_handler(urllib.request.ProxyHandler({'http': proxy}))
    request = urllib.request.Request(url)
    request.get_method = lambda: 'HEAD'  # HEAD: headers only, no body
    try:
        response = opener.open(request)
        response.read()
    except Exception as e:
        # Bug fix: the original `print('%s %s') % (url, e)` applied `%` to
        # print()'s return value (None), raising TypeError. Also return 0
        # instead of falling through to an implicit None, which crashed
        # the caller's int(...) conversion.
        print('%s %s' % (url, e))
        return 0
    else:
        return dict(response.headers).get('Content-Length', 0)
def getHtml(url):
    """Fetch *url* and return the response body decoded to text.

    Decoding tries UTF-8 first and falls back to GBK, which is common
    for Chinese web pages.

    Raises UnicodeDecodeError if the body is neither UTF-8 nor GBK.
    """
    # Use a context manager so the HTTP response/socket is always closed
    # (the original leaked the connection by never closing `page`).
    with urllib.request.urlopen(url) as page:
        print(page.info())
        html_t = page.read()
    try:
        html = html_t.decode("UTF-8")
    except UnicodeDecodeError as err:
        print("Error:", err)
        print("Try decode with 'GBK' charSet.")
        html = html_t.decode("GBK")  # fallback for Chinese-encoded pages
    return html
def getImg(html):
    """Find image URLs in *html* and download those of at least ~50 KB.

    Matches <img> tags whose src/file attribute points at a jpg/png/gif,
    whether the attribute value is double-quoted, single-quoted, or bare.
    Each file is saved in the working directory as
    <YYYY-mm-dd_HH-MM-SS>_<index>.<ext>.
    """
    # One alternative per quoting style; each fills a different group,
    # so findall() yields 3-tuples with exactly one non-empty element.
    imgre = re.compile(r"""
        <img\s+
        (?:[^>]+\s+)?
        (?:src|file)=
        (?:
            "(https?://[^">]+\.(?:jpg|png|gif))"
            |
            '(https?://[^'>]+\.(?:jpg|png|gif))'
            |
            (https?://[^">']+\.(?:jpg|png|gif))
        )
        [^>]*>
        """, re.IGNORECASE | re.VERBOSE)
    imglist = imgre.findall(html)
    x = 0  # running index keeps names unique within the same second
    for imgurl in imglist:
        for url in imgurl:
            if not url:
                continue  # skip the two empty groups of each match
            # Size filter. Bug fix: get_file_size may yield None or a
            # non-numeric value; the original int() then raised TypeError
            # (uncaught) or ValueError that aborted EVERY remaining
            # download. Handle it per-URL and keep going.
            try:
                size = int(get_file_size(url) or 0)
            except (TypeError, ValueError) as err:
                print("Error:", err)
                continue
            if size < 50000:
                continue
            print("download:", url)
            filename = time.strftime('%Y-%m-%d_%H-%M-%S',
                                     time.localtime(time.time()))
            filename += '_' + str(x)
            # Preserve the source extension (default to .gif).
            if url[-3:] == 'jpg':
                filename += '.jpg'
            elif url[-3:] == 'png':
                filename += '.png'
            else:
                filename += '.gif'
            try:
                urllib.request.urlretrieve(url, filename)
            except urllib.error.ContentTooShortError as err:
                print("Error:", err)
            x += 1
if __name__ == '__main__':
    # CLI driver: prompt for a page URL, crawl it once, then exit.
    target = input("please input web address:")
    getImg(getHtml(target))
    print("end!")