写了个爬虫程序,爬取某外国漫画网站的图片。由于网络环境不稳定,常常造成图片下载不完整的情况(我用流式传输的方式下载的)。
那么,怎么才能筛选出这些下载不完整的图片呢?
查阅资料后得知,jpg格式图片的二进制文件有固定的起始标识和结束标识,因此可以利用这一特点,逐一检查每个文件是否以结束标识结尾。
起始符:\xff\xd8……(SOI,Start of Image)
结束符:……\xff\xd9(EOI,End of Image)
因为我的图片都是jpg格式,所以只校验了结束符,没有进行起始符的校验。有需要的兄弟可以按同样的思路检查文件开头的两个字节。
废话不多说了,代码如下:
import os
import re
import sys
sys.path.insert(0, os.path.dirname(__file__))
from muses_8 import Muses_8
# Paths of the files found to be incomplete. Filled by
# IntegralityValidation.perform_validation and consumed by
# IntegralityValidation.run.
DELETED_FILES_LIST = []


class IntegralityValidation(object):
    """Find files that do not end with the JPEG end marker and delete them.

    The directory tree under ``base_path`` is walked recursively. Every
    regular file is checked, whatever its extension; a file whose last two
    bytes are not ``b'\\xff\\xd9'`` is recorded in ``DELETED_FILES_LIST`` and
    later removed by :meth:`run`.
    """

    # JPEG end-of-image (EOI) marker: the last two bytes of a complete file.
    _JPEG_EOI = b'\xff\xd9'

    def __init__(self):
        """Point ``base_path`` at the ``8muses`` folder above this script.

        The folder is looked up two levels above the directory that contains
        this file.
        """
        # Resolve to an absolute path first: with a relative ``__file__``
        # the repeated dirname() calls can collapse to '' and the result
        # would silently become '/8muses/' at the filesystem root.
        script_path = os.path.abspath(__file__)
        root_dir = os.path.dirname(
            os.path.dirname(os.path.dirname(script_path)))
        # The trailing '' keeps the trailing separator the path always had.
        self.base_path = os.path.join(root_dir, '8muses', '')

    def perform_looping(self, base_point_list, base_point):
        """Validate each file and recurse into each directory.

        Args:
            base_point_list: entry names located directly in ``base_point``.
            base_point: path of the directory the names belong to.
        """
        for point in base_point_list:
            point_full_path = os.path.join(base_point, point)
            if os.path.isfile(point_full_path):
                # Regular file: check that it is complete.
                self.perform_validation(point_full_path)
            elif os.path.isdir(point_full_path):
                # Directory: descend into it.
                self.perform_listting(point_full_path)

    def perform_listting(self, base_point):
        """List ``base_point`` and process every entry found in it."""
        self.perform_looping(os.listdir(base_point), base_point)

    def filter_dir_list(self, base_point, dir_pass_list):
        """Process ``base_point`` while skipping some first-level entries.

        Args:
            base_point: directory to scan.
            dir_pass_list: names directly inside ``base_point`` that must
                not be visited.
        """
        skipped_names = set(dir_pass_list)
        # Filter instead of subtracting sets so that entries are visited in
        # the order os.listdir() returned them.
        base_point_list = [
            name for name in os.listdir(base_point)
            if name not in skipped_names
        ]
        self.perform_looping(base_point_list, base_point)

    def perform_validation(self, file_path):
        """Record ``file_path`` as incomplete unless it ends with the marker.

        Only the last two bytes are read, and they are compared as bytes, so
        the result does not depend on the file size or on how Python would
        render the content as text. Files shorter than the marker (including
        empty files) are reported as incomplete.

        Args:
            file_path: path of the file to check.
        """
        marker = self._JPEG_EOI
        with open(file_path, 'rb') as image_file:
            size = image_file.seek(0, os.SEEK_END)
            if size < len(marker):
                is_complete = False
            else:
                image_file.seek(-len(marker), os.SEEK_END)
                is_complete = image_file.read(len(marker)) == marker
        if not is_complete:
            print('\033[93mimage not completely downloaded.ready to delete.\033[0m')
            DELETED_FILES_LIST.append(file_path)

    def run(self, dir_pass_list):
        """Scan ``base_path`` and delete every file recorded as incomplete.

        Args:
            dir_pass_list: first-level entry names under ``base_path`` that
                are excluded from the scan.
        """
        self.filter_dir_list(self.base_path, dir_pass_list)
        removed_list = []
        # Delete the damaged images one by one.
        for file_path in DELETED_FILES_LIST:
            try:
                os.remove(file_path)
            except FileNotFoundError:
                # Already gone: nothing is left to delete, so the entry is
                # still counted as removed below.
                print('file [%s] doesn\'t exist.' % file_path)
            except OSError:
                print('\033[91mremoving %s failed.\033[0m' % file_path)
                continue
            print('\033[96mDone.\033[0m')
            removed_list.append(file_path)
        failed_removing_list = set(DELETED_FILES_LIST) - set(removed_list)
        if failed_removing_list:
            print('some files not deleted:')
            print(failed_removing_list)
        else:
            print('\033[96m' + '-' * 50 + 'removing done.' + '-' * 50 + '\033[0m')
if __name__ == '__main__':
    # Listing pages that were already downloaded completely; the titles
    # found on them are excluded from the scan.
    # NOTE(review): presumably these titles match the first-level folder
    # names under the base path - confirm against Muses_8.
    finished_pages = [1, 2, 3, 4]
    dir_pass_list = [
        title
        for page_num in finished_pages
        for title in Muses_8().get_titles_in_this_page(page_num)
    ]
    validation_tool = IntegralityValidation()
    validation_tool.run(dir_pass_list)