2021SC@SDUSC 山大智云 8.offline_download
offline_download是基于php和bash的离线下载神器。可以正确地为你下载网站,指定要与不要的路径链接部分,或是特定文件格式。
介绍关于offline_download功能的实现。
结构
model
db_oper
thread_pool
offline_download
offline_download_settings
handlers
model.py
该文件主要是定义关于该功能的数据库模型
OfflineDownloadRecord(Base)
odr_id(主键,自增)
repo_id
path
url
owner
timestamp
size
status
comment
OfflineDownloadStatus(object)
定义下载时可能出现的状态
UNKNOWN = 0
WAITING = 1
QUEUING = 2
DOWNLOADING = 3
OK = 4
ERROR = 5
TLE = 6 # Time limit exceed
db_oper.py
目录结构
class DBOper
def add_offline_download_record
def get_offilne_download_tasks
def get_offline_download_tasks_by_user
def get_record_status
def get_record_comment
def get_record_file_size
6个方法都是对数据库进行相关操作
添加操作
session.add(...)
session.commit()
查询操作
q=session.query()
...对q进行一系列操作(排序,slice)
return q.all()
or
r=q.first(),然后返回r的具体属性
class DBOper(object)
对象结构
def __init__(self, settings):
    """Store the two session factories taken from *settings*.

    Args:
        settings: object exposing ``session_cls`` and ``seaf_session_cls``
            (presumably the offline-download settings object -- confirm
            against the caller).
    """
    self.edb_session = settings.session_cls
    self.seafdb_session = settings.seaf_session_cls
def set_record_status(self, odr_id, status, comment=None):
def set_record_comment(self, odr_id, comment):
# When download success, the path will be set to file path.
def set_record_path(self, odr_id, path):
def get_record_comment(self, odr_id):
def get_offline_download_tasks_by_status(self, status=OfflineDownloadStatus.WAITING):
def set_record_file_size(self, odr_id, size):
每个函数的操作类似,都是对数据库进行一定的操作
thread_pool.py
定义线程池,线程池与病毒扫描中的定义一样,故不再介绍。
offline_download_settings.py
文件结构
__init__(self,config_file):初始化
parse_config(self,config_file):解析配置
def is_enabled(self):返回是否有文件下载功能
def __init__(self, config_file):
    """Set the built-in defaults, then let ``parse_config`` override them.

    Args:
        config_file: path of the config file handed to ``parse_config``.
    """
    self.enable_offline_download = False
    self.temp_dir = '/tmp/offline-download'
    self.max_workers = 10  # default: 10 workers
    self.time_limit = 30 * 60  # default: 30 minutes, expressed in seconds
    self.session_cls = None
    self.seaf_session_cls = None
    self.parse_config(config_file)
init方法定义了一系列变量。
def parse_config(self, config_file):
    """Read *config_file* and copy the offline-download options onto self.

    The feature stays disabled (``enable_offline_download`` remains
    unchanged) when the file cannot be read, when the parsed options say
    it is not enabled, or when the DB session classes cannot be obtained
    from ``appconfig``.

    Args:
        config_file: path of the config file to read.
    """
    try:
        cfg = ConfigParser()
        cfg.read(config_file)
    except Exception as e:
        logger.error('Failed to read events config, disable offline download: %s', e)
        return

    conf = get_offline_download_conf(cfg)
    if not conf['enabled']:
        return

    try:
        self.session_cls = appconfig.session_cls
        self.seaf_session_cls = appconfig.seaf_session_cls
    except Exception as e:
        logger.warning('Failed to init db session class: %s', e)
        return

    # Only flip the switch once every prerequisite above has succeeded.
    self.enable_offline_download = True
    self.temp_dir = conf['tempdir']
    self.max_workers = conf['workers']
    self.time_limit = conf['time-limit']
parse_config方法解析配置。对init中的属性进行设置。
def is_enabled(self):
    """Return the ``enable_offline_download`` flag."""
    return self.enable_offline_download
is_enabled方法返回enable_offline_download属性。该属性定义了是否拥有offline_download功能。
offline_download.py
文件结构
OfflineDownloadTask:定义了一些属性
(odr_id,repo_id,path,url,owner)
OfflineDownLoad:
__init__:定义了settings,DBOper,thread_pool
restore()
start()
tle_call_back()
download_file()
OfflineDownloadTimeLimitTimer(Thread)
__init__:定义了settings,finished,downloader
run()
cancel()
补充:event解决其他线程需要通过某个线程的状态来确定下一步的操作的线程同步问题。(与信号量类似)
event.isSet():返回event的状态值;
event.wait():如果 event.isSet()==False将阻塞线程;
event.set(): 设置event的状态值为True,所有阻塞池的线程激活进入就绪状态, 等待操作系统调度;
event.clear():恢复event的状态值为False。
class OfflineDownloadTimeLimitTimer(Thread):
    """Thread that calls ``downloader.start()`` at a fixed interval.

    Despite the class name, the loop below does not enforce a time limit:
    it wakes up every ``interval`` seconds and calls ``downloader.start()``
    until ``cancel()`` is called.
    """

    def __init__(self, downloader, settings, interval=5):
        """Create the timer thread (not started yet).

        Args:
            downloader: object whose ``start()`` is called on every tick.
            settings: stored on the instance; not read by this class.
            interval: seconds between two ticks (defaults to 5).
        """
        super().__init__()
        self.settings = settings
        self.finished = Event()
        self.downloader = downloader
        self.interval = interval

    def run(self):
        """Tick until ``cancel()`` sets the ``finished`` event."""
        # Event.wait() returns True as soon as the flag is set and False on
        # timeout, so a False result means "not cancelled, time for a tick".
        while not self.finished.wait(self.interval):
            self.downloader.start()

    def cancel(self):
        """Ask the loop in ``run()`` to stop."""
        self.finished.set()
class OfflineDownloadTimeLimitTimer(Thread):
    """Thread that calls ``downloader.start()`` at a fixed interval.

    Despite the class name, the loop below does not enforce a time limit:
    it wakes up every ``interval`` seconds and calls ``downloader.start()``
    until ``cancel()`` is called.
    """

    def __init__(self, downloader, settings, interval=5):
        """Create the timer thread (not started yet).

        Args:
            downloader: object whose ``start()`` is called on every tick.
            settings: stored on the instance; not read by this class.
            interval: seconds between two ticks (defaults to 5).
        """
        super().__init__()
        self.settings = settings
        self.finished = Event()
        self.downloader = downloader
        self.interval = interval

    def run(self):
        """Tick until ``cancel()`` sets the ``finished`` event."""
        # Event.wait() returns True as soon as the flag is set and False on
        # timeout, so a False result means "not cancelled, time for a tick".
        while not self.finished.wait(self.interval):
            self.downloader.start()

    def cancel(self):
        """Ask the loop in ``run()`` to stop."""
        self.finished.set()
class OfflineDownload
restore:检查并恢复所有中断的任务。首先添加下载状态任务,然后添加排队任务。
# Re-submit interrupted work to the thread pool: first the tasks recorded as
# DOWNLOADING, then the ones recorded as QUEUING.
for status in (OfflineDownloadStatus.DOWNLOADING, OfflineDownloadStatus.QUEUING):
    for row in self.db_oper.get_offline_download_tasks_by_status(status):
        self.thread_pool.put_task(
            OfflineDownloadTask(row.odr_id, row.repo_id, row.path, row.url, row.owner))
start():获取所有处于WAITING状态的任务,将其状态设置为QUEUING并放入线程池。
task_list = self.db_oper.get_offline_download_tasks_by_status(OfflineDownloadStatus.WAITING)
# None is not an Iterable, so it falls through to the logging branch instead
# of raising TypeError in the for loop.
if isinstance(task_list, Iterable):
    for row in task_list:
        # Mark the record as QUEUING before handing it to the pool.
        self.db_oper.set_record_status(row.odr_id, OfflineDownloadStatus.QUEUING)
        self.thread_pool.put_task(
            OfflineDownloadTask(row.odr_id, row.repo_id, row.path, row.url, row.owner))
else:
    logger.debug("[Offline Download] Got an noniterable response from database: %s.", task_list)
tle_call_back:将下载状态设置为TLE(超时)状态
download_file:下载文件的方法
self.db_oper.set_record_status(download_task.odr_id, OfflineDownloadStatus.DOWNLOADING)
# The record's comment field is used here to remember the temp dir of a task.
tdir = self.db_oper.get_record_comment(download_task.odr_id)
if tdir and os.path.isdir(tdir):
    logger.debug("Using old temp dir '%s' for task '%d'", tdir, download_task.odr_id)
else:
    # No usable directory recorded yet: create one and store its path.
    tdir = tempfile.mkdtemp(dir=self.settings.temp_dir)
    self.db_oper.set_record_comment(download_task.odr_id, tdir)
    logger.debug("Created temp dir '%s' for task '%d'", tdir, download_task.odr_id)
将任务状态设置为DOWNLOADING,并通过记录的comment字段获取或保存临时目录(set_record_comment)。
log_dir = os.path.join(os.environ.get('SEAFEVENTS_LOG_DIR', ''))
logfile = os.path.join(log_dir, 'offline_download.log')
with open(logfile, 'a') as fp:
logger.debug("Setting %s as log space.", logfile)
logger.debug("Executing: aria2c -c --dir \"%s\" \"%s\"", tdir, download_task.url)
try:
subprocess.call(['aria2c', '-c', '--dir', tdir, download_task.url],
stdout=fp, stderr=fp, timeout=self.settings.time_limit)
except subprocess.TimeoutExpired:
self.db_oper.set_record_status(download_task.odr_id, OfflineDownloadStatus.TLE)
return
调用aria2c下载文件,输出写入日志文件;若超过time_limit仍未完成,则将状态设置为TLE并返回。
file_list = os.listdir(tdir)
# Exactly one file is expected in the temp dir after the download.
if len(file_list) != 1:
    raise Exception('No file downloaded')

tfile_name = file_list[0]
tfile_path = os.path.join(tdir, tfile_name)
if not os.path.exists(tfile_path):
    raise Exception('File has lost')

seafile_api.post_file(
    download_task.repo_id, tfile_path,
    download_task.path, tfile_name, download_task.owner
)
self.db_oper.set_record_file_size(download_task.odr_id, os.path.getsize(tfile_path))
# Recorded path = target path + file name, adding a '/' separator only when
# the target path does not already end with one.
separator = '' if download_task.path.endswith('/') else '/'
self.db_oper.set_record_path(download_task.odr_id,
                             download_task.path + separator + tfile_name)
self.db_oper.set_record_status(download_task.odr_id, OfflineDownloadStatus.OK)
检查临时目录中是否恰好只有一个文件,通过seafile_api.post_file上传该文件,然后在数据库中记录文件大小和路径,并将状态设置为OK。
except Exception as e:
logger.warning('Failed to do offline download for task %d: %s.',
download_task.odr_id, e)
self.db_oper.set_record_status(download_task.odr_id, OfflineDownloadStatus.ERROR,
"Download worker error: %s" % e)
finally:
if tdir is not None and len(tdir) > 0:
file_list = os.listdir(tdir)
for item in file_list:
os.unlink(os.path.join(tdir, item))
os.rmdir(tdir)
异常处理与finally清理:出现异常时将状态设置为ERROR并记录原因;无论成功与否,最后都会删除临时目录及其中的文件。
handlers.py
def OfflineDownloadEventHandler(session, msg):
    """Turn an incoming message into an offline-download record.

    ``msg['content']`` must hold five tab-separated fields; the first one
    is not used here (presumably the message type -- confirm), the other
    four are repo id, path, user name and url. Malformed messages are
    logged and ignored.

    Args:
        session: passed through to ``add_offline_download_record``.
        msg: mapping with a ``'content'`` string.
    """
    elements = msg['content'].split('\t')
    if len(elements) != 5:
        logging.warning("got bad message: %s", elements)
        # elements[0] is never read as an argument, so it is left out of
        # the reported count to match the "4 arguments" wording.
        logging.debug("Expected 4 arguments, found %d.", len(elements) - 1)
        return

    _, repo_id, path, user_name, url = elements
    add_offline_download_record(session, repo_id, path, user_name, url)
def register_handlers(handlers):
    """Register ``OfflineDownloadEventHandler`` on *handlers*.

    Args:
        handlers: object with an ``add_handler(name, callback)`` method.
    """
    handlers.add_handler('seahub.stats:offline-file-upload', OfflineDownloadEventHandler)
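OfflineDownloadEventHandler解析消息内容并添加一条离线下载记录;register_handlers将该处理函数注册到'seahub.stats:offline-file-upload'消息上。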
至此,我们基本了解了offline_download功能的实现结构。
model定义数据库模型,db_oper定义操作数据库的方法,thread_pool定义线程池,offline_download_settings负责解析配置,offline_download实现下载功能,handlers负责注册消息处理函数。