"""
The queue module is responsible for interacting with the local batch or
queueing system, putting tasks on the queue and removing them as necessary.
"""
import os
import time
import logging
from contextlib import contextmanager
import tornado.httpclient
import tornado.gen
from tornado.concurrent import run_on_executor
import certifi
import iceprod.server
from iceprod.server import module
from iceprod.server.master_communication import send_master
from iceprod.server.globus import SiteGlobusProxy
import iceprod.core.functions
class StopException(Exception):
pass
logger = logging.getLogger('modules_queue')
class queue(module.module):
"""
Run the queue module, which queues jobs onto the local grid system(s).
"""
def __init__(self,*args,**kwargs):
# run default init
super(queue,self).__init__(*args,**kwargs)
self.proxy = None
self.max_duration = 3600*12
def start(self):
"""Start the queue"""
super(queue,self).start()
# set up x509 proxy
proxy_kwargs = {}
if 'gridftp_cfgfile' in self.cfg['queue']:
proxy_kwargs['cfgfile'] = self.cfg['queue']['gridftp_cfgfile']
self.proxy = SiteGlobusProxy(**proxy_kwargs)
# set up job cacert
use_ssl = 'system' in self.cfg and 'ssl' in self.cfg['system'] and self.cfg['system']['ssl']
if (use_ssl and 'cert' in self.cfg['system']['ssl']):
if 'I3PROD' in os.environ:
remote_cacert = os.path.expandvars(os.path.join('$I3PROD','etc','remote_cacert'))
else:
remote_cacert = os.path.expandvars(os.path.join('$PWD','remote_cacert'))
with open(remote_cacert,'w') as f:
f.write(open(certifi.where()).read())
f.write('\n# IceProd local cert\n')
f.write(open(self.cfg['system']['ssl']['cert']).read())
self.cfg['system']['remote_cacert'] = remote_cacert
# some setup
self.plugins = []
plugin_names = [x for x in self.cfg['queue'] if isinstance(self.cfg['queue'][x],dict)]
plugin_cfg = [self.cfg['queue'][x] for x in plugin_names]
plugin_types = [x['type'] for x in plugin_cfg]
logger.info('queueing plugins in cfg: %r',{x:y for x,y in zip(plugin_names,plugin_types)})
if not plugin_names:
logger.debug('%r',self.cfg['queue'])
logger.warning('no queueing plugins found. deactivating queue')
self.stop()
return
# try to find plugins
raw_types = iceprod.server.listmodules('iceprod.server.plugins')
logger.info('available modules: %r',raw_types)
plugins_tmp = []
for i,t in enumerate(plugin_types):
t = t.lower()
p = None
for r in raw_types:
r_name = r.rsplit('.',1)[1].lower()
if r_name == t:
# exact match
logger.debug('exact plugin match - %s',r)
p = r
break
elif t.startswith(r_name):
# partial match
if p is None:
logger.debug('partial plugin match - %s',r)
p = r
else:
name2 = p.rsplit('.',1)[1]
if len(r_name) > len(name2):
logger.debug('better plugin match - %s',r)
p = r
if p is not None:
plugins_tmp.append((p,plugin_names[i],plugin_cfg[i]))
else:
logger.error('Cannot find plugin for grid %s of type %s',plugin_names[i],t)
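# Plugin matching sketch with hypothetical names: a configured type "condor"
# is an exact match for a plugin module named "condor"; a type such as
# "condor_special" only partial-matches modules whose name it starts with,
# and among several partial matches the longest module name wins.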
# instantiate all plugins that are required
gridspec_types = {}
if 'max_task_queued_time' in self.cfg['queue']:
self.max_duration += self.cfg['queue']['max_task_queued_time']
if 'max_task_processing_time' in self.cfg['queue']:
self.max_duration += self.cfg['queue']['max_task_processing_time']
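# Worked example with hypothetical values: max_task_queued_time=86400 and
# max_task_processing_time=86400 give max_duration = 3600*12 + 86400 + 86400
# = 216000 seconds; a larger per-plugin duration below can raise it further,
# and check_proxy() later converts the result to whole hours (here 60).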
for p,p_name,p_cfg in plugins_tmp:
logger.warning('queueing plugin found: %s = %s', p_name, p_cfg['type'])
# try instantiating the plugin
args = (self.cfg['site_id']+'.'+p_name, p_cfg, self.cfg,
self.modules, self.io_loop, self.executor, self.statsd)
try:
self.plugins.append(iceprod.server.run_module(p,*args))
except Exception as e:
logger.error('Error importing plugin',exc_info=True)
else:
desc = p_cfg['description'] if 'description' in p_cfg else ''
gridspec_types[self.cfg['site_id']+'.'+p_name] = {
'type': p_cfg['type'],
'description': desc,
}
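# gridspec keys take the form '<site_id>.<plugin_name>',
# e.g. 'site123.mygrid' (hypothetical values).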
duration = 0
if 'max_task_queued_time' in p_cfg:
duration += p_cfg['max_task_queued_time']
if 'max_task_processing_time' in p_cfg:
duration += p_cfg['max_task_processing_time']
if duration > self.max_duration:
self.max_duration = duration
@tornado.gen.coroutine
def cb():
# add gridspec and types to the db
try:
yield self.modules['db']['queue_set_site_queues'](
site_id=self.cfg['site_id'], queues=gridspec_types)
except Exception:
logger.warning('error setting site queues',exc_info=True)
# start queue loop
yield self.queue_loop()
self.io_loop.add_callback(cb)
@tornado.gen.coroutine
def queue_loop(self):
"""Run the queueing loop"""
try:
# check and clean grids
for p in self.plugins:
try:
yield p.check_and_clean()
except Exception:
logger.error('plugin %s.check_and_clean() raised exception',
p.__class__.__name__,exc_info=True)
# check proxy cert
try:
yield self.check_proxy(self.max_duration)
except Exception:
logger.error('error checking proxy',exc_info=True)
# buffer jobs and tasks for active datasets
if ('master' in self.cfg and 'url' in self.cfg['master']
and self.cfg['master']['url']):
gridspecs = [p.gridspec for p in self.plugins]
else:
# queue for all gridspecs, since we don't have a master
gridspecs = None
try:
yield self.buffer_jobs_tasks(gridspecs)
except Exception:
logger.error('error buffering jobs and tasks',
exc_info=True)
# queue tasks to grids
num_queued = 0
for p in self.plugins:
try:
yield p.queue()
num_queued += p.tasks_queued + p.tasks_processing
except Exception:
logger.error('plugin %s.queue() raised exception',
p.__class__.__name__,exc_info=True)
# do global queueing
try:
plugin_cfg = self.plugins[0].queue_cfg
# get num tasks to queue
tasks_on_queue = plugin_cfg['tasks_on_queue']
num = min(tasks_on_queue[1] - num_queued, tasks_on_queue[0])
if len(tasks_on_queue) > 2:
num = min(num, tasks_on_queue[2])
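# Worked example with hypothetical values: tasks_on_queue = [10, 100, 20]
# and num_queued = 85 give num = min(100 - 85, 10) = 10, then min(10, 20) = 10.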
if num > 0:
# get priority factors
qf_p = 1.0
qf_d = 1.0
qf_t = 1.0
if 'queueing_factor_priority' in self.cfg['queue']:
qf_p = self.cfg['queue']['queueing_factor_priority']
elif 'queueing_factor_priority' in plugin_cfg:
qf_p = plugin_cfg['queueing_factor_priority']
if 'queueing_factor_dataset' in self.cfg['queue']:
qf_d = self.cfg['queue']['queueing_factor_dataset']
elif 'queueing_factor_dataset' in plugin_cfg:
qf_d = plugin_cfg['queueing_factor_dataset']
if 'queueing_factor_tasks' in self.cfg['queue']:
qf_t = self.cfg['queue']['queueing_factor_tasks']
elif 'queueing_factor_tasks' in plugin_cfg:
qf_t = plugin_cfg['queueing_factor_tasks']
yield self.global_queueing(qf_p,qf_d,qf_t,num=num)
except Exception:
logger.error('error in global queueing', exc_info=True)
except Exception:
logger.error('queue_loop stopped because of exception',
exc_info=True)
else:
# set timeout
if 'queue' in self.cfg and 'queue_interval' in self.cfg['queue']:
timeout = self.cfg['queue']['queue_interval']
if timeout <= 0:
timeout = 300
else:
timeout = 300
self.io_loop.call_later(timeout, self.queue_loop)
@run_on_executor
def check_proxy(self, duration=None):
"""Check the x509 proxy"""
try:
if duration:
self.proxy.set_duration(duration//3600)
self.proxy.update_proxy()
self.cfg['queue']['x509proxy'] = self.proxy.get_proxy()
except Exception:
logger.warning('cannot setup x509 proxy', exc_info=True)
@tornado.gen.coroutine
def global_queueing(self, queueing_factor_priority=1.0,
queueing_factor_dataset=1.0,
queueing_factor_tasks=1.0,
num=100):
"""
Do global queueing.
Fetch tasks from the global server that match the local resources
and add them to the local DB. This is non-blocking, but only
one at a time can run.
:param queueing_factor_priority: queueing factor for priority
:param queueing_factor_dataset: queueing factor for dataset id
:param queueing_factor_tasks: queueing factor for number of tasks
:param num: number of tasks to queue
"""
if ('master' not in self.cfg or 'url' not in self.cfg['master'] or
not self.cfg['master']['url']):
logger.debug('no master url, so skip global queueing')
return
#resources = yield self.modules['db']['node_get_site_resources'](
# site_id=self.cfg['site_id'])
url = self.cfg['master']['url']
params = {#'resources':resources,
'queueing_factor_priority':queueing_factor_priority,
'queueing_factor_dataset':queueing_factor_dataset,
'queueing_factor_tasks':queueing_factor_tasks,
'num':num,
}
if 'group_filters' in self.cfg:
params['filters'] = self.cfg['group_filters']
ret = yield send_master(self.cfg, 'queue_master', **params)
yield self.modules['db']['misc_update_tables'](tables=ret)
@tornado.gen.coroutine
def buffer_jobs_tasks(self,gridspecs):
"""Make sure active datasets have jobs and tasks defined"""
buffer = self.cfg['queue']['task_buffer']
if buffer > 0:
yield self.modules['db']['queue_buffer_jobs_tasks'](gridspec=gridspecs,
num_jobs=buffer)