"""
Queue database methods
"""
import time
import logging
from datetime import datetime
from functools import partial,reduce
import operator
from collections import OrderedDict, defaultdict, Counter, Iterable
import math
import random
import tornado.gen
from iceprod.core.dataclasses import Number,String
from iceprod.core.exe import Config
from iceprod.core import serialization
from iceprod.core.jsonUtil import json_encode,json_decode,json_compressor
from iceprod.server import GlobalID
from iceprod.server.dbmethods import _Methods_Base,datetime2str,str2datetime,nowstr
from iceprod.server import task_queue
logger = logging.getLogger('dbmethods.queue')
[docs]class queue(_Methods_Base):
"""
The Queue DB methods.
Takes a handle to a subclass of iceprod.server.modules.db.DBAPI
as an argument.
"""
def _queue_get_task_from_ret(self, ret):
tasks = OrderedDict()
try:
for row in ret:
dict_row = self._list_to_dict('task',row)
dict_row['status_changed'] = str2datetime(dict_row['status_changed'])
tasks[row[0]] = dict_row
except Exception:
return {}
else:
return tasks
[docs] @tornado.gen.coroutine
def queue_set_site_queues(self, site_id, queues):
"""
Set the site queues
Args:
site_id (str): The site id
queues (dict): The new site queues
"""
with (yield self.parent.db.acquire_lock('site')):
sql = 'select * from site where site_id = ?'
bindings = (site_id,)
ret = yield self.parent.db.query(sql, bindings)
if len(ret) > 0 and len(ret[0]) > 0:
# already a site entry, so just update
try:
old_site = self._list_to_dict('site',ret[0])
old_queues = json_decode(old_site['queues'])
for k in set(queues) & set(old_queues):
try:
for kk in set(old_queues[k]['resources']) - set(queues[k]['resources']):
queues[k]['resources'][kk] = old_queues[k]['resources'][kk]
except Exception:
try:
queues[k]['resources'] = old_queues[k]['resources']
except Exception:
queues[k]['resources'] = {}
queues = json_encode(queues)
except Exception:
logger.warning('set_site_queues(): cannot encode queues to json')
raise
sql = 'update site set queues = ? where site_id = ?'
bindings = (queues,site_id)
else:
# add a new site entry
try:
queues = json_encode(queues)
except Exception:
logger.warning('set_site_queues(): cannot encode queues to json')
raise
sql = 'insert into site (site_id,queues) values (?,?)'
bindings = (site_id,queues)
yield self.parent.db.query(sql, bindings)
if self._is_master():
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
bindings3 = (master_update_history_id,'site',site_id,nowstr())
try:
yield self.parent.db.query(sql3, bindings3)
except Exception:
logger.info('error updating master_update_history',
exc_info=True)
else:
yield self._send_to_master(('site',site_id,nowstr(),sql,bindings))
[docs] @tornado.gen.coroutine
def queue_get_active_tasks(self, gridspec=None):
"""
Get a dict of active tasks (waiting,queued,processing,reset,resume).
Args:
gridspec (str): The gridspec (None for master)
Returns:
dict: {status:{task_id:task}}
"""
try:
sql = 'select task_id from search where '
sql += 'task_status in ("waiting","queued","processing","reset","resume")'
if gridspec:
sql += ' and gridspec like ?'
bindings = ('%'+gridspec+'%',)
else:
bindings = tuple()
ret = yield self.parent.db.query(sql, bindings)
tasks = set(row[0] for row in ret)
sql = 'select * from task where task_id in (%s)'
task_groups = {}
for f in self._bulk_select(sql,tasks):
ret = yield f
tasks = self._queue_get_task_from_ret(ret)
for task_id in tasks:
status = tasks[task_id]['status']
if status not in ("waiting","queued","processing","reset","resume"):
continue
if status not in task_groups:
task_groups[status] = {}
task_groups[status][task_id] = tasks[task_id]
except Exception:
logger.info('error getting active tasks', exc_info=True)
raise
else:
raise tornado.gen.Return(task_groups)
[docs] @tornado.gen.coroutine
def queue_get_grid_tasks(self, gridspec):
"""
Get a list of tasks (queued, processing) on this
site and plugin.
Args:
gridspec (str): The gridspec (None for master)
Returns:
list: [(task_id, grid_queue_id, submit_time, and submit_dir)]
"""
try:
sql = 'select task_id from search '
sql += 'where gridspec like ? '
sql += ' and task_status in ("queued","processing")'
bindings = ('%'+gridspec+'%',)
ret = yield self.parent.db.query(sql, bindings)
tasks = set(row[0] for row in ret)
sql = 'select * from task where task_id in (%s)'
task_ret = []
for f in self._bulk_select(sql,tasks):
ret = self._queue_get_task_from_ret((yield f))
for task_id in ret:
if ret[task_id]['status'] not in ("queued","processing"):
continue
task_ret.append({
'task_id': task_id,
'grid_queue_id': ret[task_id]['grid_queue_id'],
'submit_time': ret[task_id]['status_changed'],
'submit_dir': ret[task_id]['submit_dir'],
})
logger.info("***********queued tasks: %r", task_ret)
except Exception:
logger.info('error getting grid tasks', exc_info=True)
raise
else:
raise tornado.gen.Return(task_ret)
[docs] @tornado.gen.coroutine
def queue_set_task_status(self, task, status):
"""
Set the status of a task, except if it's complete.
Args:
task (str or iterable): task_id or iterable of task_ids
status (str): status to set
"""
if isinstance(task,String):
task = [task]
elif not isinstance(task,Iterable):
raise Exception('unknown type for task')
sql = 'select task_id from search where '
sql += 'task_id in (%s) and task_status != "complete"'
tids = []
for f in self._bulk_select(sql,task):
ret = yield f
tids.extend(row[0] for row in ret)
logger.debug("task_ids: %r",tids)
now = nowstr()
sql = 'update search set task_status = ? '
sql += ' where task_id in (%s)'
bindings = (status,)
sql2 = 'update task set prev_status = status, '
sql2 += ' status = ?, status_changed = ? where task_id in (%s)'
bindings2 = (status,now)
self._bulk_select(sql, tids, extra_bindings=bindings)
self._bulk_select(sql2, tids, extra_bindings=bindings2)
for tt in tids:
if self._is_master():
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
bindings3 = (master_update_history_id,'search',tt,now)
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
bindings4 = (master_update_history_id,'task',tt,now)
yield self.parent.db.query([sql3,sql3],[bindings3,bindings4])
else:
bindings = (status,tt)
bindings2 = (status,now,tt)
yield self._send_to_master(('search',tt,now,sql%'?',bindings))
yield self._send_to_master(('task',tt,now,sql2%'?',bindings2))
[docs] @tornado.gen.coroutine
def queue_reset_tasks(self, reset=[], fail=[]):
"""Reset and fail specified tasks"""
if reset:
yield self.queue_set_task_status(reset,'reset')
if fail:
yield self.queue_set_task_status(fail,'failed')
[docs] @tornado.gen.coroutine
def queue_get_task(self, task_id):
"""
Get tasks specified by task_id.
Args:
task (str or iterable): task_id or iterable of task_ids
Returns:
dict: tasks
"""
if isinstance(task_id,str):
# single task
sql = 'select * from task where task_id = ?'
bindings = (task_id,)
elif isinstance(task_id,Iterable):
# multiple tasks
b = ','.join(['?' for _ in range(len(task_id))])
sql = 'select * from task where task_id in ('+b+')'
bindings = tuple(task_id)
else:
raise Exception('task_id is not a str or iterable')
ret = yield self.parent.db.query(sql,bindings)
raise tornado.gen.Return(self._queue_get_task_from_ret(ret))
[docs] @tornado.gen.coroutine
def queue_get_task_by_grid_queue_id(self, grid_queue_id):
"""
Get tasks specified by grid_queue_id.
Args:
grid_queue_id (str): Id or list of ids
Returns:
dict: tasks
"""
if isinstance(grid_queue_id,str):
# single task
sql = 'select * from task where grid_queue_id = ?'
bindings = (grid_queue_id,)
elif isinstance(grid_queue_id,Iterable):
# multiple tasks
b = ','.join(['?' for _ in range(len(grid_queue_id))])
sql = 'select * from task where grid_queue_id in ('+b+')'
bindings = tuple(grid_queue_id)
else:
raise Exception('grid_queue_id is not a str or iterable')
ret = yield self.parent.db.query(sql, bindings)
raise tornado.gen.Return(self._queue_get_task_from_ret(ret))
[docs] @tornado.gen.coroutine
def queue_set_submit_dir(self, task, submit_dir):
"""
Set the submit_dir of a task.
Args:
task (str): task_id
submit_dir (str): Submit directory
"""
if not task:
raise Exception('No task')
sql = 'update task set submit_dir = ? '
sql += ' where task_id = ?'
bindings = (submit_dir,task)
yield self.parent.db.query(sql, bindings)
if self._is_master():
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
bindings3 = (master_update_history_id,'task',task,nowstr())
try:
yield self.parent.db.query(sql3, bindings3)
except Exception:
logger.info('error updating master_update_history',
exc_info=True)
else:
yield self._send_to_master(('task',task,nowstr(),sql,bindings))
[docs] @tornado.gen.coroutine
def queue_set_grid_queue_id(self, task, grid_queue_id):
"""
Set the grid_queue_id of a task.
Args:
task (str): task_id
grid_queue_id (str): Grid queue id
"""
if not task:
raise Exception('No task')
sql = 'update task set grid_queue_id = ? '
sql += ' where task_id = ?'
bindings = (grid_queue_id,task)
yield self.parent.db.query(sql, bindings)
if self._is_master():
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
bindings3 = (master_update_history_id,'task',task,nowstr())
try:
yield self.parent.db.query(sql3, bindings3)
except Exception:
logger.info('error updating master_update_history',
exc_info=True)
else:
yield self._send_to_master(('task',task,nowstr(),sql,bindings))
[docs] @tornado.gen.coroutine
def queue_buffer_jobs_tasks(self, gridspec=None, num_jobs=1000):
"""
Create a buffer of jobs and tasks ahead of queueing.
Args:
gridspec (str or iterable): Single or multiple gridspecs to match.
`None` for global queueing.
num_jobs (int): Number of jobs to buffer.
"""
now = nowstr()
# get possible datasets to buffer from
sql = 'select dataset_id, status, gridspec, jobs_submitted '
sql += 'from dataset where status = ? '
bindings = ('processing',)
if isinstance(gridspec, String):
sql += 'and gridspec like ?'
bindings += ('%'+gridspec+'%',)
gridspec = [gridspec]
elif isinstance(gridspec, Iterable):
if len(gridspec) < 1:
logger.info('in buffer_jobs_tasks, no gridspec %r', gridspec)
raise Exception('no gridspec defined')
sql += 'and ('+(' or '.join(['gridspec like ?' for _ in gridspec]))+')'
bindings += tuple(['%'+g+'%' for g in gridspec])
elif gridspec:
logger.info('in buffer_jobs_tasks, unknown gridspec %r', gridspec)
raise Exception('unknown gridspec type')
ret = yield self.parent.db.query(sql, bindings)
need_to_buffer = {}
for d, s, gs, js in ret:
logger.debug('gs=%r, gridspec=%r', gs, gridspec)
need_to_buffer[d] = {'gridspec':gs,'jobs':js,'job_index':0}
if not need_to_buffer:
# nothing to buffer
logger.info('nothing to buffer')
return
with (yield self.parent.db.acquire_lock('queue')):
# remove already buffered jobs
sql = 'select dataset_id,count(*) from job '
sql += ' where dataset_id in ('
sql += ','.join(['?' for _ in need_to_buffer])
sql += ') group by dataset_id'
bindings = tuple(need_to_buffer)
ret = yield self.parent.db.query(sql, bindings)
for d, num in ret:
need_to_buffer[d]['job_index'] = num
# get task_rels for buffering datasets
task_rel_ids = {}
task_rel_reqs = {}
try:
sql = 'select task_rel_id,dataset_id,task_index,name,depends,requirements from task_rel '
sql += 'where dataset_id in ('
sql += ','.join('?' for _ in need_to_buffer)+')'
bindings = tuple(need_to_buffer)
ret = yield self.parent.db.query(sql, bindings)
for tr_id, dataset_id, index, name, deps, reqs in ret:
if 'task_rels' not in need_to_buffer[dataset_id]:
need_to_buffer[dataset_id]['task_rels'] = {}
need_to_buffer[dataset_id]['task_rels'][tr_id] = (index,name,deps)
task_rel_ids[tr_id] = (dataset_id,index,deps)
task_rel_reqs[tr_id] = json_decode(reqs) if reqs else None
except Exception as e:
logger.info('error getting task_rels', exc_info=True)
raise
dataset_configs = {}
try:
sql = 'select dataset_id,config_data from config '
sql += ' where dataset_id in ('
sql += ','.join(['?' for _ in need_to_buffer])
sql += ')'
bindings = tuple(need_to_buffer)
ret = yield self.parent.db.query(sql, bindings)
for dataset_id, config in ret:
dataset_configs[dataset_id] = json_decode(config)
except Exception as e:
logger.info('error getting dataset configs', exc_info=True)
raise
# buffer for each dataset
# TODO: use priorities to do this better
for dataset in random.sample(list(need_to_buffer),len(need_to_buffer)):
try:
job_index = need_to_buffer[dataset]['job_index']
total_jobs = need_to_buffer[dataset]['jobs']
task_rels = need_to_buffer[dataset]['task_rels']
sorted_task_rels = sorted(task_rels,
key=lambda k: task_rels[k][0])
sorted_task_rel_values = sorted(task_rels.values(),
key=lambda v: v[0])
gs = need_to_buffer[dataset]['gridspec']
logger.debug('buffering dataset %s, job index %d',
dataset, job_index)
db_updates_sql = []
db_updates_bindings = []
while num_jobs > 0 and job_index < total_jobs:
# figure out the task dependencies for the tasks in
# the current job
depends = []
try:
for i, x in enumerate(sorted_task_rel_values):
index, name, deps = x
logger.debug('checking depends: %r',x)
task_deps = ([],[])
for d in deps.split(','):
if not d:
continue
if d in task_rels:
# linking within job
if i == sorted_task_rels.index(d):
raise Exception('cannot depend on ourself')
task_deps[0].append(task_rels[d][0])
continue
# linking to another dataset
if d not in task_rel_ids:
sql = 'select dataset_id,task_index,depends from task_rel '
sql += 'where task_rel_id = ?'
bindings = (d,)
ret = yield self.parent.db.query(sql, bindings)
for dataset_id, index, deps in ret:
task_rel_ids[d] = (dataset_id,index,deps)
if d not in task_rel_ids:
logger.error('cannot find task_rel_id %r',d)
raise Exception('dependency not found')
sql = 'select job_id, task_id from search where dataset_id = ?'
bindings = (task_rel_ids[d][0],)
ret = yield self.parent.db.query(sql, bindings)
jobs = {}
for j, t in ret:
if j not in jobs:
jobs[j] = [t]
else:
jobs[j].append(t)
sql = 'select job_id,job_index from job where '
sql += 'job_index = ? and job_id in ('
sql += ','.join('?' for _ in jobs) + ')'
bindings = (job_index,)+tuple(jobs)
ret = yield self.parent.db.query(sql, bindings)
if (not ret) or not ret[0]:
raise Exception('job_index not found')
tasks = sorted(jobs[ret[0][0]], key=lambda k: GlobalID.char2int(k))
task_deps[1].append(tasks[task_rel_ids[d][1]])
depends.append(task_deps)
except Exception:
logger.warning('missing dependency when buffering dataset')
raise
# make job
job_id = yield self.parent.db.increment_id('job')
sql = 'insert into job (job_id, dataset_id, status, job_index, '
sql += 'status_changed) values (?,?,?,?,?)'
bindings = (job_id, dataset, 'processing', job_index, now)
db_updates_sql.append(sql)
db_updates_bindings.append(bindings)
# make tasks
task_ids = []
for _ in task_rels:
x = yield self.parent.db.increment_id('task')
task_ids.append(x)
sql = 'insert into task (task_id,status,prev_status,'
sql += 'status_changed,submit_dir,grid_queue_id,'
sql += 'failures,evictions,walltime,walltime_err,walltime_err_n,'
sql += 'depends,requirements,task_rel_id) values '
sql += '(?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
sql2 = 'insert into search (task_id,job_id,dataset_id,gridspec,'
sql2 += 'name,task_status) values (?,?,?,?,?,?)'
for index, task_rel_id in enumerate(sorted_task_rels):
deps = [task_ids[i] for i in depends[index][0]]
deps.extend(depends[index][1])
if not task_rel_reqs[task_rel_id]:
reqs = ''
else:
# parse requirements
cfg = dict(dataset_configs[dataset])
cfg['options']['job'] = job_index
cfg['options']['iter'] = 0
cfg['options']['jobs_submitted'] = total_jobs
reqs = Config(config=cfg).parseObject(task_rel_reqs[task_rel_id], {})
# only store job-specific requirements if they
# are distinct from the value in task_rel
if reqs != task_rel_reqs[task_rel_id]:
reqs = json_encode(reqs)
else:
reqs = ''
# task table
bindings = (task_ids[index], 'idle', 'idle', now,
'', '', 0, 0, 0.0, 0.0, 0,
','.join(deps), reqs, task_rel_id)
db_updates_sql.append(sql)
db_updates_bindings.append(bindings)
# search table
name = task_rels[task_rel_id][1]
bindings2 = (task_ids[index], job_id, dataset, gs, name, 'idle')
db_updates_sql.append(sql2)
db_updates_bindings.append(bindings2)
job_index += 1
num_jobs -= 1
# write to database
yield self.parent.db.query(db_updates_sql, db_updates_bindings)
for i in range(len(db_updates_sql)):
sql = db_updates_sql[i]
bindings = db_updates_bindings[i]
if self._is_master():
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
bindings3 = (master_update_history_id,sql.split()[2],bindings[0],now)
try:
yield self.parent.db.query(sql3, bindings3)
except Exception:
logger.info('error updating master_update_history',
exc_info=True)
else:
yield self._send_to_master((sql.split()[2],bindings[0],now,sql,bindings))
except Exception:
logger.warning('error buffering dataset %s', dataset, exc_info=True)
continue
[docs] @tornado.gen.coroutine
def queue_get_queueing_datasets(self, gridspec=None):
"""
Get datasets that are currently in processing status on gridspec.
Args:
gridspec (str): The gridspec
Returns:
dict: {dataset_id: dataset info}
"""
bindings = []
sql = 'select * from dataset where status in ("processing","truncated") '
if gridspec:
sql += ' and gridspec like ?'
bindings.append('%'+gridspec+'%')
bindings = tuple(bindings)
ret = yield self.parent.db.query(sql, bindings)
datasets = {}
for row in ret:
d = self._list_to_dict('dataset',row)
datasets[d['dataset_id']] = d
raise tornado.gen.Return(datasets)
[docs] @tornado.gen.coroutine
def queue_get_queueing_tasks(self, dataset_prios, num=20,
resources=None, gridspec_assignment=None,
global_queueing=False):
"""
Get tasks to queue based on dataset priorities.
Args:
dataset_prios (dict): {dataset_id:priority} where sum(priorities)=1
num (int): (optional) number of tasks to queue
resources (dict): (optional) available resources on grid
gridspec_assignment (str): (optional) the grid to assign the tasks to
global_queueing (bool): Global queueing mode (default: False)
Returns:
dict: {task_id:task}
"""
if dataset_prios is None or not isinstance(dataset_prios,dict):
raise Exception('dataset_prios not a dict')
logger.info('queue() num=%r, global=%r, prios=%r, gridspec_assign=%r, resources=%r',
num, global_queueing, dataset_prios, gridspec_assignment, resources)
with (yield self.parent.db.acquire_lock('queue')):
# get all tasks for processing datasets so we can do dependency check
try:
sql = 'select task_rel_id, dataset_id, requirements from task_rel '
sql += 'where dataset_id in (%s)'
task_rel_ids = {}
for f in self._bulk_select(sql, dataset_prios):
for task_rel_id, dataset_id, reqs in (yield f):
if reqs:
reqs = json_decode(reqs)
task_rel_ids[task_rel_id] = (dataset_id, reqs)
if not task_rel_ids:
raise tornado.gen.Return({})
sql = 'select task_id, status, depends, requirements, task_rel_id '
sql += 'from task where task_rel_id in (%s)'
tasks = {}
datasets = {k:{} for k in dataset_prios}
for f in self._bulk_select(sql, task_rel_ids):
for task_id, status, depends, reqs, task_rel_id in (yield f):
dataset, task_rel_reqs = task_rel_ids[task_rel_id]
tasks[task_id] = {'dataset':dataset, 'status':status}
if (status == 'idle' or
((not global_queueing) and status == 'waiting')):
for dep in depends.split(','):
if dep in tasks and tasks[dep]['status'] != 'complete':
break
else:
if reqs:
reqs = json_decode(reqs)
else:
reqs = task_rel_reqs
datasets[dataset][task_id] = (depends,reqs,task_rel_id)
except Exception:
logger.info('error getting processing tasks', exc_info=True)
raise
# get actual tasks
task_prio = {}
for dataset in dataset_prios:
limit = num
dataset_task_prio = []
logger.info('queue() dataset %s, limit is %d, available is %d',
dataset, limit, len(datasets[dataset]))
def sort_key(k):
if datasets[dataset][k][0]:
return datasets[dataset][k][-1]
else:
return ''
for task_id in sorted(datasets[dataset], key=sort_key, reverse=True):
depends = datasets[dataset][task_id][0]
reqs = datasets[dataset][task_id][1]
logger.info('now examining %r, with %r %r',task_id,depends,reqs)
satisfied = True
if depends == 'unknown': # depends not yet computed
satisfied = False
logger.info('task %r has unknown depends', task_id)
elif depends:
for dep in depends.split(','):
if dep not in tasks:
logger.info('look up depend status: %r',dep)
sql = 'select task_status from search where task_id = ?'
bindings = (dep,)
try:
ret = yield self.parent.db.query(sql, bindings)
except Exception:
logger.info('error getting depend task status for %s',
dep, exc_info=True)
satisfied = False
break
if (not ret) or len(ret[0]) < 0:
logger.info('bad depend task status result: %r',ret)
satisfied = False
break
elif ret[0][0] != 'complete':
logger.info('depends not yet satisfied: %r', task_id)
satisfied = False
break
elif tasks[dep]['status'] != 'complete':
logger.info('depends not yet satisfied: %r', task_id)
satisfied = False
break
if satisfied and reqs and resources:
# now match based on resources
try:
for r in reqs:
if r not in resources:
logger.info('reqs not satisfied: %r', task_id)
satisfied = False
break
except Exception:
logger.info('failed to check resources',
exc_info=True)
if satisfied:
# task can be queued now
dataset_task_prio.append(task_id)
limit -= 1
if limit <= 0:
break
task_prio[dataset] = dataset_task_prio
logger.info('queue() %d tasks can queue',
sum(len(task_prio[t]) for t in task_prio))
if not task_prio:
raise tornado.gen.Return({})
# grab tasks from task_prio in order of dataset priority
dataset_ids = set()
tasks = set()
num_to_queue = num
while num_to_queue > 0 and task_prio:
for dataset in sorted(task_prio, key=lambda k:dataset_prios[k], reverse=True):
if not task_prio[dataset]:
del task_prio[dataset]
continue
dataset_ids.add(dataset)
tasks.add(task_prio[dataset].pop())
num_to_queue -= 1
sql = 'select dataset_id, jobs_submitted, debug from dataset '
sql += ' where dataset_id in (%s)'
try:
dataset_debug = {}
for f in self._bulk_select(sql, dataset_ids):
for d_id,js,debug in (yield f):
dataset_debug[d_id] = (js,bool(debug))
except Exception:
logger.debug('error getting dataset debug', exc_info=True)
raise
sql = 'select * from search where task_id in (%s)'
try:
ret = []
for f in self._bulk_select(sql, tasks):
ret2 = yield f
ret.extend(ret2)
except Exception:
logger.debug('error queueing tasks', exc_info=True)
raise
tasks = {}
job_ids = {}
for row in ret:
tmp = self._list_to_dict('search',row)
if tmp['dataset_id'] not in dataset_debug:
logger.warning('found a bad dataset: %r', tmp['dataset_id'])
continue
tmp['jobs_submitted'] = dataset_debug[tmp['dataset_id']][0]
tmp['debug'] = dataset_debug[tmp['dataset_id']][1]
tmp['reqs'] = datasets[tmp['dataset_id']][tmp['task_id']][1]
tasks[tmp['task_id']] = tmp
if tmp['job_id'] not in job_ids:
job_ids[tmp['job_id']] = [tmp['task_id']]
else:
job_ids[tmp['job_id']].append(tmp['task_id'])
if job_ids:
# get the job index for each task
sql = 'select job_id,job_index from job where job_id in ('
sql += ','.join('?' for _ in job_ids)+')'
bindings = tuple(job_ids)
ret = yield self.parent.db.query(sql, bindings)
if (not ret) or not ret[0]:
logger.info('sql %r',sql)
logger.info('bindings %r',bindings)
logger.info('ret %r',ret)
logger.warning('failed to find job with known job_id %r for task_id %r',
job_ids, list(tasks.keys()))
raise Exception('no job_index')
for job_id,job_index in ret:
for task_id in job_ids[job_id]:
tasks[task_id]['job'] = job_index
if tasks:
# update status
new_status = 'waiting' if global_queueing else 'queued'
now = nowstr()
sql = 'update search set task_status = ? '
bindings = [new_status]
if gridspec_assignment:
sql += ', gridspec = ? '
bindings.append(gridspec_assignment)
sql += 'where task_id in ('
sql += ','.join('?' for _ in tasks)
sql += ')'
bindings.extend(tasks)
bindings = tuple(bindings)
sql2 = 'update task set prev_status = status, '
sql2 += 'status = ?, '
sql2 += 'status_changed = ? '
bindings2 = [new_status, now]
sql2 += 'where task_id in ('
sql2 += ','.join('?' for _ in tasks)
sql2 += ')'
bindings2.extend(tasks)
bindings2 = tuple(bindings2)
yield self.parent.db.query([sql,sql2], [bindings,bindings2])
if self._is_master():
for t in tasks:
sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
bindings3 = (master_update_history_id,'search',t,now)
master_update_history_id = yield self.parent.db.increment_id('master_update_history')
bindings4 = (master_update_history_id,'task',t,now)
try:
yield self.parent.db.query([sql3,sql3], [bindings3,bindings4])
except Exception:
logger.info('error updating master_update_history',
exc_info=True)
else:
sql = 'update search set task_status=? '
if gridspec_assignment:
sql += ', gridspec = ? '
sql += 'where task_id = ?'
sql2 = 'update task set prev_status = status, '
sql2 += 'status = ?, '
sql2 += 'status_changed = ? '
sql2 += 'where task_id = ?'
for t in tasks:
if gridspec_assignment:
bindings = (new_status,gridspec_assignment,t)
else:
bindings = (new_status,t)
bindings2 = (new_status,now,t)
yield self._send_to_master(('search',t,now,sql,bindings))
yield self._send_to_master(('task',t,now,sql2,bindings2))
for t in tasks:
tasks[t]['task_status'] = new_status
if gridspec_assignment:
tasks[t]['gridspec'] = gridspec_assignment
raise tornado.gen.Return(tasks)
[docs] @tornado.gen.coroutine
def queue_new_pilot_ids(self, num):
"""
Get new ids for pilots.
A pre-cursor to :func:`queue_add_pilot`.
Args:
num (int): The number of ids to get.
Returns:
list: A list of pilot ids.
"""
try:
ret = []
for _ in range(num):
x = yield self.parent.db.increment_id('pilot')
ret.append(x)
except Exception:
logger.info('new pilot_ids error', exc_info=True)
raise
else:
raise tornado.gen.Return(ret)
[docs] @tornado.gen.coroutine
def queue_add_pilot(self, pilot):
"""
Add a pilot to the DB
Args:
pilot: The pilot dict.
"""
try:
now = nowstr()
s = 'insert into pilot (pilot_id, grid_queue_id, submit_time, '
s += 'submit_dir, tasks, requirements, avail_cpu, avail_gpu, '
s += 'avail_memory, avail_disk, avail_time, claim_cpu, claim_gpu, '
s += 'claim_memory, claim_disk, claim_time) values (?,?,?,?,?,?,'
s += '0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0)'
sql = []
bindings = []
for i,pilot_id in enumerate(pilot['pilot_ids']):
grid_queue_id = str(pilot['grid_queue_id'])+'.'+str(i)
sql.append(s)
reqs = json_encode(pilot['reqs'])
bindings.append((pilot_id, grid_queue_id, now, pilot['submit_dir'],'',reqs))
yield self.parent.db.query(sql, bindings)
except Exception as e:
logger.debug('error adding pilot', exc_info=True)
raise
[docs] @tornado.gen.coroutine
def queue_get_pilots(self, active=None):
"""
Get pilot information.
When `active=True`, get only pilots with tasks.
When `active=False`, get only idle pilots.
By default, get all pilots
Args:
active (bool): Get only pilots with active tasks (default: None).
Returns:
list: [pilot dict]
"""
sql = 'select * from pilot'
if active is not None:
sql += ' where tasks '+('!=' if active else '=')+' "" '
bindings = tuple()
ret = yield self.parent.db.query(sql, bindings)
pilots = []
for row in ret:
tmp = self._list_to_dict('pilot',row)
tmp['submit_time'] = str2datetime(tmp['submit_time'])
tmp['tasks'] = tmp['tasks'].split(',')
if tmp['requirements']:
tmp['requirements'] = json_decode(tmp['requirements'])
pilots.append(tmp)
raise tornado.gen.Return(pilots)
[docs] @tornado.gen.coroutine
def queue_del_pilots(self, pilots):
"""
Remove pilots from the DB.
Args:
pilots (iterable): List of pilot_ids
"""
now = nowstr()
if not isinstance(pilots,list):
pilots = list(pilots)
task_ids = set()
try:
# work in batches of 900
while pilots:
p = pilots[:900]
pilots = pilots[900:]
sql = 'select tasks from pilot where pilot_id in ('
sql += ','.join('?' for _ in p)+')'
bindings = tuple(p)
ret = yield self.parent.db.query(sql, bindings)
for row in ret:
tasks = row[0].strip()
if tasks:
task_ids.update(x for x in row[0].split(',') if x)
sql = 'delete from pilot where pilot_id in ('
sql += ','.join('?' for _ in p)+')'
bindings = tuple(p)
yield self.parent.db.query(sql, bindings)
except Exception:
logger.debug('error deleting pilots', exc_info=True)
raise
if task_ids:
with (yield self.parent.db.acquire_lock('queue')):
sql = 'select task_id from search '
sql += 'where task_status = "processing" and task_id in (%s)'
reset_tasks = set()
for f in self._bulk_select(sql, task_ids):
reset_tasks.update([row[0] for row in (yield f)])
yield self.queue_set_task_status(reset_tasks,'reset')
[docs] @tornado.gen.coroutine
def queue_get_cfg_for_task(self, task_id):
"""
Get a config for a task.
Args:
task_id (str): A task id
Returns:
str: config as a json blob
"""
if not task_id:
raise Exception('bad task_id')
sql = 'select task_id,dataset_id from search where task_id = ?'
bindings = (task_id,)
ret = yield self.parent.db.query(sql, bindings)
if not ret or len(ret) < 1 or len(ret[0]) < 2:
raise Exception('get_cfg_for_task did not return a dataset_id')
else:
ret = yield self.queue_get_cfg_for_dataset(ret[0][1])
raise tornado.gen.Return(ret)
[docs] @tornado.gen.coroutine
def queue_get_cfg_for_dataset(self, dataset_id):
"""
Get a config for a dataset.
Args:
dataset_id (str): A dataset id
Returns:
str: config as a json blob
"""
if not dataset_id:
raise Exception('bad dataset_id')
sql = 'select dataset_id,config_data from config where dataset_id = ?'
bindings = (dataset_id,)
ret = yield self.parent.db.query(sql, bindings)
if not ret or len(ret) < 1 or len(ret[0]) < 2:
raise Exception('get_cfg_for_dataset did not return a config')
else:
logger.debug('config for dataset: %r',ret)
data = None
for dataset_id,config_data in ret:
data = config_data
raise tornado.gen.Return(data)
[docs] @tornado.gen.coroutine
def queue_add_task_lookup(self, tasks):
"""
Add the tasks currently available for lookup by pilots.
Args:
tasks (dict): dict of {task_id: resources}
"""
now = time.time()
keys = next(iter(tasks.values()))
sql = 'replace into task_lookup (task_id,queue,insert_time,'
sql += ','.join('req_'+k for k in keys)
sql += ') values (?,?,?,'
sql += ','.join('?' for k in keys)+')'
bindings = []
for t in tasks:
reqs = tasks[t]
queue = task_queue.get_queue(reqs)
bindings.append((t,queue,now)+tuple(reqs[k] for k in keys))
yield self.parent.db.query([sql for _ in bindings], bindings)
[docs] @tornado.gen.coroutine
def queue_get_task_lookup(self):
"""
Get the resources for all tasks in the lookup.
Returns:
dict: {task_id: resources}
"""
with (yield self.parent.db.acquire_lock('task_lookup')):
# get tasks from lookup
sql = 'select * from task_lookup'
bindings = tuple()
ret = yield self.parent.db.query(sql, bindings)
task_ids = {}
for row in ret:
row = self._list_to_dict('task_lookup',row)
tid = row.pop('task_id')
task_ids[tid] = {k.replace('req_',''):row[k] for k in row if k.startswith('req_')}
# check that these are still valid
sql = 'select task_id from search where task_id in (%s) and task_status = ?'
bindings = ('queued',)
ret = {}
for f in self._bulk_select(sql, task_ids, extra_bindings=bindings):
for row in (yield f):
tid = row[0]
ret[tid] = task_ids[tid]
invalid_tasks = set(task_ids).difference(ret)
if invalid_tasks:
logger.info('tasks not valid, remove from task_lookup: %s',
invalid_tasks)
sql = 'delete from task_lookup where task_id in (%s)'
for f in self._bulk_select(sql, invalid_tasks):
yield f
reset_tasks = set(ret).difference(task_ids)
if reset_tasks:
logger.info('tasks queued, but not in task_lookup: %s',
reset_tasks)
yield self.parent.service['queue_set_task_status'](reset_tasks,'waiting')
raise tornado.gen.Return(ret)