"""
Cron database methods
"""
import os
import logging
from datetime import datetime, timedelta
from functools import partial
from collections import defaultdict, OrderedDict, Counter
import tornado.gen
from iceprod.core import functions
from iceprod.core.gridftp import GridFTP
from iceprod.core.jsonUtil import json_encode,json_decode,json_compressor
from iceprod.server.dbmethods import _Methods_Base,datetime2str,str2datetime, nowstr
from iceprod.server import GlobalID
from iceprod.server.master_communication import send_master
logger = logging.getLogger('dbmethods.cron')
class cron(_Methods_Base):
    """
    The scheduled (cron) DB methods.

    Takes a handle to a subclass of iceprod.server.modules.db.DBAPI
    as an argument.
    """

    @tornado.gen.coroutine
    def cron_dataset_completion(self):
        """
        Check for newly completed datasets and mark them as such.

        Every dataset in the ``processing`` state is examined.  Once all
        of its submitted tasks are present in the ``search`` table and
        none of them is in an active state, the dataset status is set to
        ``complete``, ``errors`` or ``suspended``.  On the master each
        change is recorded in ``master_update_history``; otherwise the
        per-dataset update statement is handed to ``_send_to_master``.
        """
        active_states = {'waiting', 'queued', 'processing', 'resume', 'reset'}
        with (yield self.parent.db.acquire_lock('dataset')):
            sql = 'select dataset_id,jobs_submitted,tasks_submitted '
            sql += ' from dataset where status = ? '
            bindings = ('processing',)
            ret = yield self.parent.db.query(sql, bindings)
            datasets = OrderedDict()
            for dataset_id, njobs, ntasks in ret:
                datasets[dataset_id] = {
                    'jobs_submitted': njobs,
                    'tasks_submitted': ntasks,
                    'task_status': set(),
                    'ntasks': 0,
                }
            if not datasets:
                return

            # collect the set of task statuses and task count per dataset
            sql = 'select dataset_id,task_status from search '
            sql += ' where dataset_id in ('
            sql += ','.join('?' for _ in datasets)
            sql += ')'
            bindings = tuple(datasets.keys())
            ret = yield self.parent.db.query(sql, bindings)
            for dataset_id, task_status in ret:
                datasets[dataset_id]['ntasks'] += 1
                datasets[dataset_id]['task_status'].add(task_status)

            dataset_status = {}
            for dataset_id, info in datasets.items():
                if info['ntasks'] < info['tasks_submitted']:
                    continue  # not all tasks accounted for
                task_statuses = info['task_status']
                if task_statuses & active_states:
                    continue  # something can still change
                logger.info('dataset %s task statues %r', dataset_id,
                            task_statuses)
                if not task_statuses - {'complete'}:
                    dataset_status[dataset_id] = 'complete'
                elif not task_statuses - {'complete', 'failed'}:
                    dataset_status[dataset_id] = 'errors'
                elif not task_statuses - {'complete', 'failed', 'suspended'}:
                    dataset_status[dataset_id] = 'suspended'
            if not dataset_status:
                return

            # group the datasets by their new status
            now = nowstr()
            statuses = defaultdict(set)
            for dataset_id, status in dataset_status.items():
                logger.info('dataset %s marked as %s', dataset_id, status)
                statuses[status].add(dataset_id)

            multi_sql = []
            multi_bindings = []
            master_sql = []
            master_bindings = []
            for status, dataset_ids in statuses.items():
                base_sql = 'update dataset set status = ?'
                base_bindings = (status,)
                if status == 'complete':
                    base_sql += ', end_date = ? '
                    base_bindings += (now,)
                # one bulk statement per status for the local database
                sql = base_sql + ' where dataset_id in ('
                sql += ','.join('?' for _ in dataset_ids)
                sql += ')'
                multi_sql.append(sql)
                multi_bindings.append(base_bindings + tuple(dataset_ids))
                # one statement per dataset for the master bookkeeping
                sql = base_sql + ' where dataset_id = ? '
                for dataset_id in dataset_ids:
                    master_sql.append(sql)
                    master_bindings.append(base_bindings + (dataset_id,))
            yield self.parent.db.query(multi_sql, multi_bindings)

            if self._is_master():
                sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
                for bindings in master_bindings:
                    # every history row needs its own id; reusing a single
                    # id for all rows makes every insert after the first
                    # collide
                    master_update_history_id = yield self.parent.db.increment_id('master_update_history')
                    bindings3 = (master_update_history_id, 'dataset',
                                 bindings[-1], now)
                    try:
                        yield self.parent.db.query(sql3, bindings3)
                    except Exception:
                        logger.info('error updating master_update_history',
                                    exc_info=True)
            else:
                for sql, bindings in zip(master_sql, master_bindings):
                    yield self._send_to_master(('dataset', bindings[-1], now,
                                                sql, bindings))

            # TODO: consolidate dataset statistics

    @tornado.gen.coroutine
    def cron_job_completion(self, delete_jobs=False):
        """
        Check for job status changes.

        If this is the master, mark jobs complete, suspended, or failed
        as necessary. Completed jobs also delete the job temp space.

        If this is not the master, and if all tasks in a job are not in
        an active state, then delete the job and tasks.

        Args:
            delete_jobs (bool): when True and this is not the master,
                jobs that are no longer active are removed from the
                local database together with their tasks.
        """
        sql = 'select dataset_id,status,jobs_submitted,tasks_submitted '
        sql += ' from dataset '
        ret = yield self.parent.db.query(sql, tuple())
        datasets = {}
        for dataset_id, status, njobs, ntasks in ret:
            try:
                datasets[dataset_id] = {
                    'status': status,
                    'tasks': int(ntasks)//int(njobs),
                }
            except (ValueError, TypeError, ZeroDivisionError):
                # NULL or zero job counts must not abort the whole cron
                logger.info('something strange with dataset %s', dataset_id,
                            exc_info=True)
        if not datasets:
            return

        # filter by jobs that need updating
        sql = 'select job_id from job where status = "processing" '
        ret = yield self.parent.db.query(sql, tuple())
        job_ids = [row[0] for row in ret]

        # get the jobs by status and number of tasks
        # jobs[job_id] = [Counter of task statuses, dataset_id]
        sql = 'select dataset_id, job_id, task_status from search '
        sql += ' where job_id in (%s)'
        jobs = defaultdict(lambda: [Counter(), None])
        for f in self._bulk_select(sql, job_ids):
            for dataset_id, job_id, task_status in (yield f):
                jobs[job_id][0][task_status] += 1
                jobs[job_id][1] = dataset_id

        is_master = self._is_master()
        active_states = {'waiting', 'queued', 'processing', 'resume', 'reset'}
        complete_jobs = []
        errors_jobs = []
        suspended_jobs = []
        clean_jobs = []
        for job_id in job_ids:
            if job_id not in jobs:
                logger.error('unknown job id: %r', job_id)
                if is_master:
                    suspended_jobs.append(job_id)
                else:
                    clean_jobs.append(job_id)
                continue

            status_counts, dataset_id = jobs[job_id]
            if dataset_id not in datasets:
                # dataset row missing or skipped above as malformed
                logger.error('unknown dataset %r for job %r', dataset_id,
                             job_id)
                continue
            have_all_tasks = (sum(status_counts.values())
                              >= datasets[dataset_id]['tasks'])
            statuses = set(status_counts)

            if (datasets[dataset_id]['status'] in ('suspended', 'errors')
                    and not statuses & {'processing'}):
                # dataset is halted: anything not running can be settled
                if not is_master:
                    logger.info('job %r can be removed', job_id)
                    clean_jobs.append(job_id)
                elif not have_all_tasks:
                    logger.error('not all tasks in job %r buffered', job_id)
                elif not statuses - {'complete'}:
                    complete_jobs.append(job_id)
                elif statuses & {'failed'}:
                    errors_jobs.append(job_id)
                else:
                    suspended_jobs.append(job_id)
            elif not statuses & active_states:
                if not is_master:
                    logger.info('job %r can be removed', job_id)
                    clean_jobs.append(job_id)
                elif not have_all_tasks:
                    logger.error('not all tasks in job %r buffered', job_id)
                elif not statuses - {'complete'}:
                    complete_jobs.append(job_id)
                elif not statuses - {'complete', 'failed'}:
                    errors_jobs.append(job_id)
                elif not statuses - {'complete', 'failed', 'suspended'}:
                    suspended_jobs.append(job_id)

        if delete_jobs and (not is_master) and clean_jobs:
            # we are not the master, and just cleaning these jobs
            with (yield self.parent.db.acquire_lock('queue')):
                sql = 'select task_id from search where job_id in (%s)'
                task_ids = set()
                for f in self._bulk_select(sql, clean_jobs):
                    task_ids.update(row[0] for row in (yield f))
                if task_ids:
                    for table in ('search', 'task', 'task_stat', 'task_log',
                                  'task_lookup'):
                        sql = 'delete from ' + table + ' where task_id in (%s)'
                        for f in self._bulk_select(sql, task_ids):
                            yield f
                for table in ('job', 'job_stat'):
                    sql = 'delete from ' + table + ' where job_id in (%s)'
                    for f in self._bulk_select(sql, clean_jobs):
                        yield f
        else:
            # we are the master, and are updating job statuses
            # (the three lists are only ever filled on the master)
            now = nowstr()
            sql3 = 'insert into master_update_history (master_update_history_id,table_name,update_index,timestamp) values (?,?,?,?)'
            updates = (
                ('errors', errors_jobs),
                ('suspended', suspended_jobs),
                ('complete', complete_jobs),
            )
            for status, status_jobs in updates:
                sql = 'update job set status = "%s", status_changed = ? ' % status
                sql += ' where job_id = ?'
                for job_id in status_jobs:
                    # update job status
                    logger.info('job %s marked as %s', job_id, status)
                    yield self.parent.db.query(sql, (now, job_id))
                    master_update_history_id = yield self.parent.db.increment_id('master_update_history')
                    bindings3 = (master_update_history_id, 'job', job_id, now)
                    try:
                        yield self.parent.db.query(sql3, bindings3)
                    except Exception:
                        logger.info('error updating master_update_history',
                                    exc_info=True)

                    if status != 'complete':
                        continue

                    # TODO: collate task stats

                    # clean dagtemp
                    if 'site_temp' in self.parent.cfg['queue']:
                        temp_dir = self.parent.cfg['queue']['site_temp']
                        dataset_id = jobs[job_id][1]
                        dataset = GlobalID.localID_ret(dataset_id, type='int')
                        sql2 = 'select job_index from job where job_id = ?'
                        try:
                            ret = yield self.parent.db.query(sql2, (job_id,))
                            job = ret[0][0]
                            dagtemp = os.path.join(temp_dir, str(dataset),
                                                   str(job))
                            logger.info('cleaning site_temp %r', dagtemp)
                            yield self._executor_wrapper(
                                partial(functions.delete, dagtemp))
                        except Exception:
                            # best effort: a failed cleanup must not stop
                            # the remaining jobs from being updated
                            logger.warning('failed to clean site_temp',
                                           exc_info=True)

    @tornado.gen.coroutine
    def cron_clean_completed_jobs(self):
        """
        Check old files in the dagtemp from completed jobs.

        Selects jobs that are ``complete``, or that have been
        ``suspended``/``errors`` for more than 30 days, and deletes the
        matching ``<site_temp>/<dataset>/<job_index>`` directories that
        are still listed under ``site_temp``.  Does nothing when
        ``site_temp`` is not configured.
        """
        if 'site_temp' not in self.parent.cfg['queue']:
            return

        sql = 'select job_id,job_index from job where status = "complete"'
        sql += ' or (status in ("suspended","errors") and status_changed < ?)'
        timelimit = datetime.utcnow() - timedelta(days=30)
        bindings = (timelimit.isoformat(),)
        ret = yield self.parent.db.query(sql, bindings)
        jobs = {job_id: str(index) for job_id, index in ret}

        # map dataset number -> set of job indexes eligible for cleanup
        sql = 'select dataset_id, job_id from search '
        sql += ' where task_status != "idle"'
        ret = yield self.parent.db.query(sql, tuple())
        datasets = defaultdict(set)
        for dataset_id, job_id in ret:
            if job_id in jobs:
                dataset = str(GlobalID.localID_ret(dataset_id, type='int'))
                datasets[dataset].add(jobs[job_id])

        # get all the job_ids currently in tmp
        temp_dir = self.parent.cfg['queue']['site_temp']
        dataset_dirs = yield self._executor_wrapper(
            partial(GridFTP.list, temp_dir))
        for d in dataset_dirs:
            job_dirs = yield self._executor_wrapper(
                partial(GridFTP.list, os.path.join(temp_dir, d)))
            for j in job_dirs:
                if d in datasets and j in datasets[d]:
                    try:
                        dagtemp = os.path.join(temp_dir, d, j)
                        logger.info('cleaning site_temp %r', dagtemp)
                        yield self._executor_wrapper(
                            partial(functions.delete, dagtemp))
                    except Exception:
                        logger.warning('failed to clean site_temp',
                                       exc_info=True)

    def cron_remove_old_passkeys(self):
        """
        Delete passkeys whose ``expire`` value is earlier than now.

        Returns:
            whatever ``self.parent.db.query`` returns for the delete
            statement (this method is not itself a coroutine).
        """
        now = nowstr()
        sql = 'delete from passkey where expire < ?'
        bindings = (now,)
        return self.parent.db.query(sql, bindings)

    @tornado.gen.coroutine
    def cron_generate_web_graphs(self):
        """
        Store two data points in the ``graph`` table.

        ``active_tasks`` holds the number of tasks per status (ignoring
        ``idle``, ``waiting`` and ``complete``); ``completed_tasks``
        holds the number of tasks completed within the last minute.
        """
        insert_sql = 'insert into graph (graph_id, name, value, timestamp) '
        insert_sql += 'values (?,?,?,?)'

        # tasks per non-terminal, non-idle status
        sql = 'select task_status, count(*) from search '
        sql += 'where task_status not in (?,?,?) group by task_status'
        bindings = ('idle', 'waiting', 'complete')
        ret = yield self.parent.db.query(sql, bindings)
        now = nowstr()
        results = {status: count for status, count in ret}
        graph_id = yield self.parent.db.increment_id('graph')
        bindings = (graph_id, 'active_tasks', json_encode(results), now)
        yield self.parent.db.query(insert_sql, bindings)

        # completions during the last minute
        time_interval = datetime2str(datetime.utcnow() - timedelta(minutes=1))
        sql = 'select count(*) from task where status = ? and '
        sql += 'status_changed > ?'
        bindings = ('complete', time_interval)
        ret = yield self.parent.db.query(sql, bindings)
        now = nowstr()
        results = {'completions': ret[0][0] if ret and ret[0] else 0}
        graph_id = yield self.parent.db.increment_id('graph')
        bindings = (graph_id, 'completed_tasks', json_encode(results), now)
        yield self.parent.db.query(insert_sql, bindings)

    @tornado.gen.coroutine
    def cron_pilot_monitoring(self):
        """
        Report the summed pilot resources and the pilot count to statsd.

        Missing (NULL) or negative sums are reported as 0.
        """
        sql = 'select sum(avail_cpu), sum(avail_gpu), sum(avail_memory), '
        sql += 'sum(avail_disk), sum(avail_time), sum(claim_cpu), '
        sql += 'sum(claim_gpu), sum(claim_memory), sum(claim_disk), '
        sql += 'sum(claim_time), count(*) from pilot'
        ret = yield self.parent.db.query(sql, tuple())

        # gauge names, in the same order as the selected columns
        gauges = (
            'pilot_resources.available.cpu',
            'pilot_resources.available.gpu',
            'pilot_resources.available.memory',
            'pilot_resources.available.disk',
            'pilot_resources.available.time',
            'pilot_resources.claimed.cpu',
            'pilot_resources.claimed.gpu',
            'pilot_resources.claimed.memory',
            'pilot_resources.claimed.disk',
            'pilot_resources.claimed.time',
            'pilot_count',
        )
        for row in ret:
            for name, value in zip(gauges, row):
                self.parent.statsd.gauge(
                    name, value if value and value > 0 else 0)
            break  # only the first row is reported

    @tornado.gen.coroutine
    def cron_dataset_update(self):
        """Update the dataset table on clients"""
        if 'master_updater' not in self.parent.modules:
            return
        ret = yield send_master(self.parent.cfg, 'master_get_tables',
                                tablenames=['dataset'])
        if ret:
            yield self.parent.service['misc_update_tables'](ret)

    @tornado.gen.coroutine
    def cron_suspend_overusage_tasks(self):
        """
        Suspend very high resource usage tasks.

        Tasks in ``task_lookup`` requesting ``req_memory > 50`` or
        ``req_time > 24`` are removed from ``task_lookup``, set to
        ``suspended``, and given a ``stderr`` task_log entry with the
        reason.  A task exceeding both limits only gets the memory
        message.
        """
        with (yield self.parent.db.acquire_lock('task_lookup')):
            sql = 'select task_id, req_memory, req_time '
            sql += ' from task_lookup where req_memory > 50 or req_time > 24'
            ret = yield self.parent.db.query(sql, tuple())
            task_ids_all = []
            task_ids_mem = []
            task_ids_time = []
            for task_id, req_memory, req_time in ret:
                task_ids_all.append(task_id)
                if req_memory > 50:
                    task_ids_mem.append(task_id)
                elif req_time > 24:
                    task_ids_time.append(task_id)
            if task_ids_all:
                sql = 'delete from task_lookup where task_id in (%s)'
                for f in self._bulk_select(sql, task_ids_all):
                    yield f
        # release lock

        if not task_ids_all:
            return
        yield self.parent.service['queue_set_task_status'](task_ids_all,
                                                           'suspended')

        # The log writes are done inline in this coroutine.  A nested
        # helper containing ``yield`` that is merely called (not yielded
        # as a coroutine) only creates a generator object and never runs.
        now = nowstr()
        sql = 'insert into task_log (task_log_id,task_id,name,data) '
        sql += ' values (?,?,?,?)'
        reasons = (
            (task_ids_mem, b'task held: requested >50GB memory'),
            (task_ids_time, b'task held: requested >24hr time'),
        )
        for task_ids, message in reasons:
            if not task_ids:
                continue
            data = json_compressor.compress(message)
            for task_id in task_ids:
                task_log_id = yield self.parent.db.increment_id('task_log')
                bindings = (task_log_id, task_id, 'stderr', data)
                yield self.parent.db.query(sql, bindings)
                yield self._send_to_master(('task_log', task_log_id, now,
                                            sql, bindings))

    @tornado.gen.coroutine
    def cron_check_active_pilots_tasks(self):
        """
        Reset processing tasks that are not listed as running by
        an active pilot.

        Each reset task also gets a ``stderr`` task_log entry, which is
        passed on through ``_send_to_master``.
        """
        sql = 'select task_id from search where task_status="processing"'
        ret = yield self.parent.db.query(sql, tuple())
        task_ids = {row[0] for row in ret}

        # drop every task that an active pilot claims to be running
        pilots = yield self.parent.service['queue_get_pilots'](active=True)
        for pilot in pilots:
            if pilot['tasks']:
                task_ids -= set(pilot['tasks'])
        if not task_ids:
            return

        yield self.parent.service['queue_set_task_status'](task_ids, 'reset')
        now = nowstr()
        sql = 'insert into task_log (task_log_id,task_id,name,data) '
        sql += ' values (?,?,?,?)'
        data = json_compressor.compress(
            b'task reset: not running in an active pilot')
        for task_id in task_ids:
            task_log_id = yield self.parent.db.increment_id('task_log')
            bindings = (task_log_id, task_id, 'stderr', data)
            yield self.parent.db.query(sql, bindings)
            yield self._send_to_master(('task_log', task_log_id, now,
                                        sql, bindings))

    @tornado.gen.coroutine
    def cron_dataset_status_monitoring(self):
        """
        Monitor all datasets for job/task status summary.

        Sends one statsd gauge per (dataset, job status) and per
        (dataset, task name, task status).  Datasets that are not in
        the ``processing`` state are reported with a count of 0.
        """
        sql = 'select dataset_id from dataset where status = "processing"'
        ret = yield self.parent.db.query(sql, tuple())
        processing_datasets = {row[0] for row in ret}

        sql = 'select dataset_id, status, count(*) from job group by dataset_id,status'
        ret = yield self.parent.db.query(sql, tuple())
        for dataset_id, status, num in ret:
            dataset_num = GlobalID.localID_ret(dataset_id, type='int')
            if dataset_id not in processing_datasets:
                num = 0
            self.parent.statsd.gauge(
                'datasets.{}.jobs.{}'.format(dataset_num, status), num)

        sql = 'select dataset_id, name, task_status, count(*) from search group by dataset_id,name,task_status'
        ret = yield self.parent.db.query(sql, tuple())
        for dataset_id, name, status, num in ret:
            dataset_num = GlobalID.localID_ret(dataset_id, type='int')
            if dataset_id not in processing_datasets:
                num = 0
            self.parent.statsd.gauge(
                'datasets.{}.tasks.{}.{}'.format(dataset_num, name, status),
                num)

    @tornado.gen.coroutine
    def cron_task_stat_monitoring(self, limit=1000):
        """
        Monitor task statistics in ES.

        Finds ``task_stat`` rows that are not yet present in
        elasticsearch and uploads a flattened summary of each one.
        Does nothing when elasticsearch is not configured.

        Args:
            limit (int): maximum number of records to upload per call.
        """
        if 'elasticsearch' not in self.parent.cfg or not self.parent.cfg['elasticsearch']:
            return

        sql = 'select task_stat_id from task_stat'
        ret = yield self.parent.db.query(sql, tuple())
        task_stat_ids = {row[0] for row in ret}

        # only records that elasticsearch does not know yet
        task_stat_updates = []
        for ts_id in task_stat_ids:
            if not self.parent.elasticsearch.head('task_stat', ts_id):
                task_stat_updates.append(ts_id)
                if len(task_stat_updates) >= limit:
                    logger.info('task_stat_monitoring hit limit')
                    break
        if not task_stat_updates:
            return

        # NOTE(review): the unpacking below relies on ``select *``
        # returning exactly (task_stat_id, task_id, data) in that order.
        sql = 'select * from task_stat where task_stat_id in (%s)'
        for f in self._bulk_select(sql, task_stat_updates):
            ret = yield f
            for task_stat, task_id, data in ret:
                payload = {
                    'task_stat_id': task_stat,
                    'task_id': task_id,
                }
                data = json_decode(data)
                if 'error' in data:
                    payload['error'] = data['error']
                else:
                    payload['error'] = any(k.startswith('error') for k in data)
                if 'hostname' in data and isinstance(data['hostname'], str):
                    payload['hostname'] = data['hostname']
                if 'resources' in data:
                    for r in data['resources']:
                        payload['resources_'+r] = data['resources'][r]
                elif 'time_used' in data:
                    # NOTE(review): stored as-is here, but divided by 3600
                    # in the error-record branch below -- confirm units
                    payload['resources_time'] = data['time_used']
                if 'time' in data:
                    payload['time'] = data['time']

                # fill in anything still missing from nested error records
                for k in data:
                    if not (k.startswith('error') and isinstance(data[k], dict)):
                        continue
                    record = data[k]
                    if 'time_used' in record and 'resources_time' not in payload:
                        payload['resources_time'] = record['time_used']/3600.
                    if 'hostname' in record and 'hostname' not in payload:
                        hostname = record['hostname']
                        if isinstance(hostname, str):
                            payload['hostname'] = hostname
                        elif isinstance(hostname, set) and hostname:
                            # an empty set has no first element to take
                            payload['hostname'] = list(hostname)[0]
                    # presumably the key looks like "error_<timestamp>";
                    # verify against the writer of these records
                    if 'time' not in payload and '_' in k and ':' in k and '-' in k:
                        payload['time'] = k.split('_', 1)[-1]

                self.parent.elasticsearch.put('task_stat', task_stat, payload)
                yield None  # yield to other events

    @tornado.gen.coroutine
    def cron_task_monitoring(self, limit=1000):
        """
        Monitor task status in ES.

        Finds tasks that are not yet present in elasticsearch, joins in
        their dataset/job/task_rel details and requirements, and uploads
        one document per task.  Does nothing when elasticsearch is not
        configured.

        Args:
            limit (int): maximum number of tasks to upload per call.
        """
        if 'elasticsearch' not in self.parent.cfg or not self.parent.cfg['elasticsearch']:
            return

        sql = 'select task_id from task'
        ret = yield self.parent.db.query(sql, tuple())
        all_task_ids = {row[0] for row in ret}

        # only tasks that elasticsearch does not know yet
        task_ids = []
        for t_id in all_task_ids:
            if not self.parent.elasticsearch.head('task', t_id):
                task_ids.append(t_id)
                if len(task_ids) >= limit:
                    logger.info('task_monitoring hit limit')
                    break
        logger.info('monitoring for task_ids: %r', task_ids)
        if not task_ids:
            return

        # base task rows
        tasks = {}
        task_rel_ids = set()
        sql = 'select * from task where task_id in (%s)'
        for f in self._bulk_select(sql, task_ids):
            ret = yield f
            for row in ret:
                row = self._list_to_dict('task', row)
                tasks[row['task_id']] = row
                task_rel_ids.add(row['task_rel_id'])

        # dataset / job / name from the search table
        job_ids = defaultdict(list)
        sql = 'select task_id,dataset_id,job_id,name from search where task_id in (%s)'
        for f in self._bulk_select(sql, task_ids):
            ret = yield f
            for task_id, dataset_id, job_id, name in ret:
                job_ids[job_id].append(task_id)
                tasks[task_id]['job_id'] = job_id
                tasks[task_id]['dataset_id'] = dataset_id
                tasks[task_id]['dataset'] = GlobalID.localID_ret(dataset_id, type='int')
                tasks[task_id]['name'] = name

        # job index
        sql = 'select job_id,job_index from job where job_id in (%s)'
        for f in self._bulk_select(sql, job_ids):
            ret = yield f
            for job_id, job_index in ret:
                for task_id in job_ids[job_id]:
                    tasks[task_id]['job_index'] = job_index

        # task_rel details (task index and default requirements)
        task_rels = {}
        sql = 'select task_rel_id,task_index,requirements from task_rel where task_rel_id in (%s)'
        for f in self._bulk_select(sql, task_rel_ids):
            ret = yield f
            for task_rel_id, task_index, requirements in ret:
                task_rels[task_rel_id] = {
                    'task_index': task_index,
                    'requirements': json_decode(requirements),
                }

        for task_id, data in tasks.items():
            task_rel = task_rels[data['task_rel_id']]
            data['task_index'] = task_rel['task_index']
            # task-level requirements override the task_rel defaults
            if data['requirements']:
                req = json_decode(data['requirements'])
            else:
                req = task_rel['requirements']
            del data['requirements']
            for k in req:
                data['requirements_'+k] = req[k]

            ret = self.parent.elasticsearch.post('task_stat', '_search', {
                "query": {
                    "term": {"task_id": task_id}
                }
            })
            if ret:
                for k in ret:
                    if k.startswith('resources'):
                        data[k] = ret[k]
            self.parent.elasticsearch.put('task', task_id, data)
            yield None  # yield to other events