Coverage for /home/shudson/libensemble/rsync-to-clusters/nkag-fresh-checkout/libensemble_master/libensemble/libE_manager.py : 33%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
""" libEnsemble manager routines ==================================================== """
EVAL_SIM_TAG, FINISHED_PERSISTENT_SIM_TAG, \ EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, \ STOP_TAG, UNSET_TAG, \ WORKER_KILL, WORKER_KILL_ON_ERR, WORKER_KILL_ON_TIMEOUT, \ JOB_FAILED, WORKER_DONE, \ MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL, \ MAN_SIGNAL_REQ_RESEND, MAN_SIGNAL_REQ_PICKLE_DUMP, \ ABORT_ENSEMBLE
#For debug messages - uncomment # logger.setLevel(logging.DEBUG)
sim_specs, gen_specs, exit_criteria, persis_info): """Manager routine to coordinate the generation and simulation evaluations """ sim_specs, gen_specs, exit_criteria)
"Return an elapsed time function, starting now" "Return time elapsed since start."
"Filter out NaNs from a numpy array."
"""Manager class for libensemble."""
('active', int), ('persis_state', int), ('blocked', bool)]
sim_specs, gen_specs, exit_criteria): """Initialize the manager.""" [(2, 'elapsed_wallclock_time', self.term_test_wallclock), (1, 'sim_max', self.term_test_sim_max), (1, 'gen_max', self.term_test_gen_max), (1, 'stop_val', self.term_test_stop_val)]
def _make_worker_pool(comm): """Set up an array of worker states."""
# --- Termination logic routines
"""Check against wallclock timeout"""
"""Check against max simulations"""
"""Check against max generator calls.""" return self.hist.index >= gen_max + self.hist.offset
"""Check against stop value criterion."""
"""Check termination criteria"""
# --- Low-level communication routines (use MPI directly)
"Check whether there is a message from a worker." return self.comm.Iprobe(source=w, tag=MPI.ANY_TAG, status=status)
"Receive from a worker." return self.comm.recv(source=w, tag=MPI.ANY_TAG, status=status)
"Send to a worker." return self.comm.send(obj=obj, dest=w, tag=tag)
"Broadcast sim_spec/gen_spec input dtypes to workers." self.comm.bcast(obj=self.hist.H[self.gen_specs['in']].dtype)
"""Kill the workers""" for w in self.W['worker_id']: self.send(MAN_SIGNAL_FINISH, w, tag=STOP_TAG)
"Request the worker resend data on error." self.send(MAN_SIGNAL_REQ_RESEND, w, tag=STOP_TAG) return self.recv(w)
"Request the worker dump a pickle on error." self.send(MAN_SIGNAL_REQ_PICKLE_DUMP, w, tag=STOP_TAG) pkl_recv = self.recv(w) D_recv = pickle.load(open(pkl_recv, "rb")) os.remove(pkl_recv) #If want to delete file return D_recv
# --- Checkpointing logic
"Save history every kth step." count = k*(count//k) filename = fname.format(count) if not os.path.isfile(filename) and count > 0: np.save(filename, self.hist.H)
"Save history every kth sim step." self._save_every_k('libE_history_after_sim_{}.npy', self.hist.sim_count, self.sim_specs['save_every_k'])
"Save history every kth gen step." self._save_every_k('libE_history_after_gen_{}.npy', self.hist.index, self.gen_specs['save_every_k'])
# --- Handle outgoing messages to workers (work orders from alloc)
"""Check validity of an allocation function order. """ assert w != 0, "Can't send to worker 0; this is the manager. Aborting" assert self.W[w-1]['active'] == 0, \ "Allocation function requested work to an already active worker. Aborting" work_rows = Work['libE_info']['H_rows'] if len(work_rows): work_fields = set(Work['H_fields']) hist_fields = self.hist.H.dtype.names diff_fields = list(work_fields.difference(hist_fields)) assert not diff_fields, \ "Allocation function requested invalid fields {}" \ "be sent to worker={}.".format(diff_fields, w)
"""Send an allocation function order to a worker. """ logger.debug("Manager sending work unit to worker {}".format(w)) self.send(Work, w, tag=Work['tag']) work_rows = Work['libE_info']['H_rows'] if len(work_rows): self.send(self.hist.H[Work['H_fields']][work_rows], w)
"""Update worker active/idle status following an allocation order."""
self.W[w-1]['active'] = Work['tag'] if 'libE_info' in Work and 'persistent' in Work['libE_info']: self.W[w-1]['persis_state'] = Work['tag']
if 'blocking' in Work['libE_info']: for w_i in Work['libE_info']['blocking']: assert self.W[w_i-1]['active'] == 0, \ "Active worker being blocked; aborting" self.W[w_i-1]['blocked'] = 1 self.W[w_i-1]['active'] = 1
if Work['tag'] == EVAL_SIM_TAG: work_rows = Work['libE_info']['H_rows'] self.hist.update_history_x_out(work_rows, w)
# --- Handle incoming messages from workers
def _check_received_calc(D_recv):
    """Validate the calculation type and status reported by a worker.

    Reads the 'calc_type' and 'calc_status' entries of ``D_recv`` and
    asserts that each one is among the tags the manager recognises.

    Raises:
        KeyError: if either entry is missing from ``D_recv``.
        AssertionError: if the type or the status is not a known tag.
    """
    # Fetch both fields up front so a missing key surfaces before any check.
    calc_type, calc_status = D_recv['calc_type'], D_recv['calc_status']

    known_types = (EVAL_SIM_TAG, EVAL_GEN_TAG)
    known_statuses = (
        FINISHED_PERSISTENT_SIM_TAG,
        FINISHED_PERSISTENT_GEN_TAG,
        UNSET_TAG,
        MAN_SIGNAL_FINISH,
        MAN_SIGNAL_KILL,
        WORKER_KILL_ON_ERR,
        WORKER_KILL_ON_TIMEOUT,
        WORKER_KILL,
        JOB_FAILED,
        WORKER_DONE,
    )

    assert calc_type in known_types, "".join(
        ('Aborting, Unknown calculation type received. Received type: ',
         str(calc_type)))
    assert calc_status in known_statuses, "".join(
        ('Aborting: Unknown calculation status received. Received status: ',
         str(calc_status)))
"""Receive calculation output from workers. Loops over all active workers and probes to see if worker is ready to communticate. If any output is received, all other workers are looped back over. """ status = MPI.Status()
new_stuff = True while new_stuff and any(self.W['active']): new_stuff = False for w in self.W['worker_id'][self.W['active'] > 0]: if self.Iprobe(w, status): new_stuff = True self._handle_msg_from_worker(persis_info, w, status)
if 'save_every_k' in self.sim_specs: self._save_every_k_sims() if 'save_every_k' in self.gen_specs: self._save_every_k_gens() return persis_info
"""Update history and worker info on worker message. """ calc_type = D_recv['calc_type'] calc_status = D_recv['calc_status'] Manager._check_received_calc(D_recv)
self.W[w-1]['active'] = 0 if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: self.W[w-1]['persis_state'] = 0 else: if calc_type == EVAL_SIM_TAG: self.hist.update_history_f(D_recv) if calc_type == EVAL_GEN_TAG: self.hist.update_history_x_in(w, D_recv['calc_out']) if 'libE_info' in D_recv and 'persistent' in D_recv['libE_info']: # Now a waiting, persistent worker self.W[w-1]['persis_state'] = calc_type
if 'libE_info' in D_recv and 'blocking' in D_recv['libE_info']: # Now done blocking these workers for w_i in D_recv['libE_info']['blocking']: self.W[w_i-1]['blocked'] = 0 self.W[w_i-1]['active'] = 0
if 'persis_info' in D_recv: persis_info[w].update(D_recv['persis_info'])
"""Handle a message from worker w. """ logger.debug("Manager receiving from Worker: {}".format(w)) try: D_recv = self.recv(w) logger.debug("Message size {}".format(status.Get_count())) except Exception as e: logger.error("Exception caught on Manager receive: {}".format(e)) logger.error("From worker: {}".format(w)) logger.error("Message size of errored message {}". \ format(status.Get_count())) logger.error("Message status error code {}". \ format(status.Get_error()))
# Check on working with persistent data - currently only use one #D_recv = _man_request_resend_on_error(w) D_recv = self._man_request_pkl_dump_on_error(w)
if status.Get_tag() == ABORT_ENSEMBLE: raise ManagerException('Received abort signal from worker')
self._update_state_on_worker_msg(persis_info, D_recv, w)
# --- Handle termination
"""Read final messages from any active workers""" for w in self.W['worker_id'][self.W['active'] > 0]: if self.Iprobe(w): self.recv(w)
""" Tries to receive from any active workers.
If time expires before all active workers have been received from, a nonblocking receive is posted (though the manager will not receive this data) and a kill signal is sent. """ exit_flag = 0 while any(self.W['active']) and exit_flag == 0: persis_info = self._receive_from_workers(persis_info) if self.term_test(logged=False) == 2 and any(self.W['active']): self._print_wallclock_term() self._read_final_messages() exit_flag = 2
self._kill_workers() print("\nlibEnsemble manager total time:", self.elapsed()) return persis_info, exit_flag
def _print_wallclock_term():
    """Announce that the run is ending because the wall-clock limit elapsed.

    Writes a three-line notice to standard output, then flushes both
    standard output and standard error.
    """
    notice = "".join((
        "Termination due to elapsed_wallclock_time has occurred.\n",
        "A last attempt has been made to receive any completed work.\n",
        "Posting nonblocking receives and kill messages for all active workers\n",
    ))
    print(notice)
    # Flush stdout first, then stderr, matching the original order.
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
# --- Main loop
"Call queue update function from libE_specs (if defined)" if 'queue_update_function' not in self.libE_specs or not len(H): return persis_info qfun = self.libE_specs['queue_update_function'] return qfun(H, self.gen_specs, persis_info)
"Call work allocation function from alloc_specs" alloc_f = self.alloc_specs['alloc_f'] return alloc_f(self.W, H, self.sim_specs, self.gen_specs, persis_info)
"Run the manager." format(self.comm.Get_rank(), socket.gethostname()))
# Send initial info to workers
### Continue receiving and giving until termination test is satisfied while not self.term_test(): persis_info = self._receive_from_workers(persis_info) persis_info = self._queue_update(self.hist.trim_H(), persis_info) if any(self.W['active'] == 0): Work, persis_info = self._alloc_work(self.hist.trim_H(), persis_info) for w in Work: if self.term_test(): break self._check_work_order(Work[w], w) self._send_work_order(Work[w], w) self._update_state_on_alloc(Work[w], w)
# Return persis_info, exit_flag return self._final_receive_and_kill(persis_info) |