""" Module for detecting and returning system resources
"""
#For debug messages in this module - uncomment (see libE.py to change root logging level) #logger.setLevel(logging.DEBUG)
"""Provide system resources to libEnsemble and job controller with knowledge of workers.
This is intialised when the job_controller is created with auto_resources set to True.
**Class Attributes:**
:cvar string default_nodelist_env_slurm: Default SLRUM nodelist environment variable :cvar string default_nodelist_env_cobalt: Default Cobal nodelist environment variable
**Object Attributes:**
These are set on initialisation.
:ivar string top_level_dir: Directory where searches for worker_list file. :ivar boolean central_mode: If true, then running in central mode, else distributed. :ivar string nodelist_env_slurm: Slurm environment variable giving node-list. :ivar string nodelist_env_cobalt: Cobalt environment variable giving node-list. :ivar list global_nodelist: A list of all nodes available for running user applications :ivar int num_workers: Total number of workers :ivar int logical_cores_avail_per_node: Logical cores (including SMT threads) available on a node. :ivar int physical_cores_avail_per_node: Physical cores available on a node. :ivar int workerID: workerID :ivar list local_nodelist: A list of all nodes assigned to this worker :ivar int local_node_count: The number of nodes available to this worker (rounded up to whole number) :ivar int workers_per_node: The number of workers per node (if using sub-node workers)
"""
# These can be overridden by passing in (e.g. nodelist_env_slurm) on init.
default_nodelist_env_slurm = 'SLURM_NODELIST'
default_nodelist_env_cobalt = 'COBALT_PARTNAME'
def __init__(self, top_level_dir=None, workerID=None, central_mode=False,
             nodelist_env_slurm=None, nodelist_env_cobalt=None):
    """Initialise new Resources instance

    Parameters
    ----------

    top_level_dir: string, optional
        Directory libEnsemble runs in (default is current working directory).

    workerID: int, optional
        workerID of current process.

    central_mode: boolean, optional
        If true, then running in central mode; else distributed. Central mode means
        libE processes (manager and workers) are grouped together and do not share
        nodes with applications. Distributed mode means workers share nodes with
        applications.

    nodelist_env_slurm: string, optional
        The environment variable giving a node list in Slurm format
        (Default: uses SLURM_NODELIST).
        Note: This is only queried if a worker_list file is not provided and
        auto_resources=True.

    nodelist_env_cobalt: string, optional
        The environment variable giving a node list in Cobalt format
        (Default: uses COBALT_PARTNAME).
        Note: This is only queried if a worker_list file is not provided and
        auto_resources=True.
    """
    # The presence of these env vars will be used to detect the scheduler
    self.nodelist_env_slurm = nodelist_env_slurm or Resources.default_nodelist_env_slurm
    self.nodelist_env_cobalt = nodelist_env_cobalt or Resources.default_nodelist_env_cobalt
    # This is the global nodelist available to workers - may change to global_worker_nodelist
    self.global_nodelist = Resources.get_global_nodelist(
        nodelist_env_slurm=self.nodelist_env_slurm,
        nodelist_env_cobalt=self.nodelist_env_cobalt)

    self.workerID = workerID or Resources.get_workerID()
    self.local_nodelist = self.get_available_nodes()
    self.local_node_count = len(self.local_nodelist)
    self.workers_per_node = self.get_workers_on_a_node()
# Will be in comms module ------------------------------------------------

def am_I_manager():
    """Returns True if manager"""
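    # A minimal sketch, assuming the manager is MPI rank 0 (as get_workerID below implies).
    from mpi4py import MPI
    return MPI.COMM_WORLD.Get_rank() == 0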
def get_workerID():
    """Returns workerID"""
    from mpi4py import MPI
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.warning('get_workerID called by manager - returning 0')
    return MPI.COMM_WORLD.Get_rank()
def get_num_workers():
    """Returns total number of workers"""
    # Will use MPI_MODE from settings.py global - for now assume using MPI.
    # Or the function may be in some worker_concurrency module.
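    # A minimal sketch, assuming MPI comms with the manager at rank 0 and one worker
    # per remaining rank.
    from mpi4py import MPI
    return MPI.COMM_WORLD.Get_size() - 1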
# Call from all libE tasks (pref. inc. manager)
def get_libE_nodes():
    """Returns a list of nodes running libE workers"""
    # This is a libE node
    local_host = socket.gethostname()
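    # A minimal sketch, assuming MPI-based comms: gather the hostname from every
    # libE task (the real collection mechanism may differ).
    from mpi4py import MPI
    return MPI.COMM_WORLD.allgather(local_host)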
def get_MPI_variant():
    """Returns MPI base implementation.

    Returns
    -------
    mpi_variant: string
        MPI variant 'mpich' or 'openmpi'
    """
    # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py
#---------------------------------------------------------------------------
def _range_split(s):
    """Split ID range string."""
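    # A minimal sketch, assuming IDs such as "0010-0013" or a single ID "0015":
    # returns the start, the exclusive end, and the zero-padding width.
    ab = s.split('-', 1)
    nnum_len = len(ab[0])
    a = int(ab[0])
    b = int(ab[-1]) + 1
    return a, b, nnum_len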
def get_slurm_nodelist(node_list_env):
    """Get global libEnsemble nodelist from the Slurm environment"""
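    # A minimal sketch, assuming a compressed list such as "knl-[0009-0012,0015]" or a
    # plain comma-separated list; real Slurm nodelists can be more involved than this.
    import os
    import re
    fullstr = os.environ[node_list_env]
    if '[' not in fullstr:
        return sorted(fullstr.split(','))
    prefix, nidstr = re.match(r'(.*)\[(.*)\]', fullstr).groups()
    nidlst = []
    for nidgroup in nidstr.split(','):
        a, b, nnum_len = Resources._range_split(nidgroup)
        nidlst.extend(prefix + str(nid).zfill(nnum_len) for nid in range(a, b))
    return sorted(nidlst)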
def get_cobalt_nodelist(node_list_env):
    """Get global libEnsemble nodelist from the Cobalt environment"""
    nnum_len = len(hostname[len(prefix):])
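# A minimal standalone sketch (hypothetical helper, not the function above) of expanding
# a Cobalt-style partition string such as "20-23,26" into a sorted list of node IDs.
def _expand_cobalt_partname_sketch(partname):
    nidlst = []
    for nidgroup in partname.split(','):
        a, b, _ = Resources._range_split(nidgroup)
        nidlst.extend(str(nid) for nid in range(a, b))
    return sorted(nidlst, key=int)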
# This is for central mode where libE nodes will not share with app nodes,
# i.e. this is not for removing a manager node in distributed mode.
def remove_libE_nodes(global_nodelist_in):
    """Any node containing a libensemble task is removed from the global nodelist"""
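    # A minimal sketch, assuming get_libE_nodes() reports every node running a libE task.
    libE_nodes = set(Resources.get_libE_nodes())
    return [node for node in global_nodelist_in if node not in libE_nodes]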
def best_split(a, n):
    """Create the most even split of list a into n parts and return a list of lists"""
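    # A minimal sketch using the standard even-split recipe: the first len(a) % n parts
    # receive one extra element.
    k, m = divmod(len(a), n)
    return [a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]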
# Consider changing from static - top_level_dir could be moved to a resources attribute, set once on init.
# Also nodelist_env_slurm etc. could just use self values.
def get_global_nodelist(nodelist_env_slurm=None, nodelist_env_cobalt=None):
    """Return the list of nodes available to all libEnsemble workers.

    If a worker_list file exists this is used, otherwise the environment is
    interrogated for a node list. If a dedicated manager node is used, then a
    worker_list file is recommended.

    In central mode, any node with a libE worker is removed from the list.
    """
    else:
        # Assume a standalone machine if all workers are on the same node - though give a warning.
        if len(set(Resources.get_libE_nodes())) == 1:
            logger.info("Can not find nodelist from environment. Assuming standalone")
            global_nodelist.append(socket.gethostname())
        else:
            raise ResourcesException("Error. Can not find nodelist from environment")
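# A minimal standalone sketch (hypothetical helper and filename) of the worker_list
# branch described in the docstring above: read one node name per line from a file.
def _read_worker_list_sketch(path='worker_list'):
    import os
    if not os.path.isfile(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]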
""" Returns the number of workers that can be placed on each node""" num_workers = self.num_workers num_nodes = len(self.global_nodelist)
#Round up if theres a remainder workers_per_node = num_workers//num_nodes + (num_workers % num_nodes > 0)
return workers_per_node
"""Returns the list of nodes available to the current worker
Assumes that self.global_nodelist has been calculated (in __init__). Also self.global_nodelist will have already removed non-application nodes """
# Check if current host in nodelist - if it is then in distributed mode.
# If not in central mode (ie. in distrib mode) then this host should be in nodelist. # Either an error - or set to central mode. Currently make error for transparency
# If multiple workers per node - create global node_list with N duplicates (for N workers per node)
# Currently require even split for distrib mode - to match machinefile - throw away remainder #Could just read in the libe machinefile and use that - but this should match #Alt. create machinefile/host-list with same algorithm as best_split - future soln. # Worker node may not be at head of list after truncation - should perhaps be warning or enforced .format(num_workers, num_nodes))
# Divide global list between workers #logger.debug("split_list is {}".format(split_list))
raise ResourcesException("Worker has no workerID - aborting")
# If in distrib_mode local host must be in local nodelist
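# A minimal standalone sketch (hypothetical, simplified) of the division described above:
# duplicate each node for sub-node workers, split evenly with best_split, and take the
# slice belonging to this worker (workerIDs assumed to start at 1).
def _split_for_worker_sketch(global_nodelist, num_workers, workerID, workers_per_node=1):
    nodelist = [node for node in global_nodelist for _ in range(workers_per_node)]
    split_list = Resources.best_split(nodelist, num_workers)
    return split_list[workerID - 1]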
def _open_binary(fname, **kwargs):
    return open(fname, "rb", **kwargs)
# @staticmethod ? may use self.physical_cores if already set.
def _cpu_count_physical():
    """Returns the number of physical cores on the node."""
    mapping = {}
    current_info = {}
    with Resources._open_binary('/proc/cpuinfo') as f:
        for line in f:
            line = line.strip().lower()
            if not line:
                # New section
                if (b'physical id' in current_info and b'cpu cores' in current_info):
                    mapping[current_info[b'physical id']] = current_info[b'cpu cores']
                current_info = {}
            else:
                if (line.startswith(b'physical id') or line.startswith(b'cpu cores')):
                    key, value = line.split(b'\t:', 1)
                    current_info[key] = int(value)

    return sum(mapping.values()) or None
# @staticmethod ? may use self.num_cores if already set.
def get_cpu_cores(hyperthreads=False):
    """Returns the number of cores on the node.

    If hyperthreads is true, this is the logical cpu cores, else the physical cores
    are returned.

    Note: This returns cores available on the current node - will not work for
    systems of multiple node types.
    """
    # The 'except ImportError' below implies an optional fast path; psutil is assumed here.
    try:
        import psutil
        ranks_per_node = psutil.cpu_count(logical=hyperthreads)
    except ImportError:
        # logger
        if hyperthreads:
            import multiprocessing
            ranks_per_node = multiprocessing.cpu_count()
        else:
            try:
                ranks_per_node = Resources._cpu_count_physical()
            except Exception:
                import multiprocessing
                ranks_per_node = multiprocessing.cpu_count()
                logger.warning('Could not detect physical cores - Logical cores (with '
                               'hyperthreads) returned - specify ranks_per_node to override')
    return ranks_per_node