Copyright (c) 2018 LG Electronics Inc.
SPDX-License-Identifier: GPL-3.0-or-later
Abstract Interface of Resource Managers.
Use :func:`get_resource_manager` to create the resource manager object that matches a given resource type.
For the supported resource types, see :doc:`environment`.
import abc
import copy
import importlib
import logging
import random
import threading
import time
import numpy as np
import math
import warnings
from ...utils import DEFAULT_AUPTIMIZER_PATH
from .utils.curve_fitting import CurveModel
ABC = abc.ABCMeta('ABC', (object,), {'__slots__': ()})
logger = logging.getLogger(__name__)
_SupportResource = {"gpu": "GPUResourceManager",
"cpu": "CPUResourceManager",
"node": "SSHResourceManager",
"aws": "AWSResourceManager",
"passive": "PassiveResourceManager"}
def get_resource_manager(resource, connector, n_parallel, auppath=DEFAULT_AUPTIMIZER_PATH, **kwargs):
    """
    Get resource manager for a specific resource type.

    The resource type is looked up in ``_SupportResource``; the matching
    submodule of ``aup.EE.Resource`` is imported and the class of the same
    name is instantiated.

    :param resource: resource type, one of the keys of ``_SupportResource``
    :type resource: str
    :param connector: database connector
    :type connector: AbstractConnector
    :param n_parallel: how many parallel jobs to be run
    :type n_parallel: int
    :param auppath: aup environment folder
    :type auppath: str
    :param kwargs: extra keyword arguments forwarded to the resource manager constructor
    :return: resource manager
    :rtype: AbstractResourceManager
    :raises KeyError: if the resource type is not supported
    """
    try:
        class_name = _SupportResource[resource]
    except KeyError:
        raise KeyError("%s not implemented" % resource)
    # The module and the class it contains share the same name.
    mod = importlib.import_module(".%s" % class_name, "aup.EE.Resource")
    return mod.__dict__[class_name](connector, n_parallel, auppath=auppath, **kwargs)
class AbstractResourceManager(ABC):
    """
    Create Resource to run jobs.

    Keeps track of the running jobs (``jid -> rid``), forwards results to the
    database connector and, when an ``early_stop`` configuration is given,
    runs a background thread that marks under-performing jobs as stopped.

    :param connector: Connector to database
    :type connector: AbstractConnector
    :param n_parallel: how many parallel jobs to be run
    :type n_parallel: int
    :param kwargs: optional settings read here: ``maximize``, ``eid``,
        ``multi_res_labels``, ``track_intermediate_results`` and ``early_stop``
    """

    # Pause between two early-stop polls.
    # NOTE(review): the visible source shows no pause between polls; one second
    # is an assumed value to avoid busy-polling the database - confirm.
    _EARLY_STOP_POLL_SECONDS = 1

    def __init__(self, connector, n_parallel, *args, **kwargs):
        self.connector = connector
        self.jobs = dict()
        self.curr_global_iteration = 0
        self.maximize = kwargs.get("maximize", True)
        self.stopped_jobs = None
        self.stopped_jobs_lock = threading.Lock()
        self.n_parallel = n_parallel
        self.eid = kwargs.get("eid", None)
        self.result_labels = kwargs.get('multi_res_labels', None)
        self.track_intermediate_results = kwargs.get("track_intermediate_results", False)
        self.interm_job_res = None
        if self.track_intermediate_results:
            self.interm_job_res = dict()

        # variables for early stop impl
        if "early_stop" in kwargs:
            early_stop = kwargs["early_stop"]
            self.policy = early_stop["aup_policy"]
            self.policy_steps = early_stop["aup_policy_steps"]
            self.warmup = early_stop.get("warmup", 0)
            self.bandit_factor = early_stop.get("bandit_factor", 0.5)
            self.truncation_percentage = early_stop.get("truncation_percentage", 0.3)
            self.curve_fitting_threshold = early_stop.get("curve_fitting_threshold", 0.95)
            self.curve_fitting_max_iters = early_stop.get("curve_fitting_max_iters", None)
            self.curve_fitting_timeout = early_stop.get("curve_fitting_timeout", 60)
            self.job_checked = dict()
            self.early_stop_daemon_finished = False
            self.stopped_jobs = set()
            if self.policy == "curve_fitting" and self.curve_fitting_max_iters is None:
                raise ValueError("Curve fitting policy requires argument \"curve_fitting_max_iters\" representing " +
                                 "the total number of intermediate results that the script will provide.")
            self.early_stop_daemon = threading.Thread(target=AbstractResourceManager.early_stop_daemon_fun,
                                                      args=(self,), daemon=True)
            # NOTE(review): nothing in the visible source starts this thread;
            # it is assumed to be started here - confirm no caller starts it too.
            self.early_stop_daemon.start()
        else:
            self.warmup = None
            self.policy = None
            self.policy_steps = 0
            self.job_checked = dict()
            self.early_stop_daemon_finished = True
            self.early_stop_daemon = None
            self.stopped_jobs = None

    def finish(self, status="FINISHED"):
        """
        Finish up the resource allocation.

        Marks the experiment as ended in the database and asks the early-stop
        thread (if any) to leave its polling loop.

        :param status: status of the experiment
        :type status: string
        :return: Max/Min result in experiment (job id, score)
        :rtype: None | [int, float]
        """
        self.connector.end_experiment(self.eid, status)
        if self.early_stop_daemon is not None:
            self.early_stop_daemon_finished = True
        return self.connector.get_best_result(self.eid, maximize=self.maximize)

    def finish_job(self, jid, score, status=None):
        """
        Finish one job

        :param jid: job ID
        :type jid: int
        :param score: score of the job for the experiment
        :type score: float | None
        :param status: job status forwarded to the connector
        """
        if jid in self.jobs:
            rid = self.jobs.pop(jid)
            self.connector.job_finished(rid, jid, score, status)
        else:
            # The job is no longer tracked (e.g. removed by suspend()).
            logger.warning("Job %d finished after job suspension, result may lose" % jid)

    def get_available(self, username, rtype, rid_blacklist=None):
        """
        method to get the available resource to run a job

        :param username: username for job running
        :type username: str
        :param rtype: resource type
        :type rtype: str
        :param rid_blacklist: resource ids to ignore
        :type rid_blacklist: [int]
        :return: a random selection of all available resource IDs, or None if there is none
        :rtype: int | None
        """
        rids = self.connector.get_available_resource(username, rtype, rid_blacklist)
        logger.debug("Request resource (%s) for user %s and get %s", rtype, username, rids)
        return random.choice(rids) if rids else None

    def run_job(self, job, rid, exp_config, call_back_func, **kwargs):
        """
        Job running interface, this is called by :mod:`aup.EE.Experiment`.

        It is a wrapper for :func:`run`.

        :param job: Job to run
        :type job: Job
        :param rid: resource ID
        :type rid: int
        :param exp_config: experiment configuration
        :type exp_config: BasicConfig
        :param call_back_func: call back function to update result
        :type call_back_func: function object
        """
        self.jobs[job.jid] = rid
        if self.interm_job_res is not None:
            self.interm_job_res[job.jid] = list()
            # NOTE(review): the original nesting of this line is not recoverable
            # from the source; the early-stop loop creates missing entries itself.
            self.job_checked[job.jid] = list()
        try:
            self.run(job, rid, exp_config, call_back_func, **kwargs)
        except EnvironmentError:
            logger.fatal("Experiment interrupted.")

    def append_interm_res(self, jid, interm_res):
        """
        Save one intermediate result of a running job through the connector.

        :param jid: job ID
        :type jid: int
        :param interm_res: intermediate result to save
        :return: whatever the connector returns, or None if intermediate
            results are not tracked, the job is unknown or no connector is set
        """
        if self.interm_job_res is None:
            return None
        if jid in self.interm_job_res:
            if self.connector:
                return self.connector.save_intermediate_result(jid, interm_res)
            logger.warning("Could not save intermediate result: no connector attached to resource manager")
            return None
        logger.fatal("Job {} should have already started!".format(jid))
        return None

    def append_multiple_results(self, jid, irid, eid, scores):
        """
        Save several labelled scores for one (intermediate) result.

        Does nothing when no result labels were configured or ``scores`` is empty.

        :param jid: job ID
        :param irid: intermediate result ID
        :param eid: experiment ID
        :param scores: scores, one per configured result label
        """
        if self.result_labels is None or len(scores) == 0:
            return
        assert len(self.result_labels) == len(scores), \
            "labels size mismatch with the provided scores"
        if self.connector is not None:
            self.connector.save_multiple_results(jid, irid, eid, self.result_labels, scores)

    def set_last_multiple_results(self, eid, jid):
        """
        Forward to the connector's ``set_last_multiple_results`` with the number of result labels.

        Does nothing when no result labels were configured.

        :param eid: experiment ID
        :param jid: job ID
        """
        if self.result_labels is None:
            return
        if self.connector is not None:
            self.connector.set_last_multiple_results(eid, jid, len(self.result_labels))

    def stop_job(self, jid):
        """
        Stop a job for early stopping strategies

        The job is only recorded as "stop pending"; see :func:`is_job_stopped`.

        :param jid: job ID
        :type jid: int
        """
        if jid not in self.jobs:
            logger.debug("Tried to stop job {} not in currently running jobs.".format(jid))
            return
        with self.stopped_jobs_lock:
            # stopped_jobs is None when early stopping was not configured;
            # create the set on demand instead of failing on None.
            if self.stopped_jobs is None:
                self.stopped_jobs = set()
            self.stopped_jobs.add(jid)

    def is_job_stopped(self, jid):
        """
        Returns whether or not a specific job stop is pending

        :param jid: job ID
        :type jid: int
        :return: whether or not the given job ID is in the list of pending job stops
        :rtype: bool
        """
        with self.stopped_jobs_lock:
            return self.stopped_jobs is not None and jid in self.stopped_jobs

    @abc.abstractmethod
    def run(self, job, rid, exp_config, call_back_func, **kwargs):
        """
        Job running implemented for the specific resource manager.

        It is called by :func:`run_job`.

        :param job: a job object
        :type job: Job
        :param rid: resource id returned from :func:`get_available`.
        :type rid: int
        :param exp_config: experiment configuration
        :type exp_config: BasicConfig
        :param call_back_func: function to trigger after job finished
        :type call_back_func: function object
        """
        raise NotImplementedError

    def suspend(self):
        """
        Suspend job upon request

        Every tracked job is finished with a score of None.
        """
        # Iterate over a snapshot: finish_job() removes entries from self.jobs.
        for jid in list(self.jobs.keys()):
            self.finish_job(jid, None)
            logger.warning("Job %d is canceled" % jid)

    def run_curve_fitting(self, interm_res, c_jid, step, comp_fn, curve_fitting_threshold, best_val):
        """
        Predict the final value of a job and stop the job if the prediction is not good enough.

        :param interm_res: intermediate results of the job observed so far
        :param c_jid: job ID
        :param step: number of intermediate results taken into account (used for logging)
        :param comp_fn: ``comp_fn(value, target)`` returns True if ``value`` is acceptable
        :param curve_fitting_threshold: factor applied to ``best_val`` to obtain the target
        :param best_val: best value of the finished jobs
        """
        curvemodel = CurveModel(self.curve_fitting_max_iters)
        predict_y = curvemodel.predict(interm_res, timeout=self.curve_fitting_timeout)
        if predict_y is None:
            return
        if not comp_fn(predict_y, curve_fitting_threshold * best_val):
            if self.is_job_stopped(c_jid):
                return
            self.stop_job(c_jid)
            logger.info("Stopping job {} early (step {}): predicted end value {:.4f} is lower than the best value so far {:.4f} within the given {:.2f}% threshold (={:.4f})".format(
                c_jid, step, predict_y, best_val, 100. * curve_fitting_threshold, curve_fitting_threshold * best_val))

    def early_stop_daemon_fun(self):
        """
        Body of the early-stop thread: poll until :func:`finish` raises the stop flag.
        """
        while not self.early_stop_daemon_finished:
            self._early_stop_poll()
            time.sleep(self._EARLY_STOP_POLL_SECONDS)

    def _early_stop_poll(self):
        """
        Run one early-stop pass over the currently running jobs.

        Each running job is compared, at multiples of ``policy_steps``
        intermediate results, against the other running and finished jobs
        according to ``self.policy`` ("median", "bandit", "truncation" or
        "curve_fitting"); losing jobs are handed to :func:`stop_job`.
        """
        # do not consider the early stopped jobs
        with self.stopped_jobs_lock:
            current_jobs = set(self.jobs) - self.stopped_jobs

        finished_interm_job_res = self.connector.get_intermediate_results_experiment(self.eid, "FINISHED")
        current_interm_job_res = self.connector.get_intermediate_results_jobs(list(current_jobs))
        interm_job_res = {**current_interm_job_res, **finished_interm_job_res}

        best_fn = np.max if self.maximize else np.min
        comp_fn = (lambda x, target: x >= target) if self.maximize else \
                  (lambda x, target: x <= target)

        curve_fitting_threads = []
        for c_jid, c_interm_res in current_interm_job_res.items():
            if len(c_interm_res) < self.warmup:
                continue
            if c_jid not in self.job_checked:
                self.job_checked[c_jid] = []

            k = len(c_interm_res) // self.policy_steps
            if k < 1:
                continue
            step = k * self.policy_steps
            if step in self.job_checked[c_jid]:  # job already compared up until this step, waiting for next k multiple
                continue

            comp_interm_job_res = {jid: vals for jid, vals in interm_job_res.items() if len(vals) >= step and jid != c_jid}
            if len(comp_interm_job_res) < 1:  # too few jobs
                continue

            if self.policy == "median":
                avgs = [np.average(vals[:step]) for vals in comp_interm_job_res.values()]
                median = np.median(avgs)
                best_val = np.average(c_interm_res[:step])
                if not comp_fn(best_val, median):
                    self.stop_job(c_jid)
                    logger.info("Stopping job {} early (step {}): best value so far {:.4f} worse than median of averages {:.4f} for {} other jobs".format(
                        c_jid, step, best_val, median, len(comp_interm_job_res)))
            elif self.policy == "bandit":
                bandit_best_val = best_fn([best_fn(vals[:step]) for vals in comp_interm_job_res.values()])
                best_val = best_fn(c_interm_res[:step])
                # Mirror the factor around 1 when the sign of the best value
                # would otherwise move the target in the wrong direction.
                bandit_factor = self.bandit_factor if ((self.maximize and np.sign(bandit_best_val) == 1) or (not self.maximize and np.sign(bandit_best_val) == -1)) else \
                                2 - self.bandit_factor
                if not comp_fn(best_val, bandit_factor * bandit_best_val):
                    self.stop_job(c_jid)
                    logger.info("Stopping job {} early (step {}): best value so far {:.4f} worse than a factor {:.4f} of best overall value {:.4f} (={:.4f}) for {} other jobs".format(
                        c_jid, step, best_val, bandit_factor, bandit_best_val, bandit_factor * bandit_best_val, len(comp_interm_job_res)))
            elif self.policy == "truncation":
                # Worst job first: ascending when maximizing, descending when minimizing.
                best_vals = sorted([(jid, best_fn(vals[:step])) for jid, vals in (list(comp_interm_job_res.items()) + [(c_jid, c_interm_res)])],
                                   key=lambda t: t[1], reverse=not self.maximize)
                best_val_idx = next((idx for idx, (jid, val) in enumerate(best_vals) if jid == c_jid)) + 1
                perc = float(best_val_idx) / len(best_vals)
                if perc <= self.truncation_percentage:
                    self.stop_job(c_jid)
                    logger.info("Stopping job {} early (step {}): best value so far {:.4f} is in the bottom {:.2f}% of {} jobs, which is lower than the {:.2f}% cutoff".format(
                        c_jid, step, best_vals[best_val_idx-1][1], 100. * best_val_idx / len(best_vals), len(best_vals), 100. * self.truncation_percentage))
            elif self.policy == "curve_fitting":
                # NOTE(review): CURVE_FITTING_MIN_ITS is not defined in the visible
                # part of this module; it is expected to exist at module level.
                if len(finished_interm_job_res) < 1 or step <= CURVE_FITTING_MIN_ITS:
                    continue
                interm_res = copy.deepcopy(c_interm_res)
                best_val = best_fn([best_fn(vals) for vals in finished_interm_job_res.values()])
                if not self.maximize:
                    # Turn minimization into maximization. Negate element-wise:
                    # "list *= -1" would empty a Python list instead.
                    interm_res = [-val for val in interm_res]
                    best_val = -best_val
                curve_fitting_threshold = self.curve_fitting_threshold if np.sign(best_val) == 1 else \
                                          2 - self.curve_fitting_threshold
                cf_thread = threading.Thread(target=self.run_curve_fitting,
                                             args=(interm_res[:step], c_jid, step, (lambda x, target: x >= target), curve_fitting_threshold, best_val))
                cf_thread.start()
                curve_fitting_threads += [cf_thread]

            self.job_checked[c_jid] += [step]

        # Wait for all curve fittings of this pass before the next poll.
        for thread in curve_fitting_threads:
            thread.join()

    def refresh(self):
        """
        Method for refreshing timers/variables etc

        The base implementation does nothing.
        """
        pass

    def log_error_message(self, msg):
        """
        Record an error message for the current experiment through the connector.

        :param msg: error message
        """
        self.connector.log_error_message(self.eid, msg)