-
-
Notifications
You must be signed in to change notification settings - Fork 379
Make worker action cancellable #1472
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,12 @@ | ||
| import logging | ||
| from asyncio import Future | ||
| from concurrent.futures import ProcessPoolExecutor | ||
| import os | ||
| import signal | ||
| from concurrent.futures import CancelledError, Future, InvalidStateError, ProcessPoolExecutor | ||
| from functools import wraps | ||
| from threading import Thread | ||
| from threading import Lock, Thread | ||
| from time import sleep, time | ||
|
|
||
| from giskard.settings import settings | ||
|
|
||
| LOGGER = logging.getLogger(__name__) | ||
|
|
||
|
|
@@ -21,19 +25,23 @@ class WorkerPool: | |
|
|
||
| def __init__(self): | ||
| self.pool = None | ||
| self.nb_cancellable = 0 | ||
| self.max_workers = 0 | ||
|
|
||
| def start(self, *args, **kwargs): | ||
| def start(self, max_workers: int = None): | ||
Inokinoki marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if self.pool is not None: | ||
| return | ||
| LOGGER.info("Starting worker pool...") | ||
| self.pool = ProcessPoolExecutor(*args, **kwargs) | ||
| self.max_workers = max(max_workers, settings.min_workers) if max_workers is not None else os.cpu_count() | ||
| LOGGER.info("Starting worker pool with %s workers...", self.max_workers) | ||
| self.pool = ProcessPoolExecutor(max_workers=self.max_workers) | ||
| LOGGER.info("Pool is started") | ||
|
|
||
| def shutdown(self, wait=True, cancel_futures=False): | ||
| if self.pool is None: | ||
| return | ||
| self.pool.shutdown(wait=wait, cancel_futures=cancel_futures) | ||
| self.pool = None | ||
| with NB_CANCELLABLE_WORKER_LOCK: | ||
| self.nb_cancellable = 0 | ||
|
|
||
| def submit(self, *args, **kwargs) -> Future: | ||
| if self.pool is None: | ||
|
|
@@ -50,9 +58,10 @@ def log_stats(self): | |
| LOGGER.debug("Pool is not yet started") | ||
| return | ||
| LOGGER.debug( | ||
| "Pool is currently having :\n - %s pending items\n - %s workers", | ||
| "Pool is currently having :\n - %s pending items\n - %s workers\n - %s cancellable tasks", | ||
| len(self.pool._pending_work_items), | ||
| len(self.pool._processes), | ||
| self.nb_cancellable, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -83,18 +92,6 @@ def shutdown_pool(): | |
| POOL.shutdown(wait=True, cancel_futures=True) | ||
|
|
||
|
|
||
| def call_in_pool(fn, *args, **kwargs): | ||
| """Submit the function call with args and kwargs inside the process pool | ||
|
|
||
| Args: | ||
| fn (function): the function to call | ||
|
|
||
| Returns: | ||
| Future: the promise of the results | ||
| """ | ||
| return POOL.submit(fn, *args, **kwargs) | ||
|
|
||
|
|
||
| def pooled(fn): | ||
| """Decorator to make a function be called inside the pool. | ||
|
|
||
|
|
@@ -109,6 +106,64 @@ def wrapper(*args, **kwargs): | |
| return wrapper | ||
|
|
||
|
|
||
| NB_CANCELLABLE_WORKER_LOCK = Lock() | ||
|
|
||
|
|
||
| @threaded | ||
| def start_killer(timeout: float, future: Future, pid: int, executor: ProcessPoolExecutor): | ||
| start = time() | ||
| # Try to get the result in proper time | ||
| while (time() - start) < timeout: | ||
| # future.result(timeout=timeout) => Not working with WSL and python 3.10, switching to something safer | ||
| LOGGER.debug("Sleeping for pid %s", pid) | ||
| sleep(1) | ||
| if future.done(): | ||
| executor.shutdown(wait=True, cancel_futures=False) | ||
| with NB_CANCELLABLE_WORKER_LOCK: | ||
| POOL.nb_cancellable -= 1 | ||
| return | ||
| LOGGER.warning("Thread gonna kill pid %s", pid) | ||
| # Manually setting exception, to allow customisation | ||
| # TODO(Bazire): See if we need a custom error to handle that properly | ||
| try: | ||
| future.set_exception(CancelledError("Background task was taking too much time and was cancelled")) | ||
| except InvalidStateError: | ||
| pass | ||
| # Shutting down an executor is actually not stopping the running processes | ||
| executor.shutdown(wait=False, cancel_futures=False) | ||
| # Kill the process running by targeting its pid | ||
| os.kill(pid, signal.SIGINT) | ||
| # Let's clean up the executor also | ||
| # Also, does not matter to call shutdown several times | ||
| executor.shutdown(wait=True, cancel_futures=False) | ||
| with NB_CANCELLABLE_WORKER_LOCK: | ||
| POOL.nb_cancellable -= 1 | ||
| LOGGER.debug("Executor has been successfully shutdown") | ||
| log_pool_stats() | ||
|
|
||
|
|
||
| def call_in_pool(fn, *args, timeout=None, **kwargs): | ||
| """Submit the function call with args and kwargs inside the process pool | ||
|
|
||
| Args: | ||
| fn (function): the function to call | ||
|
|
||
| Returns: | ||
| Future: the promise of the results | ||
| """ | ||
| if timeout is None: | ||
| return POOL.submit(fn, *args, **kwargs) | ||
| # Create independent process pool | ||
| # If we kill a running process, it breaks the Process pool, making it unusable | ||
| one_shot_executor = ProcessPoolExecutor(max_workers=1) | ||
| pid = one_shot_executor.submit(os.getpid).result(timeout=5) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't get it, the comment above said
here it's expected to work? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I should remove the timeout here anyway, no reason for basic os.getpid to fail or take a long time. Basically, when I tested it, in the killer thread, the timeout was not respected, and it never stopped, so I had to do the loop. Btw, the timeout was working out on MacOS with python 3.11. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the overall killing mechanism could be simplified to avoid having a killer thread and a cancellable counter. We could use an inter-process data structure to communicate pids between a spawned worker process and the main process: In this case the first thing the pool process will do is identify its pid and add it to queue, then do the actual work. When the timeout is expired we'd find a PID related to a given task and kill it from the main process. WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Hartorn , actually, it looks like https://pypi.org/project/Pebble/ does exactly what we need: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Cancellable counter is only for logs, we can remove it if we want. Also, it cannot be the main executor, since killing a process breaks the pool, which only raises BrokenPoolException after that. I'm pretty sure we could avoid the thread if the whole code we were running was async, since a coroutine could do this job, but here I would not be confident. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Looking at it ! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From what I saw, they are kinda doing the same stuff, when using a separate process, they have a handler thread for watching it and handling timeout. Should we get this merged and change to use it ? Or want me to switch it ? Although I'm a bit concerned it's not that much used, the code looks clean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, while having a clean API and being easy to use Pebble is a LGPL library so we won't be able to use it (it might be the reason why it's not that widely adopted). I also read their code and found similarities with your implementation. I suggest we stick to your current code (and merge it since we need these changes ASAP). As an improvement, I think in a separate PR we could inspire from Pebble's API and also encapsulate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shame, the code was working well with it also. |
||
| future = one_shot_executor.submit(fn, *args, **kwargs) | ||
| start_killer(timeout, future, pid, one_shot_executor) | ||
| with NB_CANCELLABLE_WORKER_LOCK: | ||
| POOL.nb_cancellable += 1 | ||
| return future | ||
|
|
||
|
|
||
| def fullname(o): | ||
| klass = o.__class__ | ||
| module = klass.__module__ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.