From 3e28b1962eddd1c13eb932e4894792d04e7e21d0 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Thu, 4 May 2023 13:44:46 -0600 Subject: [PATCH 1/6] sphinx-quickstart --- docs/Makefile | 20 ++++++++++++++++++++ docs/conf.py | 28 ++++++++++++++++++++++++++++ docs/index.rst | 20 ++++++++++++++++++++ docs/make.bat | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..cfc5f6f6 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,28 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'pysqa' +copyright = '2023, Jan Janssen' +author = 'Jan Janssen' +release = '0.0.22' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'alabaster' +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..419d0fe4 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. pysqa documentation master file, created by + sphinx-quickstart on Thu May 4 13:39:22 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pysqa's documentation! +================================= + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..954237b9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd From 04a6de1fb7171f9fc6478f9667196ccd10157c7d Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Thu, 4 May 2023 18:55:13 -0600 Subject: [PATCH 2/6] First draft for documentation --- docs/Makefile | 4 +- docs/index.rst | 20 ----- docs/make.bat | 4 +- docs/source/advanced.md | 5 ++ docs/source/command.md | 21 +++++ docs/{ => source}/conf.py | 4 +- docs/source/index.rst | 41 +++++++++ docs/source/installation.md | 24 +++++ docs/source/python.md | 7 ++ docs/source/queue.md | 174 ++++++++++++++++++++++++++++++++++++ 10 files changed, 278 insertions(+), 26 deletions(-) delete mode 100644 docs/index.rst create mode 100644 docs/source/advanced.md create mode 100644 docs/source/command.md rename docs/{ => source}/conf.py (92%) create mode 100644 docs/source/index.rst create mode 100644 docs/source/installation.md create mode 100644 docs/source/python.md create mode 100644 docs/source/queue.md diff --git a/docs/Makefile b/docs/Makefile index d4bb2cbb..d0c3cbf1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,8 +5,8 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build +SOURCEDIR = source +BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 419d0fe4..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. pysqa documentation master file, created by - sphinx-quickstart on Thu May 4 13:39:22 2023. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to pysqa's documentation! -================================= - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat index 954237b9..dc1312ab 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -7,8 +7,8 @@ REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) -set SOURCEDIR=. 
-set BUILDDIR=_build
+set SOURCEDIR=source
+set BUILDDIR=build
 
 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
diff --git a/docs/source/advanced.md b/docs/source/advanced.md
new file mode 100644
index 00000000..e1c1df26
--- /dev/null
+++ b/docs/source/advanced.md
@@ -0,0 +1,5 @@
+# Advanced Configuration
+
+## Remote HPC Configuration
+
+## Access to Multiple HPCs
diff --git a/docs/source/command.md b/docs/source/command.md
new file mode 100644
index 00000000..025507a4
--- /dev/null
+++ b/docs/source/command.md
@@ -0,0 +1,21 @@
+# Command Line Interface
+```
+python -m pysqa --help
+```
+
+* `-f`, `--config_directory`
+* `-p`, `--submit`
+* `-q`, `--queue`
+* `-j`, `--job_name`
+* `-w`, `--working_directory`
+* `-n`, `--cores`
+* `-m`, `--memory`
+* `-t`, `--run_time`
+* `-b`, `--dependency`
+* `-c`, `--command`
+* `-r`, `--reservation`
+* `-i`, `--id`
+* `-d`, `--delete`
+* `-s`, `--status`
+* `-l`, `--list`
+* `-h`, `--help`
diff --git a/docs/conf.py b/docs/source/conf.py
similarity index 92%
rename from docs/conf.py
rename to docs/source/conf.py
index cfc5f6f6..abd8317a 100644
--- a/docs/conf.py
+++ b/docs/source/conf.py
@@ -14,10 +14,10 @@
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
-extensions = []
+extensions = ["myst_parser"]
 
 templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = []
 
 
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..1d6a49cc
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,41 @@
+.. pysqa documentation master file, created by
+   sphinx-quickstart on Thu May 4 14:01:49 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+pysqa - a simple queue adapter for python
+=========================================
+
+High-performance computing (HPC) does not have to be hard. In this context the aim of pysqa is to make the submission of calculations to an HPC cluster as easy as starting another subprocess locally. This is achieved based on the assumption that even though modern HPC queuing systems offer a wide range of different configuration options, most users submit the majority of their jobs with very similar parameters.
+
+Therefore, in pysqa users define submission script templates once and reuse them to submit many different calculations or workflows. These templates are written in the jinja2 template language, so existing submission scripts can easily be converted into templates. In addition to the submission of new jobs to the queuing system, pysqa also allows users to track the progress of their jobs, delete them or enable reservations using the built-in functionality of the queuing system.
+
+Features
+--------
+The core feature of pysqa is the communication with an HPC queuing system. This includes:
+
+* Submission of new calculations to the queuing system.
+* Listing of calculations currently waiting or running on the queuing system.
+* Deletion of calculations which are currently waiting or running on the queuing system.
+* Listing of the available queue templates created by the user.
+* Restriction of templates to a specific number of cores, run time or other computing resources, with integrated checks that a given calculation stays within these restrictions.
+
+In addition to these core features, pysqa is continuously extended to support more use cases for a larger group of users. These new features include the support for remote queuing systems:
+
+* Remote connection via the secure shell protocol to access remote HPC clusters.
+* Transfer of files to and from remote HPC clusters, based on a predefined mapping of the remote file system into the local file system.
+* Support for both individual connections as well as continuous connections, depending on the network availability.
+
+Finally, work is currently in progress to support a combination of multiple local and remote queuing systems from within pysqa, which are represented to the user as a single resource.
+
+Documentation
+-------------
+
+.. toctree::
+   :maxdepth: 2
+
+   installation
+   queue
+   python
+   command
+   advanced
diff --git a/docs/source/installation.md b/docs/source/installation.md
new file mode 100644
index 00000000..806fc646
--- /dev/null
+++ b/docs/source/installation.md
@@ -0,0 +1,24 @@
+# Installation
+The `pysqa` package can be installed either via `pip` or `conda`. While most HPC systems use Linux these days, the `pysqa` package can be installed on all major operating systems. In particular, for connections to remote HPC clusters `pysqa` has to be installed on both the local system and the remote HPC cluster. In this case it is highly recommended to use the same version of `pysqa` on both systems.
+
+## pypi-based installation
+`pysqa` can be installed from the Python package index (PyPI) using the following command:
+```
+pip install pysqa
+```
+On PyPI the `pysqa` package exists in three different versions:
+
+* `pip install pysqa` - base version - with minimal requirements, it only depends on `jinja2`, `pandas` and `pyyaml`.
+* `pip install pysqa[sge]` - Sun Grid Engine (SGE) version - in addition to the base dependencies this installs `defusedxml`, which is required to parse the `xml` files from `qstat`.
+* `pip install pysqa[remote]` - remote version - in addition to the base dependencies this installs `paramiko` and `tqdm` to connect to remote HPC clusters using SSH and report the progress of the data transfer visually.
+
+## conda-based installation
+The `conda` package combines all dependencies in one package:
+```
+conda install -c conda-forge pysqa
+```
+If resolving the dependencies with `conda` gets slow, it is recommended to use `mamba` instead of `conda`. In this case `pysqa` can be installed using:
+```
+mamba install -c conda-forge pysqa
+```
+
diff --git a/docs/source/python.md b/docs/source/python.md
new file mode 100644
index 00000000..64b6079e
--- /dev/null
+++ b/docs/source/python.md
@@ -0,0 +1,7 @@
+# Python Interface
+
+```
+from pysqa import QueueAdapter
+```
+
+
diff --git a/docs/source/queue.md b/docs/source/queue.md
new file mode 100644
index 00000000..77abc1b3
--- /dev/null
+++ b/docs/source/queue.md
@@ -0,0 +1,174 @@
+# Queuing Systems
+`pysqa` is based on the idea of reusable templates. These templates are defined in the `jinja2` templating language. By default `pysqa` expects to find these templates in `~/.queues`; still, it is also possible to store them in a different directory.
+
+In this directory `pysqa` expects to find one queue configuration and one jinja2 template per queue. The `queue.yaml` file defines the available queues and their restrictions in terms of the minimum and maximum number of CPU cores, required memory or run time. In addition, this file defines the type of the queuing system and the default queue.
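+
+If the templates are stored somewhere else, the configuration directory can be passed explicitly when the adapter is created. A minimal sketch, assuming a hypothetical project-specific location:
+```
+from pysqa import QueueAdapter
+
+# load queue.yaml and the submission script templates from a custom
+# directory instead of the default ~/.queues
+qa = QueueAdapter(directory="/path/to/project/queues")
+```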
+
+A typical `queue.yaml` file looks like this:
+```
+queue_type: <type of the queuing system>
+queue_primary: <name of the default queue>
+queues:
+  <queue name>: {
+    cores_max: <maximum number of cores>,
+    cores_min: <minimum number of cores>,
+    run_time_max: <maximum run time>,
+    script: <name of the submission script template>
+  }
+```
+The `queue.yaml` files and some templates for the most common queuing systems are defined below. By default `pysqa` supports the following variables for the submission script templates:
+
+* `job_name` - the name of the calculation which appears on the queuing system
+* `working_directory` - the directory on the file system the calculation is executed in
+* `cores` - the number of cores used for the calculation
+* `memory_max` - the amount of memory requested for the total calculation
+* `run_time_max` - the run time requested for a given calculation - typically in seconds
+* `command` - the command which is executed on the queuing system
+
+Beyond these standardized keywords, additional flags can be added to the template which are then available through the Python interface.
+
+## Flux
+For the flux framework the `queue.yaml` file defines the `queue_type` as `FLUX`:
+```
+queue_type: FLUX
+queue_primary: flux
+queues:
+  flux: {cores_max: 64, cores_min: 1, run_time_max: 172800, script: flux.sh}
+```
+The queue named `flux` is defined based on a submission script template named `flux.sh` with the following content:
+```
+#!/bin/bash
+#flux: -n{{cores}} --job-name={{job_name}} --env=CORES={{cores}} --output=time.out --error=error.out
+{{command}}
+```
+In this case only the number of cores `cores`, the name of the job `job_name` and the command `command` are communicated.
+
+## LSF
+For the Load Sharing Facility (LSF) framework from IBM the `queue.yaml` file defines the `queue_type` as `LSF`:
+```
+queue_type: LSF
+queue_primary: lsf
+queues:
+  lsf: {cores_max: 100, cores_min: 10, run_time_max: 259200, script: lsf.sh}
+```
+The queue named `lsf` is defined based on a submission script template named `lsf.sh` with the following content:
+```
+#!/bin/bash
+#BSUB -q queue
+#BSUB -J {{job_name}}
+#BSUB -o time.out
+#BSUB -n {{cores}}
+#BSUB -cwd {{working_directory}}
+#BSUB -e error.out
+{%- if run_time_max %}
+#BSUB -W {{run_time_max}}
+{%- endif %}
+{%- if memory_max %}
+#BSUB -M {{memory_max}}
+{%- endif %}
+
+{{command}}
+```
+In this case the name of the job `job_name`, the number of cores `cores`, the working directory of the job `working_directory` and the command that is executed `command` are defined as mandatory inputs. Beyond these, two optional inputs can be defined, namely the maximum run time for the job `run_time_max` and the maximum memory used by the job `memory_max`.
+
+## MOAB
+For the Maui Cluster Scheduler the `queue.yaml` file defines the `queue_type` as `MOAB`:
+```
+queue_type: MOAB
+queue_primary: moab
+queues:
+  moab: {cores_max: 100, cores_min: 10, run_time_max: 259200, script: moab.sh}
+```
+The queue named `moab` is defined based on a submission script template named `moab.sh` with the following content:
+```
+#!/bin/bash
+
+{{command}}
+```
+Currently, no complete template for the Maui Cluster Scheduler is available.
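+Until one is added, a minimal, untested sketch of what such a template could look like is shown below; the `#MSUB` directives are an assumption and not part of the templates shipped with `pysqa`:
+```
+#!/bin/bash
+# NOTE: hypothetical template - verify the directives against your Moab installation
+#MSUB -N {{job_name}}
+#MSUB -d {{working_directory}}
+{%- if cores %}
+#MSUB -l procs={{cores}}
+{%- endif %}
+{%- if run_time_max %}
+#MSUB -l walltime={{run_time_max}}
+{%- endif %}
+
+{{command}}
+```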
+
+## SGE
+For the Sun Grid Engine (SGE) the `queue.yaml` file defines the `queue_type` as `SGE`:
+```
+queue_type: SGE
+queue_primary: sge
+queues:
+  sge: {cores_max: 1280, cores_min: 40, run_time_max: 259200, script: sge.sh}
+```
+The queue named `sge` is defined based on a submission script template named `sge.sh` with the following content:
+```
+#!/bin/bash
+#$ -N {{job_name}}
+#$ -wd {{working_directory}}
+{%- if cores %}
+#$ -pe impi_hy* {{cores}}
+{%- endif %}
+{%- if memory_max %}
+#$ -l h_vmem={{memory_max}}
+{%- endif %}
+{%- if run_time_max %}
+#$ -l h_rt={{run_time_max}}
+{%- endif %}
+#$ -o time.out
+#$ -e error.out
+
+{{command}}
+```
+In this case the name of the job `job_name`, the number of cores `cores`, the working directory of the job `working_directory` and the command that is executed `command` are defined as mandatory inputs. Beyond these, two optional inputs can be defined, namely the maximum run time for the job `run_time_max` and the maximum memory used by the job `memory_max`.
+
+## SLURM
+For the Simple Linux Utility for Resource Management (SLURM) the `queue.yaml` file defines the `queue_type` as `SLURM`:
+```
+queue_type: SLURM
+queue_primary: slurm
+queues:
+  slurm: {cores_max: 100, cores_min: 10, run_time_max: 259200, script: slurm.sh}
+```
+The queue named `slurm` is defined based on a submission script template named `slurm.sh` with the following content:
+```
+#!/bin/bash
+#SBATCH --output=time.out
+#SBATCH --job-name={{job_name}}
+#SBATCH --chdir={{working_directory}}
+#SBATCH --get-user-env=L
+#SBATCH --partition=slurm
+{%- if run_time_max %}
+#SBATCH --time={{ [1, run_time_max // 60]|max }}
+{%- endif %}
+{%- if memory_max %}
+#SBATCH --mem={{memory_max}}G
+{%- endif %}
+#SBATCH --cpus-per-task={{cores}}
+
+{{command}}
+```
+In this case the name of the job `job_name`, the number of cores `cores`, the working directory of the job `working_directory` and the command that is executed `command` are defined as mandatory inputs. Beyond these, two optional inputs can be defined, namely the maximum run time for the job `run_time_max` and the maximum memory used by the job `memory_max`.
+
+## TORQUE
+For the Terascale Open-source Resource and Queue Manager (TORQUE) the `queue.yaml` file defines the `queue_type` as `TORQUE`:
+```
+queue_type: TORQUE
+queue_primary: torque
+queues:
+  torque: {cores_max: 100, cores_min: 10, run_time_max: 259200, script: torque.sh}
+```
+The queue named `torque` is defined based on a submission script template named `torque.sh` with the following content:
+```
+#!/bin/bash
+#PBS -q normal
+#PBS -l ncpus={{cores}}
+#PBS -N {{job_name}}
+{%- if memory_max %}
+#PBS -l mem={{ [16, memory_max]| int |max }}GB
+{%- endif %}
+{%- if run_time_max %}
+#PBS -l walltime={{ [1*3600, run_time_max*3600]|max }}
+{%- endif %}
+#PBS -l wd
+#PBS -l software=vasp
+#PBS -l storage=scratch/a01+gdata/a01
+#PBS -P a01
+
+{{command}}
+```
+In this case the name of the job `job_name`, the number of cores `cores`, the working directory of the job `working_directory` and the command that is executed `command` are defined as mandatory inputs. Beyond these, two optional inputs can be defined, namely the maximum run time for the job `run_time_max` and the maximum memory used by the job `memory_max`.
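+
+As noted above, additional flags in a template are exposed through the Python interface via extra keyword arguments. A hedged sketch (the `account` flag is a hypothetical example and requires a matching `{{account}}` placeholder in the submission script template):
+```
+from pysqa import QueueAdapter
+
+qa = QueueAdapter(directory="~/.queues")
+# "account" is not one of the standardized variables - it is only rendered
+# if the template contains a corresponding {{account}} placeholder
+queue_id = qa.submit_job(
+    queue="slurm",
+    job_name="example",
+    working_directory=".",
+    cores=10,
+    command="python test.py",
+    account="a01",
+)
+```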
From b8df9bcfe1e8ec163694bc2b3be3b1a3eba7e28d Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Thu, 4 May 2023 19:02:31 -0600 Subject: [PATCH 3/6] Add build environment for documentation --- .ci_support/environment-docs.yml | 11 +++++++++++ .readthedocs.yml | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 .ci_support/environment-docs.yml create mode 100644 .readthedocs.yml diff --git a/.ci_support/environment-docs.yml b/.ci_support/environment-docs.yml new file mode 100644 index 00000000..e9e0c2b2 --- /dev/null +++ b/.ci_support/environment-docs.yml @@ -0,0 +1,11 @@ +channels: +- conda-forge +dependencies: + - nbsphinx + - sphinx <6.1 + - defusedxml + - pandas + - pyyaml + - jinja2 + - paramiko + - tqdm \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..b4de3480 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,28 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +build: + os: "ubuntu-20.04" + tools: + python: "mambaforge-4.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +formats: [] + +# Install pyiron from conda +conda: + environment: .ci_support/environment-docs.yml + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - method: pip + path: . \ No newline at end of file From ed45f36bcf571bc6f488c96fc520257495908f1e Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Thu, 4 May 2023 19:05:53 -0600 Subject: [PATCH 4/6] Update environment-docs.yml --- .ci_support/environment-docs.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.ci_support/environment-docs.yml b/.ci_support/environment-docs.yml index e9e0c2b2..b8b88cfe 100644 --- a/.ci_support/environment-docs.yml +++ b/.ci_support/environment-docs.yml @@ -2,10 +2,5 @@ channels: - conda-forge dependencies: - nbsphinx - - sphinx <6.1 - - defusedxml - - pandas - - pyyaml - - jinja2 - - paramiko - - tqdm \ No newline at end of file + - sphinx + - myst-parser From 878cc90813920fca561c76ee31bb8d75b3055cf5 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Thu, 4 May 2023 19:52:20 -0600 Subject: [PATCH 5/6] Update current state of the documentation --- docs/source/advanced.md | 53 +++++++++++++++++++++++++++++++++++++++++ docs/source/command.md | 6 +++++ docs/source/debug.md | 3 +++ docs/source/python.md | 47 +++++++++++++++++++++++++++++++++++- 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 docs/source/debug.md diff --git a/docs/source/advanced.md b/docs/source/advanced.md index e1c1df26..50739468 100644 --- a/docs/source/advanced.md +++ b/docs/source/advanced.md @@ -1,5 +1,58 @@ # Advanced Configuration ## Remote HPC Configuration +`queue.yaml` file for remote access: +``` +queue_type: REMOTE +queue_primary: remote +ssh_host: hpc-cluster.university.edu +ssh_username: hpcuser +known_hosts: ~/.ssh/known_hosts +ssh_key: ~/.ssh/id_rsa +ssh_remote_config_dir: /u/share/pysqa/resources/queues/ +ssh_remote_path: /u/hpcuser/remote/ +ssh_local_path: /home/localuser/projects/ +ssh_continous_connection: True +ssh_delete_file_on_remote: False +queues: + remote: {cores_max: 100, cores_min: 10, run_time_max: 259200} +``` +In addition to `queue_type`, `queue_primary` and `queues` this also has the following required 
keywords: + +* `ssh_host` the remote HPC login node to connect to +* `ssh_username` the username on the HPC login node +* `known_hosts` the local file of known hosts which needs to contain the `ssh_host` defined above. +* `ssh_key` the local key for the SSH connection +* `ssh_remote_config_dir` the `pysqa` configuration directory on the remote HPC cluster +* `ssh_remote_path` the remote directory on the HPC cluster to transfer calculations to +* `ssh_local_path` the local directory to transfer calculations from + +And optional keywords: + +* `ssh_delete_file_on_remote` specify whether files on the remote HPC should be deleted after they are transferred back to the local system - defaults to `True` +* `ssh_port` the port used for the SSH connection on the remote HPC cluster - defaults to `22` + +A definition of the `queues` in the local system is required to enable the parameter checks locally. Still it is sufficient to only store the individual submission script templates only on the remote HPC. ## Access to Multiple HPCs +To support multiple remote HPC clusters additional functionality was added to `pysqa`. + +Namely, a `clusters.yaml` file can be defined in the configuration directory, which defines multiple `queue.yaml` files for different clusters: +``` +cluster_primary: local_slurm +cluster: { + local_slurm: local_slurm_queues.yaml, + remote_slurm: remote_queues.yaml +} +``` +These `queue.yaml` files can again include all the functionality defined previously, including the configuration for remote connection using SSH. + +Furthermore, the `QueueAdapter` class was extended with the following two functions: +``` +qa.list_clusters() +``` +To list the available clusters in the configuration and: +``` +qa.switch_cluster(cluster_name) +``` +To switch from one cluster to another, with the `cluster_name` providing the name of the cluster like `local_slurm` and `remote_slurm` in the configuration above. \ No newline at end of file diff --git a/docs/source/command.md b/docs/source/command.md index 025507a4..3fe1db19 100644 --- a/docs/source/command.md +++ b/docs/source/command.md @@ -1,7 +1,13 @@ # Command Line Interface +The command line interface implements a subset of the functionality of the python interface. While it can be used locally to check the status of your calculation, the primary use case is accessing the `pysqa` installation on a remote HPC cluster from your local `pysqa` installation. Still here the local execution of the commands is discussed. + +Starting with the `--help` command: ``` python -m pysqa --help ``` +This prints a short version of this documentation page. + +Besides the `--help` command there are * `-f`, `--config_directory` * `-p`, `--submit` diff --git a/docs/source/debug.md b/docs/source/debug.md new file mode 100644 index 00000000..0beea3ec --- /dev/null +++ b/docs/source/debug.md @@ -0,0 +1,3 @@ +# Debugging +The configuration of a queuing system adapter, in particular in a remote configuration with a local installation of `pysqa` communicating to a remote installation on your HPC can be tricky. To simplify the process `pysqa` provides a series of utility functions. + diff --git a/docs/source/python.md b/docs/source/python.md index 64b6079e..a7cf8de0 100644 --- a/docs/source/python.md +++ b/docs/source/python.md @@ -1,7 +1,52 @@ # Python Interface - +The `pysqa` package primarily defines one class, that is the `QueueAdapter`. 
It loads the configuration from a configuration directory, initializes the corresponding adapter for the specific queuing system and provides a high-level interface for users to interact with the queuing system. The `QueueAdapter` can be imported using:
 ```
 from pysqa import QueueAdapter
 ```
+After the initial import the class is initialized using the configuration directory specified by the `directory` parameter, which defaults to `"~/.queues"`:
+```
+qa = QueueAdapter(directory="~/.queues")
+```
+Another optional parameter of the `QueueAdapter` class is `execute_command`; still, this is primarily used for testing purposes to call the underlying shell commands.
+
+## List available queues
+List the available queues as a list of queue names:
+```
+qa.queue_list
+```
+List the available queues as a pandas dataframe:
+```
+qa.queue_view
+```
+
+## Submit job to queue
+Submit a job to the queue - if no queue is specified it is submitted to the default queue defined in the queue configuration:
+```
+qa.submit_job(
+    queue=None,
+    job_name=None,
+    working_directory=None,
+    cores=None,
+    memory_max=None,
+    run_time_max=None,
+    dependency_list=None,
+    command='python test.py',
+    **kwargs
+)
+```
+
+## Show jobs in queue
+Get the status of all jobs currently handled by the queuing system:
+```
+qa.get_queue_status()
+```
+Get the status of a specific job from the queuing system:
+```
+qa.get_status_of_job(process_id=1234)
+```
+
+## Delete job from queue
+Delete a job from the queuing system:
+```
+qa.delete_job(process_id=1234)
+```
\ No newline at end of file

From 148d3d8bfc7c0673300b186a5a5c7eb67f307bfd Mon Sep 17 00:00:00 2001
From: Jan Janssen
Date: Thu, 4 May 2023 23:10:49 -0600
Subject: [PATCH 6/6] Update documentation

---
 docs/source/advanced.md |  5 ++-
 docs/source/command.md  | 92 ++++++++++++++++++++++++++++---------
 docs/source/debug.md    |  8 +++-
 docs/source/index.rst   |  1 +
 docs/source/python.md   | 23 ++++++++++-
 pysqa/queueadapter.py   | 19 +++++----
 6 files changed, 112 insertions(+), 36 deletions(-)

diff --git a/docs/source/advanced.md b/docs/source/advanced.md
index 50739468..fd6714b2 100644
--- a/docs/source/advanced.md
+++ b/docs/source/advanced.md
@@ -1,7 +1,8 @@
 # Advanced Configuration
+Initially `pysqa` was only designed to interact with the local queuing systems of an HPC cluster. This functionality has recently been extended to support remote HPC clusters in addition to local HPC clusters. These two developments, the support for remote HPC clusters and the support for multiple clusters in `pysqa`, are discussed in the following. Both of these features are under active development, so this part of the interface might change more frequently than the rest.
## Remote HPC Configuration -`queue.yaml` file for remote access: +Remote clusters can be defined in the `queue.yaml` file by setting the `queue_type` to `REMOTE`: ``` queue_type: REMOTE queue_primary: remote @@ -17,7 +18,7 @@ ssh_delete_file_on_remote: False queues: remote: {cores_max: 100, cores_min: 10, run_time_max: 259200} ``` -In addition to `queue_type`, `queue_primary` and `queues` this also has the following required keywords: +In addition to `queue_type`, `queue_primary` and `queues` parameters, this also has the following required keywords: * `ssh_host` the remote HPC login node to connect to * `ssh_username` the username on the HPC login node diff --git a/docs/source/command.md b/docs/source/command.md index 3fe1db19..da5aaef9 100644 --- a/docs/source/command.md +++ b/docs/source/command.md @@ -1,27 +1,73 @@ # Command Line Interface -The command line interface implements a subset of the functionality of the python interface. While it can be used locally to check the status of your calculation, the primary use case is accessing the `pysqa` installation on a remote HPC cluster from your local `pysqa` installation. Still here the local execution of the commands is discussed. +The command line interface implements a subset of the functionality of the python interface. While it can be used locally to check the status of your calculation, the primary use case is accessing the `pysqa` installation on a remote HPC cluster from your local `pysqa` installation. Still here the local execution of the commands is discussed. -Starting with the `--help` command: +The available options are the submission of new jobs to the queuing system using the submit option `--submit`, enabling reservation for a job already submitted using the `--reservation` option, listing jobs on the queuing using the status option `--status`, deleting a job from the queuing system using the delete option `--delete`, listing files in the working directory using the list option `--list` and the help option `--help` to print a summary of the available options. + +## Submit job +Submission of jobs to the queuing system with the submit option `--submit` is similar to the submit job function `QueueAdapter().submit_job()`. Example call to submit the `hostname` command to the default queue: ``` -python -m pysqa --help +python -m pysqa --submit --command hostname +``` +The options used and their short forms are: +* `-p`, `--submit` the submit option enables the submission of a job to the queuing system +* `-c`, `--command` the command that is executed as part of the job + +Additional options for the submission of the job with their short forms are: +* `-f`, `--config_directory` the directory which contains the `pysqa` configuration, by default `~/.queues`. +* `-q`, `--queue` the queue the job is submitted to. If this option is not defined the `primary_queue` defined in the configuration is used. +* `-j`, `--job_name` the name of the job submitted to the queuing system. +* `-w`, `--working_directory` the working directory the job submitted to the queuing system is executed in. +* `-n`, `--cores` the number of cores used for the calculation. If the cores are not defined the minimum number of cores defined for the selected queue are used. +* `-m`, `--memory` the memory used for the calculation. +* `-t`, `--run_time` the run time for the calculation. If the run time is not defined the maximum run time defined for the selected queue is used. +* `-b`, `--dependency` other jobs the calculation depends on. 
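+
+Taken together, a hypothetical submission call combining several of these options could look like the following; the queue name, working directory and core count are illustrative values only:
+```
+# all values below are placeholders - adjust them to your own queue configuration
+python -m pysqa --submit --queue slurm --job_name example --working_directory /home/hpcuser/projects/demo --cores 10 --command hostname
+```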
+ +## Enable reservation +Enable reservation for a job already submitted to the queuing system using the reservation option `--reservation` is similar to the enable reservation function `QueueAdapter().enable_reservation()`. Example call to enable the reservation for a job with the id `123`: +``` +python -m pysqa --reservation --id 123 ``` -This prints a short version of this documentation page. - -Besides the `--help` command there are - -* `-f`, `--config_directory` -* `-p`, `--submit` -* `-q`, `--queue` -* `-j`, `--job_name` -* `-w`, `--working_directory` -* `-n`, `--cores` -* `-m`, `--memory` -* `-t`, `--run_time` -* `-b`, `--dependency` -* `-c`, `--command` -* `-r`, `--reservation` -* `-i`, `--id` -* `-d`, `--delete` -* `-s`, `--status` -* `-l`, `--list` -* `-h`, `--help` +The options used and their short forms are: +* `-r`, `--reservation` the reservation option enables a reservation for a specific job. +* `-i`, `--id` the id option specifies the job id of the job which should be added to the reservation. + +Additional options for enabling the reservation with their short forms are: +* `-f`, `--config_directory` the directory which contains the `pysqa` configuration, by default `~/.queues`. + +## List jobs +List jobs on the queuing system option `--status`, list calculations currently running and waiting on the queuing system for all users on the HPC cluster: +``` +python -m pysqa --status +``` +The options used and their short forms are: +* `-s`, `--status` the status option lists the status of all calculation currently running and waiting on the queuing system. + +Additional options for listing jobs on the queuing system with their short forms are: +* `-f`, `--config_directory` the directory which contains the `pysqa` configuration, by default `~/.queues`. + +## Delete job +The delete job option `--delete` deletes a job from the queuing system: +``` +python -m pysqa --delete --id 123 +``` +The options used and their short forms are: +* `-d`, `--delete` the delete option enables the deletion of a job from the queuing system. +* `-i`, `--id` the id option specifies the job id of the job which should be deleted. + +Additional options for deleting jobs from the queuing system with their short forms are: +* `-f`, `--config_directory` the directory which contains the `pysqa` configuration, by default `~/.queues`. + +## List files +The list files option `--list` lists the files in working directory: +``` +python -m pysqa --list --working_directory /path/on/remote/hpc +``` +The options used and their short forms are: +* `-l`, `--list` the list files option lists the files in the working directory. +* `-w`, `--working_directory` the working directory defines the folder whose files are listed. + +## Help +The help option `--help` prints a short version of this documentation page: +``` +python -m pysqa --help +``` \ No newline at end of file diff --git a/docs/source/debug.md b/docs/source/debug.md index 0beea3ec..2051ee36 100644 --- a/docs/source/debug.md +++ b/docs/source/debug.md @@ -1,3 +1,9 @@ # Debugging -The configuration of a queuing system adapter, in particular in a remote configuration with a local installation of `pysqa` communicating to a remote installation on your HPC can be tricky. To simplify the process `pysqa` provides a series of utility functions. +The configuration of a queuing system adapter, in particular in a remote configuration with a local installation of `pysqa` communicating to a remote installation on your HPC can be tricky. 
To simplify the process, `pysqa` provides a series of utility functions:
+
+* Login to the remote HPC cluster and import `pysqa` in a Python shell.
+* Validate the queue configuration by importing the queue adapter using `from pysqa import QueueAdapter` and then initializing the object from the configuration directory with `qa = QueueAdapter(directory="~/.queues")`. The current configuration can be printed using `qa.config`.
+* Try to submit a calculation which prints the hostname from the Python shell on the remote HPC cluster using `qa.submit_job(command="hostname")`.
+* If this works, the next step is to try the same on the command line using `python -m pysqa --submit --command hostname`.
+
+This is the same command the local `pysqa` instance calls on the `pysqa` instance on the remote HPC cluster, so if the steps above were executed successfully, the remote HPC configuration seems to be correct. The final step is validating the local configuration to check that the SSH connection is successfully established and maintained.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1d6a49cc..18629183 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -39,3 +39,4 @@ Documentation
    python
    command
    advanced
+   debug
\ No newline at end of file
diff --git a/docs/source/python.md b/docs/source/python.md
index a7cf8de0..63ea03eb 100644
--- a/docs/source/python.md
+++ b/docs/source/python.md
@@ -34,19 +34,38 @@ qa.submit_job(
     **kwargs
 )
 ```
+The only required parameter is:
+* `command` the command that is executed as part of the job
+
+Additional options for the submission of the job are:
+* `queue` the queue the job is submitted to. If this option is not defined the `queue_primary` defined in the configuration is used.
+* `job_name` the name of the job submitted to the queuing system.
+* `working_directory` the working directory the job submitted to the queuing system is executed in.
+* `cores` the number of cores used for the calculation. If the cores are not defined the minimum number of cores defined for the selected queue is used.
+* `memory_max` the memory used for the calculation.
+* `run_time_max` the run time for the calculation. If the run time is not defined the maximum run time defined for the selected queue is used.
+* `dependency_list` other jobs the calculation depends on.
+* `**kwargs` allows writing additional parameters to the job submission script if they are available in the corresponding template.
 
 ## Show jobs in queue
 Get the status of all jobs currently handled by the queuing system:
 ```
 qa.get_queue_status()
 ```
-Get the status of a specific job from the queuing system:
+With the additional parameter `user`, a specific user can be selected to list only the jobs of this user.
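+For example (the username below is a placeholder):
+```
+# list only the jobs of a single, hypothetical user
+qa.get_queue_status(user="hpcuser")
+```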
+Analogously, the jobs of the current user can be listed with:
+```
+qa.get_status_of_my_jobs()
+```
+
+Finally, the status of a specific job with the queue id `1234` can be received from the queuing system using:
 ```
 qa.get_status_of_job(process_id=1234)
 ```
 
 ## Delete job from queue
-Delete a job from the queuing system:
+Delete a job with the queue id `1234` from the queuing system:
 ```
 qa.delete_job(process_id=1234)
-```
\ No newline at end of file
+```
\ No newline at end of file
diff --git a/pysqa/queueadapter.py b/pysqa/queueadapter.py
index 95a02dfb..1506a0c7 100644
--- a/pysqa/queueadapter.py
+++ b/pysqa/queueadapter.py
@@ -152,16 +152,19 @@ def submit_job(
         Args:
             queue (str/None): Name of the queue to submit to, must be one of the names configured for this adapter
-            job_name (str/None): Name of the job for the underlying queuing system
-            working_directory (str/None): Directory to run the job in
-            cores (int/None): Number of hardware threads requested
-            memory_max (int/None): Amount of memory requested per node in GB
-            run_time_max (int/None): Maximum runtime in seconds
-            dependency_list(list[str]/None: Job ids of jobs to be completed before starting
-            command (str/None): shell command to run in the job
+                              (optional)
+            job_name (str/None): Name of the job for the underlying queuing system (optional)
+            working_directory (str/None): Directory to run the job in (optional)
+            cores (int/None): Number of hardware threads requested (optional)
+            memory_max (int/None): Amount of memory requested per node in GB (optional)
+            run_time_max (int/None): Maximum runtime in seconds (optional)
+            dependency_list (list[str]/None): Job ids of jobs to be completed before starting (optional)
+            command (str/None): shell command to run in the job
+            **kwargs: allows writing additional parameters to the job submission script if they are available in the
+                      corresponding template.
 
         Returns:
-            int:
+            int: Job id received from the queuing system for the job which was submitted
         """
         return self._adapter.submit_job(
             queue=queue,