diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 055fcfde..faed63ad 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,11 +11,25 @@ name: build on: [push, pull_request, workflow_dispatch] jobs: + get-python-versions: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Install jq + run: sudo apt-get install -y jq + - name: Get Python versions + id: set-matrix + run: | + echo "matrix=$(jq -c . python_versions.json)" >> $GITHUB_OUTPUT + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} build: + needs: get-python-versions runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.11", "3.12"] + python-version: ${{ fromJSON(needs.get-python-versions.outputs.matrix) }} steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/update_readme.yml b/.github/workflows/update_readme.yml new file mode 100644 index 00000000..212ef5d0 --- /dev/null +++ b/.github/workflows/update_readme.yml @@ -0,0 +1,33 @@ +# ----------------------------------------------------------------------------- +# - invoked on push to any branch +# ----------------------------------------------------------------------------- + name: update README + + on: push + + jobs: + update-readme: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.11 + - name: Update README + run: | + pip install packaging + python update_readme.py + - name: Commit and push changes + run: | + git config --local user.email "action@github.com" + git config --local user.name "github-actions" + git diff --quiet && git diff --staged --quiet || ( + git add README.rst + git commit -am "update README with supported Python versions" + git pull --rebase origin ${{ github.ref_name }} + git push origin ${{ github.ref_name }} + ) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a21f9c34..59cad7c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,9 @@ +**0.1.3 - 1/7/25** + + - Validate currently-installed python version during setup + - Automatically update README when supported python versions change + - Automatically extract github actions supported python version test matrix + **0.1.2 - 12/16/24** - Add optional arg to pass allowable schemas to the Config constructor diff --git a/Makefile b/Makefile index bdb00c4d..c22a7a01 100644 --- a/Makefile +++ b/Makefile @@ -63,31 +63,25 @@ install: # Install setuptools, install this package in editable mode format: setup.py pyproject.toml $(MAKE_SOURCES) # Run the code formatter and import sorter -black $(LOCATIONS) -isort $(LOCATIONS) - @echo "Ignore, Created by Makefile, `date`" > $@ lint: .flake8 .bandit $(MAKE_SOURCES) # Run the code linter and package security vulnerability checker -flake8 $(LOCATIONS) -safety check - @echo "Ignore, Created by Makefile, `date`" > $@ typecheck: pytype.cfg $(MAKE_SOURCES) # Run the type checker -pytype --config=pytype.cfg $(LOCATIONS) - @echo "Ignore, Created by Makefile, `date`" > $@ e2e: $(MAKE_SOURCES) # Run the e2e tests export COVERAGE_FILE=./output/.coverage.e2e pytest -vvv --runslow --cov --cov-report term --cov-report html:./output/htmlcov_e2e tests/e2e/ - @echo "Ignore, Created by Makefile, `date`" > $@ integration: $(MAKE_SOURCES) # Run unit tests export COVERAGE_FILE=./output/.coverage.integration pytest -vvv --runslow 
--cov --cov-report term --cov-report html:./output/htmlcov_integration tests/integration/ - @echo "Ignore, Created by Makefile, `date`" > $@ unit: $(MAKE_SOURCES) # Run unit tests export COVERAGE_FILE=./output/.coverage.unit pytest -vvv --runslow --cov --cov-report term --cov-report html:./output/htmlcov_unit tests/unit/ - @echo "Ignore, Created by Makefile, `date`" > $@ build-package: $(MAKE_SOURCES) # Build the package as a pip wheel pip install build @@ -96,7 +90,6 @@ build-package: $(MAKE_SOURCES) # Build the package as a pip wheel build-doc: $(MAKE_SOURCES) # Build the Sphinx docs $(MAKE) -C docs/ html - @echo "Ignore, Created by Makefile, `date`" > $@ clean: # Delete build artifacts and do any custom cleanup such as spinning down services @rm -rf format lint typecheck build-doc build-package unit e2e integration .pytest_cache .pytype diff --git a/README.rst b/README.rst index 5a3b4d46..157755db 100644 --- a/README.rst +++ b/README.rst @@ -7,7 +7,7 @@ entity resolution (ER) pipelines. .. _python_support: -Supported Python versions: 3.11, 3.12 +**Supported Python versions: 3.11, 3.12** .. _end_python_support: @@ -22,21 +22,20 @@ There are a few things to install in order to use this package: likely need to request it from your system admin. Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html -.. highlight:: console - Install graphviz via: - - :: + + .. code-block:: console $ conda install graphviz - Install EasyLink. - Option 1 - Install from PyPI with pip:: + Option 1 - Install from PyPI with pip:: $ pip install easylink - Option 2 - Build from source with pip:: + Option 2 - Build from source with pip:: $ git clone git@github.com:ihmeuw/easylink.git # or git clone https://github.com/ihmeuw/easylink.git $ cd easylink diff --git a/docs/nitpick-exceptions b/docs/nitpick-exceptions index e69de29b..4cc29772 100644 --- a/docs/nitpick-exceptions +++ b/docs/nitpick-exceptions @@ -0,0 +1,3 @@ +py:class LayeredConfigTree +py:class layered_config_tree.LayeredConfigTree +py:class layered_config_tree.main.LayeredConfigTree diff --git a/docs/source/api_reference/configuration.rst b/docs/source/api_reference/configuration.rst new file mode 100644 index 00000000..673b39fb --- /dev/null +++ b/docs/source/api_reference/configuration.rst @@ -0,0 +1 @@ +.. automodule:: easylink.configuration diff --git a/docs/source/api_reference/graph_components.rst b/docs/source/api_reference/graph_components.rst new file mode 100644 index 00000000..50f09d3b --- /dev/null +++ b/docs/source/api_reference/graph_components.rst @@ -0,0 +1 @@ +.. automodule:: easylink.graph_components diff --git a/docs/source/api_reference/implementation.rst b/docs/source/api_reference/implementation.rst new file mode 100644 index 00000000..4d6eeb25 --- /dev/null +++ b/docs/source/api_reference/implementation.rst @@ -0,0 +1 @@ +.. automodule:: easylink.implementation diff --git a/docs/source/api_reference/index.rst b/docs/source/api_reference/index.rst index bcb43b41..8840e798 100644 --- a/docs/source/api_reference/index.rst +++ b/docs/source/api_reference/index.rst @@ -1,2 +1,11 @@ API Reference ============= + +.. automodule:: easylink + +.. toctree:: + :maxdepth: 1 + :glob: + + * + */index diff --git a/docs/source/api_reference/pipeline.rst b/docs/source/api_reference/pipeline.rst new file mode 100644 index 00000000..a1820382 --- /dev/null +++ b/docs/source/api_reference/pipeline.rst @@ -0,0 +1 @@ +.. 
automodule:: easylink.pipeline diff --git a/docs/source/api_reference/pipeline_graph.rst b/docs/source/api_reference/pipeline_graph.rst new file mode 100644 index 00000000..8476e95b --- /dev/null +++ b/docs/source/api_reference/pipeline_graph.rst @@ -0,0 +1 @@ +.. automodule:: easylink.pipeline_graph diff --git a/docs/source/api_reference/pipeline_schema.rst b/docs/source/api_reference/pipeline_schema.rst new file mode 100644 index 00000000..8ae40e0a --- /dev/null +++ b/docs/source/api_reference/pipeline_schema.rst @@ -0,0 +1 @@ +.. automodule:: easylink.pipeline_schema diff --git a/docs/source/api_reference/pipeline_schema_constants/development.rst b/docs/source/api_reference/pipeline_schema_constants/development.rst new file mode 100644 index 00000000..ba21c009 --- /dev/null +++ b/docs/source/api_reference/pipeline_schema_constants/development.rst @@ -0,0 +1 @@ +.. automodule:: easylink.pipeline_schema_constants.development diff --git a/docs/source/api_reference/pipeline_schema_constants/index.rst b/docs/source/api_reference/pipeline_schema_constants/index.rst new file mode 100644 index 00000000..81344678 --- /dev/null +++ b/docs/source/api_reference/pipeline_schema_constants/index.rst @@ -0,0 +1,10 @@ +PipelineSchema Constants +======================== + +.. automodule:: easylink.pipeline_schema_constants + +.. toctree:: + :maxdepth: 1 + :glob: + + * diff --git a/docs/source/api_reference/pipeline_schema_constants/tests.rst b/docs/source/api_reference/pipeline_schema_constants/tests.rst new file mode 100644 index 00000000..ced28c81 --- /dev/null +++ b/docs/source/api_reference/pipeline_schema_constants/tests.rst @@ -0,0 +1 @@ +.. automodule:: easylink.pipeline_schema_constants.tests diff --git a/docs/source/api_reference/rule.rst b/docs/source/api_reference/rule.rst new file mode 100644 index 00000000..cf37a8c2 --- /dev/null +++ b/docs/source/api_reference/rule.rst @@ -0,0 +1 @@ +.. automodule:: easylink.rule diff --git a/docs/source/api_reference/runner.rst b/docs/source/api_reference/runner.rst new file mode 100644 index 00000000..1e5c24a9 --- /dev/null +++ b/docs/source/api_reference/runner.rst @@ -0,0 +1 @@ +.. automodule:: easylink.runner diff --git a/docs/source/api_reference/step.rst b/docs/source/api_reference/step.rst new file mode 100644 index 00000000..71401277 --- /dev/null +++ b/docs/source/api_reference/step.rst @@ -0,0 +1 @@ +.. automodule:: easylink.step diff --git a/docs/source/api_reference/utilities/data_utils.rst b/docs/source/api_reference/utilities/data_utils.rst new file mode 100644 index 00000000..865c2df4 --- /dev/null +++ b/docs/source/api_reference/utilities/data_utils.rst @@ -0,0 +1,2 @@ +.. automodule:: easylink.utilities.data_utils + \ No newline at end of file diff --git a/docs/source/api_reference/utilities/general_utils.rst b/docs/source/api_reference/utilities/general_utils.rst new file mode 100644 index 00000000..33aa6c4e --- /dev/null +++ b/docs/source/api_reference/utilities/general_utils.rst @@ -0,0 +1,2 @@ +.. automodule:: easylink.utilities.general_utils + \ No newline at end of file diff --git a/docs/source/api_reference/utilities/index.rst b/docs/source/api_reference/utilities/index.rst new file mode 100644 index 00000000..1257675b --- /dev/null +++ b/docs/source/api_reference/utilities/index.rst @@ -0,0 +1,10 @@ +Utilities +========= + +.. automodule:: easylink.utilities + +.. 
toctree:: + :maxdepth: 1 + :glob: + + * diff --git a/docs/source/api_reference/utilities/paths.rst b/docs/source/api_reference/utilities/paths.rst new file mode 100644 index 00000000..aaca70de --- /dev/null +++ b/docs/source/api_reference/utilities/paths.rst @@ -0,0 +1,2 @@ +.. automodule:: easylink.utilities.paths + \ No newline at end of file diff --git a/docs/source/api_reference/utilities/validation_utils.rst b/docs/source/api_reference/utilities/validation_utils.rst new file mode 100644 index 00000000..0fb92a9c --- /dev/null +++ b/docs/source/api_reference/utilities/validation_utils.rst @@ -0,0 +1,2 @@ +.. automodule:: easylink.utilities.validation_utils + \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b25561be..030f3ed1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -198,6 +198,7 @@ "numpy": ("https://numpy.org/doc/stable/", None), "networkx": ("https://networkx.org/documentation/stable/", None), "layered_config_tree": ("https://layered-config-tree.readthedocs.io/en/latest/", None), + "loguru": ("https://loguru.readthedocs.io/en/stable/", None), } @@ -213,10 +214,16 @@ # Generate docs even if an item has no docstring. "undoc-members": True, - # Don't document things with a leading underscore. - "private-members": False, + # Document private members (things with a leading underscore). + "private-members": True, + # Show class inheritance. + "show-inheritance": True, } # Display type hints in the description instead of the signature. autodoc_typehints = "description" +# Mock problematic imports +autodoc_mock_imports = [ + "networkx", +] # -- nitpicky mode -------------------------------------------------------- diff --git a/docs/source/user_guide/cli.rst b/docs/source/user_guide/cli.rst index a8057fa8..09434266 100644 --- a/docs/source/user_guide/cli.rst +++ b/docs/source/user_guide/cli.rst @@ -4,8 +4,7 @@ Command Line Interface ====================== -.. automodule:: easylink.cli - .. 
click:: easylink.cli:easylink :prog: easylink - :show-nested: + :nested: full + :commands: run, generate-dag \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d787dc52..dcef0406 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ +[build-system] +requires = ["packaging", "setuptools"] + [tool.black] line_length = 94 diff --git a/python_versions.json b/python_versions.json new file mode 100644 index 00000000..99eeb9b6 --- /dev/null +++ b/python_versions.json @@ -0,0 +1 @@ +["3.11", "3.12"] \ No newline at end of file diff --git a/setup.py b/setup.py index 63c04e6f..236e8f7d 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,36 @@ #!/usr/bin/env python +import json import os +import sys +from packaging.version import parse from setuptools import find_packages, setup +with open("python_versions.json", "r") as f: + supported_python_versions = json.load(f) + +python_versions = [parse(v) for v in supported_python_versions] +min_version = min(python_versions) +max_version = max(python_versions) +if not ( + min_version <= parse(".".join([str(v) for v in sys.version_info[:2]])) <= max_version +): + py_version = ".".join([str(v) for v in sys.version_info[:3]]) + # NOTE: Python 3.5 does not support f-strings + error = ( + "\n--------------------------------------------\n" + "Error: EasyLink runs under python {min_version}-{max_version}.\n" + "You are running python {py_version}.\n".format( + min_version=min_version.base_version, + max_version=max_version.base_version, + py_version=py_version, + ) + + "--------------------------------------------\n" + ) + print(error, file=sys.stderr) + sys.exit(1) + + if __name__ == "__main__": base_dir = os.path.dirname(__file__) src_dir = os.path.join(base_dir, "src") @@ -39,10 +67,11 @@ "pytest-mock", ] doc_requirements = [ - "sphinx>=4.0,<8.0.0", - "sphinx-rtd-theme>=0.6", + "sphinx", + "sphinx-rtd-theme", "sphinx-autodoc-typehints", "sphinx-click", + "typing_extensions", ] lint_requirements = [ "black==22.3.0", @@ -54,6 +83,7 @@ name=about["__title__"], description=about["__summary__"], long_description=long_description, + long_description_content_type="text/x-rst", license=about["__license__"], url=about["__uri__"], author=about["__author__"], diff --git a/src/easylink/__init__.py b/src/easylink/__init__.py index 1e2ad34d..4a0a535e 100644 --- a/src/easylink/__init__.py +++ b/src/easylink/__init__.py @@ -3,7 +3,7 @@ EasyLink ======== -Research repository for the EasyLink ER ecosystem project. +Research repository for the EasyLink entity resolution (ER) ecosystem project. """ diff --git a/src/easylink/cli.py b/src/easylink/cli.py index c5f26222..79bbfeec 100644 --- a/src/easylink/cli.py +++ b/src/easylink/cli.py @@ -37,10 +37,10 @@ ), ), click.option( - "--timestamp/--no-timestamp", - default=True, - show_default=True, - help="Save the results in a timestamped sub-directory of --output-dir.", + "--no-timestamp", + is_flag=True, + default=False, + help="Do not save the results in a timestamped sub-directory of ``--output-dir``.", ), ] @@ -66,9 +66,8 @@ def easylink(): show_default=True, type=click.Path(exists=True, dir_okay=False, resolve_path=True), help=( - "Path to the specification yaml defining the computing environment to " - "run the pipeline on. If no value is passed, the pipeline will be run " - "locally." + "Path to the computing environment specification yaml. If no value is passed, " + "the pipeline will be run locally." 
), ) @click.option("-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True) @@ -83,15 +82,20 @@ def run( pipeline_specification: str, input_data: str, output_dir: str | None, - timestamp: bool, + no_timestamp: bool, computing_environment: str | None, verbose: int, with_debugger: bool, ) -> None: - """Run a pipeline from the command line.""" + """Runs a pipeline from the command line. + + In addition to running the pipeline, this command will also generate the directed + acyclic graph (DAG) image. If you only want to generate the image without actually + running the pipeline, use the ``easylink generate-dag`` command. + """ configure_logging_to_terminal(verbose) logger.info("Running pipeline") - results_dir = get_results_directory(output_dir, timestamp).as_posix() + results_dir = get_results_directory(output_dir, no_timestamp).as_posix() logger.info(f"Results directory: {results_dir}") # TODO [MIC-4493]: Add configuration validation @@ -114,11 +118,15 @@ def generate_dag( pipeline_specification: str, input_data: str, output_dir: str | None, - timestamp: bool, + no_timestamp: bool, ) -> None: - """Generate an image of the proposed pipeline DAG.""" + """Generates an image of the proposed pipeline directed acyclic graph (DAG). + + This command only generates the DAG image of the pipeline; it does not actually + run it. To run the pipeline, use the ``easylink run`` command. + """ logger.info("Generating DAG") - results_dir = get_results_directory(output_dir, timestamp).as_posix() + results_dir = get_results_directory(output_dir, no_timestamp).as_posix() logger.info(f"Results directory: {results_dir}") # TODO [MIC-4493]: Add configuration validation runner.main( diff --git a/src/easylink/configuration.py b/src/easylink/configuration.py index 19619ed3..d567b2d0 100644 --- a/src/easylink/configuration.py +++ b/src/easylink/configuration.py @@ -1,3 +1,13 @@ +""" +============= +Configuration +============= + +This module is responsible for managing an easylink run's configuration as defined +by various user-input specification files. + +""" + from collections import defaultdict from pathlib import Path from typing import Any @@ -23,6 +33,8 @@ }, } } +"""The default environment configuration settings.""" + SPARK_DEFAULTS = { "workers": { "num_workers": 2, @@ -32,24 +44,62 @@ }, "keep_alive": False, } +"""The default spark configuration settings.""" -# Allow some buffer so that slurm doesn't kill spark workers -SLURM_SPARK_MEM_BUFFER = 500 +SLURM_SPARK_MEM_BUFFER = 500 # MB class Config(LayeredConfigTree): """A container for configuration information. - This class combines the pipeline, input data, and computing environment - specifications into a single LayeredConfigTree object. It is also responsible - for validating these specifications. + A ``Config`` (which inherits from :class:`~layered_config_tree.LayeredConfigTree`) + is a container that includes the combination of the user-provided pipeline, + input data, and computing environment specifications. It is a nested + dictionary-like object that supports prioritized layers of configuration settings + as well as dot-notation access to its attributes. + + The ``Config`` is also responsible for various validation checks on the provided + specifications. If any of these are invalid, a validation error is raised with + as much information as can possibly be provided. + + Parameters + ---------- + config_params + A dictionary of all specifications required to run the pipeline. 
This + includes the pipeline, input data, and computing environment specifications, + as well as the results directory. + potential_schemas + A list of potential schemas to validate the pipeline configuration against. + This is primarily used for testing purposes. Defaults to the supported schemas. + + Attributes + ---------- + environment + The environment configuration, including computing environment, + container engine, implementation resources, and slurm- and spark-specific + requests. + pipeline + The pipeline configuration. + input_data + The input data filepaths. + schema + The :class:`~easylink.pipeline_schema.PipelineSchema` that successfully + validated the requested pipeline. + + Notes + ----- + The requested pipeline is checked against a set of supported + :class:`pipeline schemas `. The first + schema that successfully validates is assumed to be the correct one and is attached + to the ``Config`` object and its :meth:`~easylink.pipeline_schema.PipelineSchema.configure_pipeline` + method is called. """ def __init__( self, config_params: dict[str, Any], potential_schemas: list[PipelineSchema] | PipelineSchema = PIPELINE_SCHEMAS, - ): + ) -> None: super().__init__(layers=["initial_data", "default", "user_configured"]) self.update(DEFAULT_ENVIRONMENT, layer="default") self.update(config_params, layer="user_configured") @@ -67,13 +117,13 @@ def __init__( self.freeze() @property - def computing_environment(self) -> dict[str, Any]: - """The computing environment to run on (generally either 'local' or 'slurm').""" + def computing_environment(self) -> str: + """The computing environment to run on ('local' or 'slurm').""" return self.environment.computing_environment @property def slurm(self) -> dict[str, Any]: - """A dictionary of slurm configuration settings.""" + """A dictionary of slurm-specific configuration settings.""" if not self.environment.computing_environment == "slurm": return {} else: @@ -81,12 +131,12 @@ def slurm(self) -> dict[str, Any]: @property def spark(self) -> dict[str, Any]: - """A dictionary of spark configuration settings.""" + """A dictionary of spark-specific configuration settings.""" return self.environment.spark.to_dict() @property def slurm_resources(self) -> dict[str, str]: - """A flat dictionary of the slurm resources.""" + """A flat dictionary of slurm resource requests.""" if not self.computing_environment == "slurm": return {} raw_slurm_resources = { @@ -103,7 +153,7 @@ def slurm_resources(self) -> dict[str, str]: @property def spark_resources(self) -> dict[str, Any]: - """A flat dictionary of the spark resources.""" + """A flat dictionary of spark resource requests.""" spark_workers_raw = self.spark["workers"] spark_workers = { "num_workers": spark_workers_raw.get("num_workers"), @@ -125,14 +175,26 @@ def spark_resources(self) -> dict[str, Any]: ################# def _get_schema(self, potential_schemas: list[PipelineSchema]) -> PipelineSchema: - """Validates the requested pipeline against supported schemas. + """Returns the first pipeline schema that successfully validates the requested pipeline. + + Parameters + ---------- + potential_schemas + Pipeline schemas to validate the pipeline configuration against. + + Returns + ------- + The first pipeline schema that successfully validates the requested pipeline. + If no validated pipeline schema is found, `exit()` is called with `errno.EINVAL` + and any validation errors are logged. 
Notes ----- This acts as the pipeline configuration file's validation method since - we can only find a matching schema if the file is valid. + we can only find a matching schema if that file is valid. - We use the first schema that validates the pipeline configuration. + This method returns the first schema that successfully validates and does + not attempt to validate additional ones. """ errors = defaultdict(dict) # Try each schema until one is validated @@ -147,6 +209,18 @@ def _get_schema(self, potential_schemas: list[PipelineSchema]) -> PipelineSchema exit_with_validation_error(dict(errors)) def _validate(self) -> None: + """Validates the ``Config``. + + Raises + ------ + SystemExit + If any errors are found, they are batch-logged into a dictionary and + the program exits with a non-zero code. + + Notes + ----- + Pipeline validations are handled in :meth:`~easylink.configuration.Config._get_schema`. + """ # TODO [MIC-4880]: refactor into validation object errors = { # NOTE: pipeline configuration validation happens in '_get_schema()' @@ -157,6 +231,12 @@ def _validate(self) -> None: exit_with_validation_error(errors) def _validate_input_data(self) -> dict[Any, Any]: + """Validates the input data configuration. + + Returns + ------- + A dictionary of input data configuration validation errors. + """ errors = defaultdict(dict) input_data_dict = self.input_data.to_dict() if not input_data_dict: @@ -168,6 +248,12 @@ def _validate_input_data(self) -> dict[Any, Any]: return errors def _validate_environment(self) -> dict[Any, Any]: + """Validates the environment configuration. + + Returns + ------- + A dictionary of environment configuration validation errors. + """ errors = defaultdict(dict) if not self.environment.container_engine in ["docker", "singularity", "undefined"]: errors[ENVIRONMENT_ERRORS_KEY]["container_engine"] = [ @@ -189,11 +275,26 @@ def load_params_from_specification( computing_environment: str | None, results_dir: str, ) -> dict[str, Any]: - """Gather together all specification data. + """Gathers together all specification data. This gathers the pipeline, input data, and computing environment specifications as well as the results directory into a single dictionary for insertion into - the Config object. + the ``Config`` object. + + Parameters + ---------- + pipeline_specification + The path to the pipeline specification yaml file. + input_data + The path to the input data yaml file. + computing_environment + The path to the computing environment yaml file. + results_dir + The path to the results directory. + + Returns + ------- + A dictionary of all provided specification data. 
""" return { "pipeline": load_yaml(pipeline_specification), @@ -206,7 +307,7 @@ def load_params_from_specification( def _load_input_data_paths( input_data_specification_path: str | Path, ) -> dict[str, list[Path]]: - """Create dictionary of input data paths from the input data yaml file.""" + """Creates a dictionary of input data paths from the input data yaml file.""" input_data_paths = load_yaml(input_data_specification_path) if not isinstance(input_data_paths, dict): raise TypeError( @@ -222,7 +323,7 @@ def _load_input_data_paths( def _load_computing_environment( computing_environment_specification_path: str | None, ) -> dict[Any, Any]: - """Load the computing environment yaml file and return the contents as a dict.""" + """Loads the computing environment yaml file and returns the contents as a dict.""" if not computing_environment_specification_path: return {} # handles empty environment.yaml elif not Path(computing_environment_specification_path).is_file(): diff --git a/src/easylink/graph_components.py b/src/easylink/graph_components.py index f0d4c959..8fe3c9aa 100644 --- a/src/easylink/graph_components.py +++ b/src/easylink/graph_components.py @@ -1,3 +1,13 @@ +""" +================ +Graph Components +================ + +This module is responsible for defining the modular building-block objects that +can be composed to create graph representations of pipelines. + +""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -14,35 +24,92 @@ @dataclass(frozen=True) class InputSlot: - """InputSlot represents a single input slot for a step.""" + """An abstraction representing a single input slot to a specific node. + + ``InputSlots`` represent distinct semantic categories of input files, between + which a node must be able to differentiate. In order to pass data between nodes, + an ``InputSlot`` of one node can be connected to an :class:`OutputSlot` of another + node via an :class:`EdgeParams` instance. + + Notes + ----- + Nodes can be either :class:`Steps` or :class:`Implementations`. + """ name: str + """The name of the input slot.""" env_var: str | None - validator: Callable + """The environment variable that this input slot will use to pass a list of data filepaths + to an Implementation.""" + validator: Callable[[str], None] + """A callable that validates the input data being passed into the pipeline + via this input slot. If the data is invalid, the callable should raise an exception + with a descriptive error message which will then be reported to the user.""" @dataclass(frozen=True) class OutputSlot: - """OutputSlot represents a single output slot for a step.""" + """An abstraction representing a single output slot from a specific node. + + In order to pass data between nodes, an OutputSlot of one node can be connected + to an :class:`InputSlot` of another node via an :class:`EdgeParams` instance. + + Notes + ----- + Nodes can be either :class:`Steps` or :class:`Implementations`. + + Input data is validated via the :class:`InputSlot's` required + :attr:`~InputSlot.validator` attribute. In order to prevent multiple + validations of the same files (since outputs of one node can be inputs to another), + no such validator is stored here on the OutputSlot. + """ name: str + """The name of the output slot.""" @dataclass(frozen=True) class EdgeParams: - """A dataclass representation of an edge between two nodes in a networkx graph. + """A representation of an edge between two nodes in a graph. 
+ + EdgeParams connect the :class:`OutputSlot` of a source node to the :class:`InputSlot` + of a target node. - Edges connect the output slot of the source node to the input slot of the target node. + Notes + ----- + Nodes can be either :class:`Steps` or :class:`Implementations`. """ source_node: str + """The name of the source node.""" target_node: str + """The name of the target node.""" output_slot: str + """The name of the :class:`OutputSlot` of the source node.""" input_slot: str + """The name of the :class:`InputSlot` of the target node.""" filepaths: tuple[str] | None = None + """The filepaths that are passed from the source node to the target node.""" @classmethod - def from_graph_edge(cls, source, sink, edge_attrs) -> EdgeParams: + def from_graph_edge( + cls: type["EdgeParams"], + source: str, + sink: str, + edge_attrs: dict[str, OutputSlot | InputSlot | str | None], + ) -> EdgeParams: + """A convenience method to create an EdgeParams instance. + + Parameters + ---------- + source + The name of the source node. + sink + The name of the target node. + edge_attrs + The attributes of the edge connecting the source and target nodes. + 'output_slot' and 'input_slot' are required keys and 'filepaths' is optional. + """ return cls( source, sink, @@ -53,27 +120,54 @@ class StepGraph(nx.MultiDiGraph): - """A graph of Steps, with edges representing file dependencies between them. + """A directed acyclic graph (DAG) of :class:`Steps` and the data dependencies between them. - StepGraphs are contained as an attribute of Steps, with the highest level being - the PipelineSchema. + StepGraphs are DAGs with :class:`Steps` + for nodes and the file dependencies between them for edges. Multiple edges + between nodes are permitted. + + Notes + ----- + These are high-level abstractions; they represent a conceptual pipeline + graph with no detail as to how each :class:`~easylink.step.Step` is implemented. + + The highest level StepGraph is that of the entire :class:`~easylink.pipeline_schema.PipelineSchema`. + + See Also + -------- + :class:`ImplementationGraph` + :class:`~easylink.pipeline_schema.PipelineSchema` """ @property def step_nodes(self) -> list[str]: - """Return list of nodes tied to specific steps.""" + """The topologically sorted list of node/:class:`~easylink.step.Step` names.""" ordered_nodes = list(nx.topological_sort(self)) return [node for node in ordered_nodes if node != "input_data" and node != "results"] @property def steps(self) -> list[Step]: - """Convenience property to get all steps in the graph.""" + """The list of all :class:`Steps` in the graph.""" return [self.nodes[node]["step"] for node in self.step_nodes] def add_node_from_step(self, step: Step) -> None: + """Adds a new node to the StepGraph. + + Parameters + ---------- + step + The :class:`~easylink.step.Step` to add to the graph as a new node. + """ self.add_node(step.name, step=step) def add_edge_from_params(self, edge_params: EdgeParams) -> None: + """Adds a new edge to the StepGraph. + + Parameters + ---------- + edge_params + The :class:`EdgeParams` to add to the graph as a new edge. + """ return self.add_edge( edge_params.source_node, edge_params.target_node, @@ -87,27 +181,63 @@ class ImplementationGraph(nx.MultiDiGraph): - """A graph of Implementations, with edges representing file dependencies between them. 
- - ImplementationGraphs are subgraphs of a PipelineGraph generated by a particular Step, - including the PipelineGraph itself. + """A graph of :class:`Implementations`. + + ImplementationGraphs are directed graphs with :class:`Implementations` + for nodes and the file dependencies between them for edges. Self-edges as well + as multiple edges between nodes are permitted. + + Notes + ----- + An ImplementationGraph is a low-level abstraction; it represents the *actual + implementations* of each :class:`~easylink.step.Step` in the pipeline. This + is in contrast to a :class:`StepGraph`, which can be an intricate nested structure + due to the various complex and self-similar :class:`~easylink.step.Step` instances + (which represent abstract operations such as "loop this step N times"). An + ImplementationGraph is the flattened and concrete graph of + :class:`Implementations` to run. + + The highest level ImplementationGraph is that of the entire + :class:`~easylink.pipeline_graph.PipelineGraph`. + + See Also + -------- + :class:`StepGraph` + :class:`~easylink.pipeline_graph.PipelineGraph` """ @property def implementation_nodes(self) -> list[str]: - """Return list of nodes tied to specific implementations.""" + """The topologically sorted list of node/:class:`~easylink.implementation.Implementation` names.""" ordered_nodes = list(nx.topological_sort(self)) return [node for node in ordered_nodes if node != "input_data" and node != "results"] @property def implementations(self) -> list[Implementation]: - """Convenience property to get all implementations in the graph.""" + """The list of all :class:`Implementations` in the graph.""" return [self.nodes[node]["implementation"] for node in self.implementation_nodes] def add_node_from_implementation(self, node_name, implementation: Implementation) -> None: + """Adds a new node to the ImplementationGraph. + + Parameters + ---------- + node_name + The name of the new node. + implementation + The :class:`~easylink.implementation.Implementation` to add to the graph + as a new node. + """ self.add_node(node_name, implementation=implementation) def add_edge_from_params(self, edge_params: EdgeParams) -> None: + """Adds a new edge to the ImplementationGraph. + + Parameters + ---------- + edge_params + The :class:`EdgeParams` to add to the graph as a new edge. + """ return self.add_edge( edge_params.source_node, edge_params.target_node, @@ -123,19 +253,48 @@ @dataclass(frozen=True) class SlotMapping(ABC): - """SlotMapping represents a mapping between a parent and child node at different levels of the nested pipeline schema.""" + """A mapping between a slot on a parent Step and a slot on (one of) its child Steps. + + SlotMapping is an interface intended to be used by concrete :class:`InputSlotMapping` + and :class:`OutputSlotMapping` classes. It represents a mapping between + parent and child nodes/:class:`Steps` at different levels + of a potentially-nested :class:`~easylink.pipeline_schema.PipelineSchema`. 
+ """ parent_slot: str + """The name of the parent slot.""" child_node: str + """The name of the child node.""" child_slot: str + """The name of the child slot.""" @abstractmethod def remap_edge(self, edge: EdgeParams) -> EdgeParams: + """Remaps an edge to connect the parent and child nodes.""" pass class InputSlotMapping(SlotMapping): + """A mapping between :class:`InputSlots` of a parent node and a child node.""" + def remap_edge(self, edge: EdgeParams) -> EdgeParams: + """Remaps an edge's :class:`InputSlot`. + + Parameters + ---------- + edge + The edge to remap. + + Returns + ------- + EdgeParams + The remapped edge. + + Raises + ------ + ValueError + If the parent slot does not match the input slot of the edge. + """ if edge.input_slot != self.parent_slot: raise ValueError("Parent slot does not match input slot") return EdgeParams( @@ -147,7 +306,26 @@ def remap_edge(self, edge: EdgeParams) -> EdgeParams: class OutputSlotMapping(SlotMapping): + """A mapping between :class:`InputSlots` of a parent node and a child node.""" + def remap_edge(self, edge: EdgeParams) -> EdgeParams: + """Remaps an edge's :class:`OutputSlot`. + + Parameters + ---------- + edge + The edge to remap. + + Returns + ------- + EdgeParams + The remapped edge. + + Raises + ------ + ValueError + If the parent slot does not match the output slot of the edge. + """ if edge.output_slot != self.parent_slot: raise ValueError("Parent slot does not match output slot") return EdgeParams( diff --git a/src/easylink/implementation.py b/src/easylink/implementation.py index 99a2408e..291efb45 100644 --- a/src/easylink/implementation.py +++ b/src/easylink/implementation.py @@ -1,3 +1,14 @@ +""" +=============== +Implementations +=============== + +This module is responsible for defining the abstractions that represent actual +implementations of steps in a pipeline. Typically, these abstractions contain +information about what container to run for the step and other related details. + +""" + from collections.abc import Iterable from pathlib import Path @@ -9,11 +20,24 @@ class Implementation: - """A representation of an actual container that will be executed for a particular step. - - Implementations exist at a lower level than Steps. This class - contains information about what container to use, what environment variables to set - inside the container, and some metadata about the container. + """A representation of an actual container that will be executed for a :class:`~easylink.step.Step`. + + Implementations exist at a lower level than :class:`Steps`. + This class contains information about what container to use, what environment + variables to set inside the container, and some metadata about the container. + + Parameters + ---------- + schema_steps + The requested :class:`~easylink.pipeline_schema.PipelineSchema` + :class:`~easylink.step.Step` names for which this Implementation is + expected to be responsible. + implementation_config + The configuration for this Implementation. + input_slots + The :class:`InputSlots` for this Implementation. + output_slots + The :class:`OutputSlots` for this Implementation. 
""" def __init__( @@ -24,20 +48,25 @@ def __init__( output_slots: Iterable["OutputSlot"] = (), ): self.name = implementation_config.name - """The name of the implementation.""" + """The name of this Implementation.""" self.input_slots = {slot.name: slot for slot in input_slots} - """A mapping of input slot names to InputSlot instances.""" + """A mapping of :class:`InputSlots` + names to their instances.""" self.output_slots = {slot.name: slot for slot in output_slots} - """A mapping of output slot names to OutputSlot instances.""" + """A mapping of :class:`OutputSlots` + names to their instances.""" self._metadata = self._load_metadata() self.environment_variables = self._get_env_vars(implementation_config) """A mapping of environment variables to set.""" self.metadata_steps = self._metadata["steps"] - """The specific step details that this implementation is associated with.""" + """The names of the specific :class:`Steps` for which + this Implementation is responsible.""" self.schema_steps = schema_steps - """The high-level pipeline schema steps that this implementation is associated with.""" + """The requested :class:`~easylink.pipeline_schema.PipelineSchema` + :class:`~easylink.step.Step` names for which this Implementation is + requested to be responsible in the pipeline.""" self.requires_spark = self._metadata.get("requires_spark", False) - """Whether this implementation requires a Spark environment.""" + """Whether this Implementation requires a Spark environment.""" def __repr__(self) -> str: return f"Implementation.{self.name}" @@ -45,10 +74,18 @@ def __repr__(self) -> str: def validate(self) -> list[str]: """Validates individual Implementation instances. - This is intended to be run from the Pipeline validate method. + Returns + ------- + A list of logs containing any validation errors. Each item in the list + is a distinct message about a particular validation error (e.g. if a + required container does not exist). + + Notes + ----- + This is intended to be run from :meth:`easylink.pipeline.Pipeline._validate`. """ logs = [] - logs = self._validate_expected_step(logs) + logs = self._validate_expected_steps(logs) logs = self._validate_container_exists(logs) return logs @@ -57,46 +94,65 @@ def validate(self) -> list[str]: ################## def _load_metadata(self) -> dict[str, str]: + """Loads the metadata for this Implementation instance.""" metadata = load_yaml(paths.IMPLEMENTATION_METADATA) return metadata[self.name] - def _validate_expected_step(self, logs: list[str]) -> list[str]: + def _validate_expected_steps(self, logs: list[str]) -> list[str]: + """Validates that the Implementation is responsible for the correct steps.""" if not set(self.schema_steps) == set(self.metadata_steps): logs.append( - f"Pipeline configuration nodes {self.schema_steps} do not match metadata steps {self.metadata_steps}." + f"Pipeline configuration nodes {self.schema_steps} do not match " + f"metadata steps {self.metadata_steps}." ) return logs def _validate_container_exists(self, logs: list[str]) -> list[str]: + """Validates that the container for this Implementation exists.""" err_str = f"Container '{self.singularity_image_path}' does not exist." 
if not Path(self.singularity_image_path).exists(): logs.append(err_str) return logs def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]: + """Gets the environment variables relevant to this Implementation.""" env_vars = self._metadata.get("env", {}) env_vars.update(implementation_config.get("configuration", {})) return env_vars @property def singularity_image_path(self) -> str: + """The path to the Singularity image for this Implementation.""" return self._metadata["image_path"] @property def script_cmd(self) -> str: + """The command to run inside of the container for this Implementation.""" return self._metadata["script_cmd"] @property def outputs(self) -> dict[str, list[str]]: + """The outputs expected from this Implementation.""" return self._metadata["outputs"] class NullImplementation: - """A NullImplementation is used to represent a step that does not have an implementation. - - For example, the IO steps in the pipeline schema do not correspond to implementations - but ImplementationGraph requires an "implementation" attribute with input and output slots - for each node. + """A partial :class:`Implementation` interface representing that no container needs to run. + + The primary use case for this class is when adding an :class:`~easylink.step.IOStep` - + which does not have a corresponding :class:`Implementation` - to an + :class:`~easylink.graph_components.ImplementationGraph` since adding any new + node requires an object with :class:`~easylink.graph_components.InputSlot` + and :class:`~easylink.graph_components.OutputSlot` names. + + Parameters + ---------- + name + The name of this NullImplementation. + input_slots + The :class:`InputSlots` for this NullImplementation. + output_slots + The :class:`OutputSlots` for this NullImplementation. """ def __init__( @@ -106,23 +162,49 @@ def __init__( output_slots: Iterable["OutputSlot"] = (), ): self.name = name + """The name of this NullImplementation.""" self.input_slots = {slot.name: slot for slot in input_slots} + """A mapping of :class:`InputSlots` + names to their instances.""" self.output_slots = {slot.name: slot for slot in output_slots} + """A mapping of :class:`OutputSlots` + names to their instances.""" self.schema_steps = [self.name] + """The requested :class:`~easylink.pipeline_schema.PipelineSchema` + :class:`~easylink.step.Step` names this ``NullImplementation`` implements.""" self.combined_name = None + """The name of the combined implementation of which ``NullImplementation`` + is a constituent. This is definitionally None for a ``NullImplementation``.""" class PartialImplementation: - """A representation of one part of a combined implementation that spans multiple steps. - - A PartialImplementation is what is initially added to the implementation graph when - a combined implementation is used (i.e. an implementation that spans multiple steps). - We initially add a node for _each_ step, which has as its "implementation" attribute a - PartialImplementation. Such a graph is not yet fit to run. When we make our second - pass through, after the flat (non-hierarchical) PipelineGraph has been created, we find the set of - PartialImplementation nodes corresponding to each combined implementation and - replace them with a single node with a true Implementation representing the combined - implementation. + """A representation of one part of a combined implementation that spans multiple :class:`Steps`. 
+ + A PartialImplementation is what is initially added to the :class:`~easylink.graph_components.ImplementationGraph` + when a so-called "combined implementation" is used (i.e. an :class:`Implementation` + that spans multiple :class:`Steps`). + We initially add a node for _each_ :class:`~easylink.step.Step`, which has as + its ``implementation`` attribute a PartialImplementation. Such a graph is not + yet fit to run. When we make our second pass through, after the flat (non-hierarchical) + :class:`~easylink.pipeline_graph.PipelineGraph` has been created, we find the + set of PartialImplementation nodes corresponding to each combined implementation + and replace them with a single node with a true :class:`Implementation` representing + the combined implementation. + + Parameters + ---------- + combined_name + The name of the combined implementation of which this ``PartialImplementation`` + is a part. + schema_step + The requested :class:`~easylink.pipeline_schema.PipelineSchema` + :class:`~easylink.step.Step` name that this ``PartialImplementation`` + partially implements. + input_slots + The :class:`InputSlots` for this PartialImplementation. + output_slots + The :class:`OutputSlots` for this PartialImplementation. + """ def __init__( @@ -133,6 +215,15 @@ def __init__( output_slots: Iterable["OutputSlot"] = (), ): self.combined_name = combined_name + """The name of the combined implementation of which this ``PartialImplementation`` + is a part.""" self.schema_step = schema_step + """The requested :class:`~easylink.pipeline_schema.PipelineSchema` + :class:`~easylink.step.Step` name that this ``PartialImplementation`` + partially implements.""" self.input_slots = {slot.name: slot for slot in input_slots} + """A mapping of :class:`InputSlots` + names to their instances.""" self.output_slots = {slot.name: slot for slot in output_slots} + """A mapping of :class:`OutputSlots` + names to their instances.""" diff --git a/src/easylink/pipeline_graph.py b/src/easylink/pipeline_graph.py index d66e694c..9bbfbd3d 100644 --- a/src/easylink/pipeline_graph.py +++ b/src/easylink/pipeline_graph.py @@ -22,9 +22,9 @@ class PipelineGraph(ImplementationGraph): """The structure of the pipeline. - The PipelineGraph is a DAG composed of Implementations and their file dependencies. - It is created by "flattening" the PipelineSchema (a nested StepGraph) with parameters - set in the configuration. + The PipelineGraph is a directed acyclic graph (DAG) composed of Implementations + and their file dependencies. It is created by "flattening" the PipelineSchema + (a nested StepGraph) with parameters set in the configuration. """ def __init__(self, config: Config) -> None: @@ -106,7 +106,7 @@ def _get_combined_slots_and_edges( Returns ------- - The set of InputSlots, OutputSlots, and EdgeParams needed to construct the combined implementation + The set of InputSlots, OutputSlots, and EdgeParams needed to construct the combined implementation """ slot_types = ["input_slot", "output_slot"] combined_slots_by_type = combined_input_slots, combined_output_slots = set(), set() @@ -154,7 +154,7 @@ def _get_edges_by_slot( Returns ------- - A tuple of dictionaries keyed by slot, with values for edges corresponding to that slot. + A tuple of dictionaries keyed by slot, with values for edges corresponding to that slot. """ in_edges_by_slot = defaultdict(list) @@ -196,7 +196,7 @@ def _get_duplicate_slots( Returns ------- - A set of (step_name, slot) tuples that have duplicate names or environment variables. 
+ A set of (step_name, slot) tuples that have duplicate names or environment variables. """ name_freq = Counter([slot.name for step_name, slot in slot_tuples]) duplicate_names = [name for name, count in name_freq.items() if count > 1] diff --git a/src/easylink/pipeline_schema_constants/development.py b/src/easylink/pipeline_schema_constants/development.py index 7a64dc83..ae07c5a1 100644 --- a/src/easylink/pipeline_schema_constants/development.py +++ b/src/easylink/pipeline_schema_constants/development.py @@ -1,3 +1,9 @@ +""" +============================ +Development Pipeline Schemas +============================ +""" + from easylink.graph_components import ( EdgeParams, InputSlot, diff --git a/src/easylink/pipeline_schema_constants/tests.py b/src/easylink/pipeline_schema_constants/tests.py index 44157e2a..244cf359 100644 --- a/src/easylink/pipeline_schema_constants/tests.py +++ b/src/easylink/pipeline_schema_constants/tests.py @@ -1,3 +1,9 @@ +""" +===================== +Test Pipeline Schemas +===================== +""" + from easylink.graph_components import ( EdgeParams, InputSlot, diff --git a/src/easylink/rule.py b/src/easylink/rule.py index 36fde902..5b2d5bf7 100644 --- a/src/easylink/rule.py +++ b/src/easylink/rule.py @@ -29,7 +29,7 @@ def _build_rule(self) -> str: class TargetRule(Rule): """A rule that defines the final output of the pipeline. - Snakemake will determine the DAG based on this target. + Snakemake will determine the directed acyclic graph (DAG) based on this target. """ target_files: list[str] diff --git a/src/easylink/utilities/data_utils.py b/src/easylink/utilities/data_utils.py index 23de922b..6df3f30b 100644 --- a/src/easylink/utilities/data_utils.py +++ b/src/easylink/utilities/data_utils.py @@ -42,9 +42,9 @@ def copy_configuration_files_to_results_directory( shutil.copy(computing_environment, results_dir) -def get_results_directory(output_dir: str | None, timestamp: bool) -> Path: +def get_results_directory(output_dir: str | None, no_timestamp: bool) -> Path: results_dir = Path("results" if output_dir is None else output_dir).resolve() - if timestamp: + if not no_timestamp: launch_time = _get_timestamp() results_dir = results_dir / launch_time return results_dir diff --git a/src/easylink/utilities/general_utils.py b/src/easylink/utilities/general_utils.py index e4486b38..f5c033f2 100644 --- a/src/easylink/utilities/general_utils.py +++ b/src/easylink/utilities/general_utils.py @@ -87,13 +87,23 @@ def _add_logging_sink( def exit_with_validation_error(error_msg: dict) -> None: - """Exits the program with a validation error. + """Logs error messages and exits the program. + + This function logs the provided validation error messages using a structured + YAML format and terminates the program execution with a non-zero exit code + (indicating an error). Parameters ---------- error_msg The error message to print to the user. + Raises + ------ + SystemExit + Exits the program with an EINVAL (invalid argument) code due to + previously-determined validation errors. + """ logger.error( @@ -103,7 +113,7 @@ def exit_with_validation_error(error_msg: dict) -> None: "\nValidation errors found. Please see above." 
"\n==========================================\n" ) - exit(errno.EINVAL) + sys.exit(errno.EINVAL) def is_on_slurm() -> bool: diff --git a/src/easylink/utilities/paths.py b/src/easylink/utilities/paths.py index 0b81a871..4027fe89 100644 --- a/src/easylink/utilities/paths.py +++ b/src/easylink/utilities/paths.py @@ -1,3 +1,9 @@ +""" +========= +Filepaths +========= +""" + from pathlib import Path # TODO: We'll need to update this to be more generic for external users and have a way of configuring this diff --git a/tests/unit/test_data_utils.py b/tests/unit/test_data_utils.py index eae9d354..13bed35c 100644 --- a/tests/unit/test_data_utils.py +++ b/tests/unit/test_data_utils.py @@ -22,17 +22,17 @@ def test_create_results_directory(test_dir): @pytest.mark.parametrize( - "output_dir_provided, timestamp", + "output_dir_provided, no_timestamp", [ - (False, False), (False, True), - (True, False), + (False, False), (True, True), + (True, False), ], ) -def test_get_results_directory(test_dir, output_dir_provided, timestamp, mocker): +def test_get_results_directory(test_dir, output_dir_provided, no_timestamp, mocker): """Tests expected behavior. If directory is provided then a "results/" folder - is created at the working directory. If timestamp is True, then a timestamped + is created at the working directory. If no_timestamp is False, then a timestamped directory is created within the results directory. """ if output_dir_provided: @@ -44,12 +44,12 @@ def test_get_results_directory(test_dir, output_dir_provided, timestamp, mocker) mocker.patch( "easylink.utilities.data_utils._get_timestamp", return_value="2024_01_01_00_00_00" ) - results_dir = get_results_directory(output_dir, timestamp) + results_dir = get_results_directory(output_dir, no_timestamp) expected_results_dir = Path(test_dir) if not output_dir_provided: expected_results_dir = expected_results_dir / "results" - if timestamp: + if not no_timestamp: expected_results_dir = expected_results_dir / "2024_01_01_00_00_00" assert expected_results_dir == results_dir diff --git a/update_readme.py b/update_readme.py new file mode 100644 index 00000000..5ee52e0d --- /dev/null +++ b/update_readme.py @@ -0,0 +1,32 @@ +""" This script updates the README.rst file with the latest information about +the project. It is intended to be run from the github "update README" workflow. +""" + +import json +import re + +from packaging.version import parse + +# Load supported python versions +with open("python_versions.json", "r") as f: + versions = json.load(f) +versions_str = ", ".join(versions) +versions = [parse(v) for v in versions] +max_version = max(versions).base_version + +# Open README and replace python versions +with open("README.rst", "r") as file: + readme = file.read() +# Update the list of supported python versions +# NOTE: this regex assumes the version format is always major.minor +readme = re.sub( + r"Supported Python versions:\s*(?:\d+\.\d+\s*,\s*)+\d+\.\d+", + r"Supported Python versions: " + versions_str, + readme, +) +# Update the python version used in the installation code snipped example +readme = re.sub(r"python=\d+\.\d+", "python=" + max_version, readme) + +# Write the updated README back to file +with open("README.rst", "w") as file: + file.write(readme)