diff --git a/.typos.toml b/.typos.toml index 694ed501..1c3a4859 100644 --- a/.typos.toml +++ b/.typos.toml @@ -1,5 +1,14 @@ [files] -extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analysis/steps/report.py", "customer-satisfaction/pipelines/deployment_pipeline.py", "customer-satisfaction/streamlit_app.py", "nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb", "customer-satisfaction/tests/data_test.py"] +extend-exclude = [ + "*.csv", + "sign-language-detection-yolov5/*", + "orbit-user-analysis/steps/report.py", + "customer-satisfaction/pipelines/deployment_pipeline.py", + "customer-satisfaction/streamlit_app.py", + "nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb", + "customer-satisfaction/tests/data_test.py", + "end-to-end-computer-vision/**/*.ipynb" +] [default.extend-identifiers] # HashiCorp = "HashiCorp" @@ -14,6 +23,9 @@ lenght = "lenght" preprocesser = "preprocesser" Preprocesser = "Preprocesser" Implicitly = "Implicitly" +fo = "fo" +mapp = "mapp" +polution = "polution" [default] locale = "en-us" diff --git a/end-to-end-computer-vision/.gitignore b/end-to-end-computer-vision/.gitignore index e76e4bbf..15a9d546 100644 --- a/end-to-end-computer-vision/.gitignore +++ b/end-to-end-computer-vision/.gitignore @@ -3,3 +3,4 @@ data/ runs/ **/tmp* +runs_dir diff --git a/end-to-end-computer-vision/README.md b/end-to-end-computer-vision/README.md index 658da4bd..f936a8e8 100644 --- a/end-to-end-computer-vision/README.md +++ b/end-to-end-computer-vision/README.md @@ -4,6 +4,8 @@ This is a project that demonstrates an end-to-end computer vision pipeline using ZenML. The pipeline is designed to be modular and flexible, allowing for easy experimentation and extension. +![diagram.png](_assets/diagram.png) + The project showcases the full lifecycle of a computer vision project, from data collection and preprocessing to model training and evaluation. The pipeline also incorporates a human-in-the-loop (HITL) component, where human annotators can @@ -12,36 +14,230 @@ label images to improve the model's performance, as well as feedback using The project uses the [Ship Detection dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection) from -[DataDrivenScience](https://datadrivenscience.com/) on the Hugging Face Hub, which contains images of ships in -satellite imagery. The goal is to train a model to detect ships in the images. -Note that this isn't something that our YOLOv8 model is particularly good at out -of the box, so it serves as a good example of how to build a pipeline that can -be extended to other use cases. +[DataDrivenScience](https://datadrivenscience.com/) on the Hugging Face Hub, +which contains images of ships in satellite imagery. The goal is to train a +model to detect ships in the images. Note that this isn't something that our +YOLOv8 model is particularly good at out of the box, so it serves as a good +example of how to build a pipeline that can be extended to other use cases. + +This project needs some infrastructure and tool setup to work. Here is a list of +things that you'll need to do. 
-## Run this pipeline +## ZenML -### Setup +We recommend using our [ZenML Cloud offering](https://cloud.zenml.io/) to get a +deployed instance of ZenML: -You'll need to run the following: +### Set up your environment ```bash -zenml integration install label_studio torch gcp mlflow -y pip install -r requirements.txt -pip uninstall wandb +zenml integration install label_studio torch gcp mlflow -y +pip uninstall wandb # This comes in automatically +``` + +And to use the Albumentations and annotation plugins in the last step, you'll +need to install them: + +```bash +fiftyone plugins download https://github.com/jacobmarks/fiftyone-albumentations-plugin + +fiftyone plugins download https://github.com/voxel51/fiftyone-plugins --plugin-names @voxel51/annotation ``` -You can also set the following environment variables: +You should also set up the following environment variables: ```bash export DATA_UPLOAD_MAX_NUMBER_FILES=1000000 export WANDB_DISABLED=True ``` -And to use the Albumentations and annotation plugins, you'll need to install -them: +### Connect to your deployed ZenML instance ```bash -fiftyone plugins download https://github.com/jacobmarks/fiftyone-albumentations-plugin +zenml connect --url <your-zenml-server-url> +``` -fiftyone plugins download https://github.com/voxel51/fiftyone-plugins --plugin-names @voxel51/annotation +## Cloud Provider + +We will use GCP in the commands listed below, but this will also work for other cloud +providers. + +### Follow our guide to set up your credentials for GCP + +[Set up a GCP service +connector](https://docs.zenml.io/stacks-and-components/auth-management/gcp-service-connector) + +### Set up a bucket to persist your training data + +### Set up a bucket to use as artifact store within ZenML + +[Learn how to set up a GCP artifact store stack component within ZenML +here](https://docs.zenml.io/stacks-and-components/component-guide/artifact-stores) + +### Set up Vertex for pipeline orchestration + +[Learn how to set up a Vertex orchestrator stack component within ZenML +here](https://docs.zenml.io/stacks-and-components/component-guide/orchestrators/vertex) + +### For training on accelerators like GPUs/TPUs, set up Vertex + +[Learn how to set up a Vertex step operator stack component within ZenML +here](https://docs.zenml.io/stacks-and-components/component-guide/step-operators/vertex) + +### Set up a Container Registry + +[Learn how to set up a Google Cloud container registry component within ZenML +here](https://docs.zenml.io/stacks-and-components/component-guide/container-registries/gcp) + +## Label Studio + +### [Start Label Studio locally](https://labelstud.io/guide/start) +### [Follow these ZenML instructions to set up Label Studio as a stack component](https://docs.zenml.io/stacks-and-components/component-guide/annotators/label-studio) +### Create a project within Label Studio and name it `ship_detection_gcp` +### [Set up Label Studio to use external storage](https://labelstud.io/guide/storage) +Use the first bucket that you created for data persistence. + +## ZenML Stacks + +### Local Stack + +The local stack should use the `default` orchestrator, a remote GCP artifact +store that we'll call `gcp_artifact_store` here, and a local Label Studio +annotator that we'll refer to as `label_studio_local`.
+ +```bash +# Make sure to replace the names with the names that you choose for your setup +zenml stack register <local_stack> -o default -a <gcp_artifact_store> -an <label_studio_local> +``` + +### Remote Stack + +The remote stack should use the `vertex_orchestrator`, a `gcp_artifact_store`, +a `gcp_container_registry`, and a `vertex_step_operator`. + + +```bash +# Make sure to replace the names with the names that you choose for your setup +zenml stack register <remote_stack> -o <vertex_orchestrator> -a <gcp_artifact_store> -c <gcp_container_registry> -s <vertex_step_operator> ``` + +The project consists of the following pipelines: + +## data_ingestion_pipeline + +This pipeline downloads the [Ship Detection +dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection). This +dataset contains some truly huge images with a few hundred million pixels. In +order to make these usable, we break down all source images into manageable +tiles with a maximum height/width of 1000 pixels. After this preprocessing is +done, the images are uploaded into a cloud bucket and the ground truth +annotations are uploaded to a local Label Studio instance. + +### Configure this pipeline +The configuration file for this pipeline lives at `./configs/ingest_data.yaml`. +Make sure in particular to change `data_source` to point at the GCP bucket that +is meant to serve as the storage location for the data. Also make sure to adjust +the `ls_project_id` to correspond to the ID of your project within Label Studio. + +### Run this pipeline + +Label Studio should be up and running for the whole duration of this pipeline +run. + +```bash +zenml stack set <local_stack> +python run.py --ingest +``` + +## data_export_pipeline + +This pipeline exports the annotations from Label Studio and loads them into the +ZenML artifact store to make them accessible to downstream pipelines. + +### Configure this pipeline +The configuration file for this pipeline lives at `./configs/data_export.yaml`. +Make sure in particular to change `dataset_name` to reflect the name of the +dataset within Label Studio. + +### Run this pipeline + +Label Studio should be up and running for the whole duration of this pipeline +run. + +```bash +zenml stack set <local_stack> +python run.py --export +``` + +## training_pipeline + +This pipeline trains a YOLOv8 object detection model. + +### Configure this pipeline +You can choose to run this pipeline locally or on the cloud. These two options +use two different configuration files. For local training: +`./configs/training_pipeline.yaml`. For training on the cloud: +`./configs/training_pipeline_remote_gpu.yaml`. Make sure `data_source` points to +your cloud storage bucket. + +### Run this pipeline + +This pipeline requires the associated model (see the model section of the +configuration YAML file) to have a version in the `staging` stage. In order to +promote the model produced by the latest run of the `data_export_pipeline`, run +the following code: + +```bash +zenml model version update <model_name> latest -s staging +``` + +For local training, run the following code: + +```bash +zenml stack set <local_stack> +python run.py --training --local +``` + +For remote training, run the following code: + +```bash +zenml stack set <remote_stack> +python run.py --training +``` + +## inference_pipeline + +This pipeline performs batch inference using the object detection model. + +### Configure this pipeline +You can configure this pipeline via the following YAML file: +`./configs/inference_pipeline.yaml`.
Make sure `data_source` points to your +cloud storage bucket that contains images that you want to perform batch +inference on. + +### Run this pipeline + +This pipeline requires the associated model (see the model section of the +configuration YAML file) to have a version in the `production` stage. In order +to promote the model produced by the latest run of the `training_pipeline`, run +the following code: + +```bash +zenml model version update <model_name> staging -s production +``` + +Then run the pipeline: + +```bash +zenml stack set <stack_name> +python run.py --inference +``` + + +## Analyze and Curate your data through FiftyOne + +Now to close the loop, we will import the predictions into FiftyOne. All you'll +need to do is run: + +```bash +python run.py --fiftyone +``` + +Within FiftyOne, you can now analyze all the predictions and export them back to +Label Studio for refined labeling and retraining. diff --git a/end-to-end-computer-vision/_assets/diagram.png b/end-to-end-computer-vision/_assets/diagram.png new file mode 100644 index 00000000..d02f5213 Binary files /dev/null and b/end-to-end-computer-vision/_assets/diagram.png differ diff --git a/end-to-end-computer-vision/bus.jpg b/end-to-end-computer-vision/bus.jpg new file mode 100644 index 00000000..b43e3111 Binary files /dev/null and b/end-to-end-computer-vision/bus.jpg differ diff --git a/end-to-end-computer-vision/configs/data_export_alexej.yaml b/end-to-end-computer-vision/configs/data_export.yaml similarity index 71% rename from end-to-end-computer-vision/configs/data_export_alexej.yaml rename to end-to-end-computer-vision/configs/data_export.yaml index 22f2464e..62d3157c 100644 --- a/end-to-end-computer-vision/configs/data_export_alexej.yaml +++ b/end-to-end-computer-vision/configs/data_export.yaml @@ -2,7 +2,7 @@ enable_cache: False # pipeline configuration parameters: - dataset_name: "ship_detection_gcp" # "ship_detection" + dataset_name: "ship_detection_gcp" # This is the name of the dataset in Label Studio # Configuration of the Model Control Plane model: diff --git a/end-to-end-computer-vision/configs/data_export_alex.yaml b/end-to-end-computer-vision/configs/data_export_alex.yaml deleted file mode 100644 index c73b4d80..00000000 --- a/end-to-end-computer-vision/configs/data_export_alex.yaml +++ /dev/null @@ -1,12 +0,0 @@ -enable_cache: False - -# pipeline configuration -parameters: - dataset_name: "FiftyOne_ships_wuxqa5" - -# configuration of the Model Control Plane -model: - name: ShipDetector - license: Apache 2.0 - description: Object Detection Model. - tags: ["object detection"] diff --git a/end-to-end-computer-vision/configs/fiftyone.yaml b/end-to-end-computer-vision/configs/fiftyone.yaml deleted file mode 100644 index e94f5e98..00000000 --- a/end-to-end-computer-vision/configs/fiftyone.yaml +++ /dev/null @@ -1,22 +0,0 @@ -enable_cache: False - -settings: - docker: - apt_packages: - - ffmpeg - - libsm6 - - libxext6 - required_integrations: - - gcp - - github - requirements: - - ultralytics - - fiftyone - -# configuration of the Model Control Plane -# model: -# name: ShipDetector -# license: Apache 2.0 -# description: Object Detection Model.
-# tags: ["object detection"] -# version: production diff --git a/end-to-end-computer-vision/configs/cloud_inference.yaml b/end-to-end-computer-vision/configs/inference_pipeline.yaml similarity index 81% rename from end-to-end-computer-vision/configs/cloud_inference.yaml rename to end-to-end-computer-vision/configs/inference_pipeline.yaml index 650dc901..f6dd22d8 100644 --- a/end-to-end-computer-vision/configs/cloud_inference.yaml +++ b/end-to-end-computer-vision/configs/inference_pipeline.yaml @@ -19,7 +19,7 @@ steps: enable_cache: False enable_step_logs: False parameters: - inference_data_source: "gs://zenml-internal-artifact-store/inference_cv_webinar" + inference_data_source: # Insert your bucket path here where the inference images live e.g. "gs://foo/bar" # configuration of the Model Control Plane model: diff --git a/end-to-end-computer-vision/configs/ingest_data.yaml b/end-to-end-computer-vision/configs/ingest_data.yaml index 4f294ab5..df8c48df 100644 --- a/end-to-end-computer-vision/configs/ingest_data.yaml +++ b/end-to-end-computer-vision/configs/ingest_data.yaml @@ -1,10 +1,11 @@ steps: - download_dataset_from_hf: + download_and_tile_dataset_from_hf: enable_cache: True + enable_step_logs: False parameters: dataset: "datadrivenscience/ship-detection" - data_source: "gs://zenml-internal-artifact-store/label_studio_cv_webinar" + data_source: # Insert your bucket path here where the training images will live e.g. "gs://foo/bar" upload_labels_to_label_studio: enable_cache: False parameters: diff --git a/end-to-end-computer-vision/configs/training.yaml b/end-to-end-computer-vision/configs/training_pipeline.yaml similarity index 76% rename from end-to-end-computer-vision/configs/training.yaml rename to end-to-end-computer-vision/configs/training_pipeline.yaml index 6b8d470e..f49ba554 100644 --- a/end-to-end-computer-vision/configs/training.yaml +++ b/end-to-end-computer-vision/configs/training_pipeline.yaml @@ -7,10 +7,10 @@ steps: train_model: enable_cache: False parameters: - data_source: "gs://zenml-internal-artifact-store/label_studio_cv_webinar" - batch_size: 16 - imgsz: 640 - epochs: 30 + data_source: # Insert your bucket path here where the training images lives e.g. "gs://foo/bar" + batch_size: 8 + imgsz: 720 + epochs: 1 settings: docker: diff --git a/end-to-end-computer-vision/configs/training_gpu.yaml b/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml similarity index 78% rename from end-to-end-computer-vision/configs/training_gpu.yaml rename to end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml index 5b5a3e4b..1ee643ab 100644 --- a/end-to-end-computer-vision/configs/training_gpu.yaml +++ b/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml @@ -21,14 +21,14 @@ steps: step_operator: gcp_a100 enable_step_logs: False parameters: - data_source: "gs://zenml-internal-artifact-store/label_studio_cv_webinar" + data_source: # Insert your bucket path here where the training images lives e.g. 
"gs://foo/bar" batch_size: 8 - imgsz: 480 - epochs: 5000 + imgsz: 720 + epochs: 50000 is_quad_gpu_env: True settings: step_operator.vertex: - accelerator_type: NVIDIA_TESLA_T4 # NVIDIA_TESLA_A100 # see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#AcceleratorType + accelerator_type: NVIDIA_TESLA_T4 # see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#AcceleratorType accelerator_count: 4 disk_size_gb: 25 docker: diff --git a/end-to-end-computer-vision/data/.gitignore b/end-to-end-computer-vision/data/.gitignore new file mode 100644 index 00000000..86d0cb27 --- /dev/null +++ b/end-to-end-computer-vision/data/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file diff --git a/end-to-end-computer-vision/data/README.md b/end-to-end-computer-vision/data/README.md new file mode 100644 index 00000000..b6be7050 --- /dev/null +++ b/end-to-end-computer-vision/data/README.md @@ -0,0 +1 @@ +This directory serves as a place to store and access temporary datafiles. \ No newline at end of file diff --git a/end-to-end-computer-vision/materializers/label_studio_export_materializer.py b/end-to-end-computer-vision/materializers/label_studio_export_materializer.py new file mode 100644 index 00000000..356edc2a --- /dev/null +++ b/end-to-end-computer-vision/materializers/label_studio_export_materializer.py @@ -0,0 +1,99 @@ +# Copyright (c) ZenML GmbH 2022. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Implementation of the PyTorch DataLoader materializer.""" + +import os +import tempfile +from typing import TYPE_CHECKING, Any, ClassVar, Type + +from zenml.io import fileio +from zenml.materializers.base_materializer import BaseMaterializer + +DEFAULT_FILENAME = "annotations.zip" + +if TYPE_CHECKING: + from label_studio_sdk import Project + + +class LabelStudioAnnotationExport: + + def __init__(self, dataset: "Project" = None, filepath: str = None): + """ + Initialize LabelStudioAnnotationExport object with optional parameters. + + Parameters: + dataset: Label-studio dataset object. + filepath: A string representing the file path. Defaults to None. 
+ """ + self.dataset = dataset + self.filepath = filepath + + def download_annotations(self): + """Downloads the annotations from label-studio to the local fs.""" + tmpfile_ = tempfile.NamedTemporaryFile(dir="data", delete=False) + tmpdirname = os.path.basename(tmpfile_.name) + self.filepath = os.path.join(tmpdirname, DEFAULT_FILENAME) + self.dataset.export_tasks( + export_type="YOLO", + export_location=self.filepath, + download_resources=False, + ) + + +class LabelStudioAnnotationMaterializer(BaseMaterializer): + """Base class for Label Studio annotation models.""" + + FILENAME: ClassVar[str] = DEFAULT_FILENAME + SKIP_REGISTRATION: ClassVar[bool] = True + ASSOCIATED_TYPES = (LabelStudioAnnotationExport,) + + def load(self, data_type: Type[Any]) -> LabelStudioAnnotationExport: + """Loads the serialized JSON file containing the annotations. + + Args: + data_type: A ultralytics YOLO type. + + Returns: + A ultralytics YOLO object. + """ + # Recreate the filepath of the file + filepath = os.path.join(self.uri, DEFAULT_FILENAME) + + # Create a temporary file + tmpfile_ = tempfile.NamedTemporaryFile( + dir="data", delete=False, suffix=".zip" + ) + + # Copy from artifact store to temporary file + fileio.copy(filepath, tmpfile_.name, overwrite=True) + + # Re-instantiate the LabelStudioAnnotationExport model + dataset = LabelStudioAnnotationExport(filepath=tmpfile_.name) + + return dataset + + def save(self, dataset: LabelStudioAnnotationExport) -> None: + """Creates a JSON serialization for a label studio YOLO dataset model. + + Args: + dataset: A label studio YOLO dataset model. + """ + # Downloads the annotations into the local fs + dataset.download_annotations() + + # create the destination path for the exported annotations + filepath = os.path.join(self.uri, DEFAULT_FILENAME) + + # copies the files from local fs into the artifact store + fileio.copy(dataset.filepath, filepath) diff --git a/end-to-end-computer-vision/materializers/ultralytics_materializer.py b/end-to-end-computer-vision/materializers/ultralytics_materializer.py new file mode 100644 index 00000000..fb4c47e4 --- /dev/null +++ b/end-to-end-computer-vision/materializers/ultralytics_materializer.py @@ -0,0 +1,69 @@ +# Copyright (c) ZenML GmbH 2022. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Implementation of the PyTorch DataLoader materializer.""" + +import os +import tempfile +from typing import Any, ClassVar, Type + +from ultralytics import YOLO +from zenml.integrations.pytorch.materializers.pytorch_module_materializer import ( + PyTorchModuleMaterializer, +) +from zenml.io import fileio + +DEFAULT_FILENAME = "obj.pt" + + +class UltralyticsMaterializer(PyTorchModuleMaterializer): + """Base class for ultralytics YOLO models.""" + + FILENAME: ClassVar[str] = DEFAULT_FILENAME + SKIP_REGISTRATION: ClassVar[bool] = True + ASSOCIATED_TYPES = (YOLO,) + + def load(self, data_type: Type[Any]) -> YOLO: + """Reads a ultralytics YOLO model from a serialized JSON file. + + Args: + data_type: A ultralytics YOLO type. 
+ + Returns: + A ultralytics YOLO object. + """ + filepath = os.path.join(self.uri, DEFAULT_FILENAME) + + # Create a temporary folder + with tempfile.TemporaryDirectory(prefix="zenml-temp-") as temp_dir: + temp_file = os.path.join(str(temp_dir), DEFAULT_FILENAME) + + # Copy from artifact store to temporary file + fileio.copy(filepath, temp_file) + model = YOLO(temp_file) + + return model + + def save(self, model: YOLO) -> None: + """Creates a JSON serialization for a ultralytics YOLO model. + + Args: + model: A ultralytics YOLO model. + """ + filepath = os.path.join(self.uri, DEFAULT_FILENAME) + + # Make a temporary phantom artifact + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f: + model.save(f.name) + # Copy it into artifact store + fileio.copy(f.name, filepath) diff --git a/end-to-end-computer-vision/pipelines/__init__.py b/end-to-end-computer-vision/pipelines/__init__.py index 4dc295a9..6ee60a58 100644 --- a/end-to-end-computer-vision/pipelines/__init__.py +++ b/end-to-end-computer-vision/pipelines/__init__.py @@ -15,6 +15,7 @@ # limitations under the License. # -from .data_export import export_for_training - -# from .training import training +from .data_export import data_export_pipeline +from .data_ingestion import data_ingestion_pipeline +from .inference import inference_pipeline +from .training import training_pipeline diff --git a/end-to-end-computer-vision/pipelines/data_export.py b/end-to-end-computer-vision/pipelines/data_export.py index 344eadb9..3adf50c1 100644 --- a/end-to-end-computer-vision/pipelines/data_export.py +++ b/end-to-end-computer-vision/pipelines/data_export.py @@ -23,7 +23,7 @@ @pipeline -def export_for_training(dataset_name: str = "polution"): +def data_export_pipeline(dataset_name: str = "polution"): """Loads data from Label studio. Args: diff --git a/end-to-end-computer-vision/pipelines/data_ingestion.py b/end-to-end-computer-vision/pipelines/data_ingestion.py index e270179d..b1df9c6b 100644 --- a/end-to-end-computer-vision/pipelines/data_ingestion.py +++ b/end-to-end-computer-vision/pipelines/data_ingestion.py @@ -17,13 +17,13 @@ from zenml import pipeline from zenml.logger import get_logger -from steps.download_from_hf import download_dataset_from_hf +from steps.process_hf_dataset import process_hf_dataset from steps.upload_to_label_studio import upload_labels_to_label_studio logger = get_logger(__name__) @pipeline -def data_ingestion(): - labels_dict = download_dataset_from_hf() +def data_ingestion_pipeline(): + labels_dict = process_hf_dataset() upload_labels_to_label_studio(labels_dict) diff --git a/end-to-end-computer-vision/pipelines/inference.py b/end-to-end-computer-vision/pipelines/inference.py index 77db9ef1..8191f399 100644 --- a/end-to-end-computer-vision/pipelines/inference.py +++ b/end-to-end-computer-vision/pipelines/inference.py @@ -23,10 +23,6 @@ @pipeline -def inference(): +def inference_pipeline(): """Uses FiftyOne for inference on a dataset.""" create_fiftyone_dataset() - - -if __name__ == "__main__": - inference.with_options(config_path="configs/cloud_inference.yaml")() diff --git a/end-to-end-computer-vision/pipelines/training.py b/end-to-end-computer-vision/pipelines/training.py index 3e4edf41..eb31416d 100644 --- a/end-to-end-computer-vision/pipelines/training.py +++ b/end-to-end-computer-vision/pipelines/training.py @@ -25,7 +25,7 @@ @pipeline -def training(model_checkpoint: str = "yolov8l.pt"): +def training_pipeline(model_checkpoint: str = "yolov8l.pt"): """Trains a model on a dataset. 
Args: @@ -41,7 +41,3 @@ def training(model_checkpoint: str = "yolov8l.pt"): model=model, dataset=dataset, ) - - # promote_model(metrics) - - # predict_image(trained_model) diff --git a/end-to-end-computer-vision/requirements.txt b/end-to-end-computer-vision/requirements.txt index 6de734b1..2bcee708 100644 --- a/end-to-end-computer-vision/requirements.txt +++ b/end-to-end-computer-vision/requirements.txt @@ -1,14 +1,12 @@ -zenml[server]>=0.55.2 +zenml[server]>=0.55.3 notebook scikit-learn<1.3 pyarrow -wandb seaborn xgboost ultralytics torch -huggingface_hub>=0.20.0 # needed for fiftyone imports of HF datasets -# fiftyone @ git+https://github.com/voxel51/fiftyone.git@develop # fiftyone v0.24.0 +huggingface_hub>=0.20.0 fiftyone datasets albumentations diff --git a/end-to-end-computer-vision/run.py b/end-to-end-computer-vision/run.py index c5af3e28..b4035f30 100644 --- a/end-to-end-computer-vision/run.py +++ b/end-to-end-computer-vision/run.py @@ -17,15 +17,14 @@ from uuid import UUID import click -from zenml import Model from zenml.client import Client from zenml.enums import ModelStages from zenml.logger import get_logger -from pipelines.data_export import export_for_training -from pipelines.data_ingestion import data_ingestion -from pipelines.inference import inference -from pipelines.training import training +from pipelines.data_export import data_export_pipeline +from pipelines.data_ingestion import data_ingestion_pipeline +from pipelines.inference import inference_pipeline +from pipelines.training import training_pipeline from utils.constants import PREDICTIONS_DATASET_ARTIFACT_NAME, ZENML_MODEL_NAME logger = get_logger(__name__) @@ -37,7 +36,7 @@ @click.option( "--ingest", "-ig", - "ingest_data_pipeline", + "ingest_data", is_flag=True, default=False, help="Whether to run the data ingestion pipeline, that takes the dataset" @@ -55,7 +54,7 @@ @click.option( "--training", "-t", - "training_pipeline", + "train", is_flag=True, default=False, help="Whether to run the pipeline that trains the model.", @@ -63,7 +62,7 @@ @click.option( "--inference", "-i", - "inference_pipeline", + "batch_inference", is_flag=True, default=False, help="Whether to run the pipeline that performs inference.", @@ -74,15 +73,7 @@ "fiftyone", is_flag=True, default=False, - help="Whether to launch the FiftyOne app pipeline.", -) -@click.option( - "--stack", - "-s", - "stack", - required=False, - type=click.Choice(["alexej", "hamza", "alex"]), - help="The stack to use for the pipeline.", + help="Whether to launch the FiftyOne app.", ) @click.option( "--local", @@ -90,64 +81,104 @@ "train_local", is_flag=True, default=False, - help="Whether to train local.", + help="Whether to train local or an a remote orchestrator/ step operator.", ) def main( - ingest_data_pipeline: bool = False, + ingest_data: bool = False, export_pipeline: bool = False, - training_pipeline: bool = False, - inference_pipeline: bool = False, + train: bool = False, + batch_inference: bool = False, fiftyone: bool = False, - stack: UUID = "alexej", train_local: bool = False, ): - # TODO: remove all this :) - if stack == "hamza": - stack_id = UUID("cca5eaf7-0309-413d-89ff-1cd371b7d27c") - elif stack == "alex": - stack_id = UUID("fcf840ac-addd-4de3-a3e4-a1015f7bb96c") - else: - stack_id = UUID("7cda3cec-6744-48dc-8bdc-f102242a26d2") - client = Client() - if ingest_data_pipeline: - client.activate_stack(stack_id) - data_ingestion.with_options(config_path="configs/ingest_data.yaml")() + if ingest_data: + if not 
client.active_stack.orchestrator.config.is_local: + raise RuntimeError( + "The implementation of this pipeline " + "requires that you are running on a local " + "machine with data being persisted in the local " + "filesystem across multiple steps. Please " + "switch to a stack that contains a local " + "orchestrator and a local label-studio " + "annotator. See the README for more information " + "on this setup." + ) + + data_ingestion_pipeline.with_options( + config_path="configs/ingest_data.yaml" + )() if export_pipeline: - client.activate_stack(stack_id) + if not client.active_stack.orchestrator.config.is_local: + raise RuntimeError( + "The implementation of this pipeline " + "requires that you are running on a local " + "machine with a running instance of label-studio " + "configured in the stack as annotator." + " Please switch to a stack that contains a local " + "orchestrator and a local label-studio " + "annotator. See the README for more information " + "on this setup." + ) # Export data from label studio - export_for_training.with_options( - config_path="configs/data_export_alexej.yaml" + data_export_pipeline.with_options( + config_path="configs/data_export.yaml" )() - # Promote Model to staging - latest_model = Model(name=ZENML_MODEL_NAME, version=ModelStages.LATEST) - latest_model.set_stage(stage=ModelStages.STAGING, force=True) - - if training_pipeline and train_local: - client.activate_stack(stack_id) + if train: + try: + client.get_model_version( + model_name_or_id=ZENML_MODEL_NAME, + model_version_name_or_number_or_id=ModelStages.STAGING, + ) + except KeyError: + raise RuntimeError( + "This pipeline requires that there is a version of its " + "associated model in the `STAGING` stage. Make sure you run " + "the `data_export_pipeline` at least once to create the Model " + "along with a version of this model. After this you can " + "promote the version of your choice, either through the " + "frontend or with the following command: " + f"`zenml model version update {ZENML_MODEL_NAME} latest " + f"-s staging`" + ) + + if train_local: + config_path = "configs/training_pipeline.yaml" + else: + config_path = "configs/training_pipeline_remote_gpu.yaml" # Train model on data - training.with_options(config_path="configs/training.yaml")() - - if training_pipeline and not train_local: - client.activate_stack(REMOTE_STACK_ID) - - # Train model on data - training.with_options(config_path="configs/training_gpu.yaml")() - - if inference_pipeline: - client.activate_stack(stack_id) # REMOTE_STACK_ID) - - inference.with_options(config_path="configs/cloud_inference.yaml")() + training_pipeline.with_options(config_path=config_path)() + + if batch_inference: + try: + client.get_model_version( + model_name_or_id=ZENML_MODEL_NAME, + model_version_name_or_number_or_id=ModelStages.PRODUCTION, + ) + except KeyError: + raise RuntimeError( + "This pipeline requires that there is a version of its " + "associated model in the `Production` stage. Make sure you run " + "the `data_export_pipeline` at least once to create the Model " + "along with a version of this model. 
After this you can " + "promote the version of your choice, either through the " + "frontend or with the following command: " + f"`zenml model version update {ZENML_MODEL_NAME} staging " + f"-s production`" + ) + + inference_pipeline.with_options( + config_path="configs/inference_pipeline.yaml" + )() if fiftyone: import fiftyone as fo - client.activate_stack(stack_id) artifact = Client().get_artifact_version( name_id_or_prefix=PREDICTIONS_DATASET_ARTIFACT_NAME ) diff --git a/end-to-end-computer-vision/steps/export_label_studio.py b/end-to-end-computer-vision/steps/export_label_studio.py index 12e5cffb..0433ccf4 100644 --- a/end-to-end-computer-vision/steps/export_label_studio.py +++ b/end-to-end-computer-vision/steps/export_label_studio.py @@ -16,13 +16,13 @@ # from typing import Annotated, List, Tuple -from zenml import get_step_context, step +from zenml import log_artifact_metadata, step from zenml.client import Client from zenml.logger import get_logger -from materializers.label_studio_yolo_dataset_materializer import ( - LabelStudioYOLODataset, - LabelStudioYOLODatasetMaterializer, +from materializers.label_studio_export_materializer import ( + LabelStudioAnnotationExport, + LabelStudioAnnotationMaterializer, ) from utils.constants import LABELED_DATASET_NAME @@ -31,13 +31,13 @@ @step( output_materializers={ - LABELED_DATASET_NAME: LabelStudioYOLODatasetMaterializer + LABELED_DATASET_NAME: LabelStudioAnnotationMaterializer } ) def load_data_from_label_studio( dataset_name: str, ) -> Tuple[ - Annotated[LabelStudioYOLODataset, LABELED_DATASET_NAME], + Annotated[LabelStudioAnnotationExport, LABELED_DATASET_NAME], Annotated[List[int], "new_ids"], ]: """Loads data from Label Studio. @@ -61,37 +61,19 @@ def load_data_from_label_studio( if annotator and annotator._connection_available(): try: dataset = annotator.get_dataset(dataset_name=dataset_name) - ls_dataset = LabelStudioYOLODataset() + ls_dataset = LabelStudioAnnotationExport() ls_dataset.dataset = dataset - c = Client() - step_context = get_step_context() - cur_pipeline_name = step_context.pipeline.name - cur_step_name = step_context.step_name - - try: - last_run = c.get_pipeline( - cur_pipeline_name - ).last_successful_run - last_task_ids = ( - last_run.steps[cur_step_name].outputs["new_ids"].load() - ) - except (RuntimeError, KeyError): - last_task_ids = [] - current_labeled_task_ids = dataset.get_labeled_tasks_ids() - logger.info(f"{len(current_labeled_task_ids)} total labels found.") - new_task_ids = list( - set(current_labeled_task_ids) - set(last_task_ids) + ls_dataset.task_ids = current_labeled_task_ids + log_artifact_metadata( + metadata={ + "num_images": len(current_labeled_task_ids), + }, + artifact_name=LABELED_DATASET_NAME, ) - logger.info( - f"{len(new_task_ids)} new labels are being beamed " - f"straight to you." - ) - - ls_dataset.task_ids = new_task_ids - return ls_dataset, new_task_ids + return ls_dataset, current_labeled_task_ids except: raise ValueError( f"Dataset {dataset_name} not found in Label Studio." 
diff --git a/end-to-end-computer-vision/steps/fiftyone_inference.py b/end-to-end-computer-vision/steps/fiftyone_inference.py index 8c33f7eb..5a4584a7 100644 --- a/end-to-end-computer-vision/steps/fiftyone_inference.py +++ b/end-to-end-computer-vision/steps/fiftyone_inference.py @@ -33,7 +33,7 @@ os.environ["YOLO_VERBOSE"] = "False" -INFERENCE_BATCH = 20 +INFERENCE_BATCH = 5 @step diff --git a/end-to-end-computer-vision/steps/process_hf_dataset.py b/end-to-end-computer-vision/steps/process_hf_dataset.py new file mode 100644 index 00000000..e1847c3d --- /dev/null +++ b/end-to-end-computer-vision/steps/process_hf_dataset.py @@ -0,0 +1,97 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +from typing import Any, Dict + +from datasets import load_dataset +from PIL import Image +from zenml import step +from zenml.logger import get_logger + +from utils.dataset_utils import split_image_into_tiles + +Image.MAX_IMAGE_PIXELS = None + + +logger = get_logger(__name__) + + +@step +def process_hf_dataset( + dataset: str, data_source: str, max_tile_size: int = 1000 +) -> Dict[str, Any]: + """Downloads a Hugging Face dataset and does some processing. + + Converts the labels into the label_studio format. + Also uploads the images to the datasource path. + + The return of this step maps the path of images in the local fs to the + annotations in a format that label-studio can import. + {'data/image_116.png': [ + {'original_width': 907, + 'original_height': 937, + 'image_rotation': 0, + 'value': { + 'x': 66.26240352811466, + 'y': 77.5880469583778, + 'width': 5.071664829106946, + 'height': 3.4151547491995733, + 'rotation': 0, + 'rectanglelabels': ['ship'] + }, + 'from_name': 'label', + 'to_name': 'image', + 'type': 'rectanglelabels', + 'origin': 'manual'}, ... + ] + + Args: + dataset: Name of the hf dataset + data_source: Location of the image files + max_tile_size + + Returns: + Dictionary mapping filename to bboxes. 
You can find an example of this + dict in the docstring above + """ + # Load dataset from huggingface + dataset = load_dataset(dataset) + data = dataset["train"] + + # Create local directory to initially copy data to + local_output_dir = "data" + if not os.path.exists(local_output_dir): + os.mkdir(local_output_dir) + + # Dictionary that maps label information onto the corresponding image + all_images = {} + + # iterate through the dataset + for i, d in enumerate(data): + img_name = f"image_{i}" + + # in case images are very large, we split them up into 4 smaller tiles + split_image_into_tiles( + d=d, + img_name=img_name, + output_dir=local_output_dir, + all_images=all_images, + data_source=data_source, + max_tile_size=max_tile_size, + ) + + return all_images diff --git a/end-to-end-computer-vision/steps/train_model.py b/end-to-end-computer-vision/steps/train_model.py index 4f739d04..67be2774 100644 --- a/end-to-end-computer-vision/steps/train_model.py +++ b/end-to-end-computer-vision/steps/train_model.py @@ -20,10 +20,10 @@ from zenml import ArtifactConfig, log_artifact_metadata, step from zenml.logger import get_logger -from materializers.label_studio_yolo_dataset_materializer import ( - LabelStudioYOLODataset, +from materializers.label_studio_export_materializer import ( + LabelStudioAnnotationExport, ) -from materializers.yolo_materializer import UltralyticsMaterializer +from materializers.ultralytics_materializer import UltralyticsMaterializer from utils.dataset_utils import load_and_split_data logger = get_logger(__name__) @@ -35,7 +35,7 @@ ) def train_model( model: YOLO, - dataset: LabelStudioYOLODataset, + dataset: LabelStudioAnnotationExport, data_source: str, epochs: int = 100, batch_size: int = 16, diff --git a/end-to-end-computer-vision/steps/upload_to_label_studio.py b/end-to-end-computer-vision/steps/upload_to_label_studio.py index ecb52665..ffbecad5 100644 --- a/end-to-end-computer-vision/steps/upload_to_label_studio.py +++ b/end-to-end-computer-vision/steps/upload_to_label_studio.py @@ -18,6 +18,9 @@ from zenml import step from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) @step @@ -54,7 +57,9 @@ def upload_labels_to_label_studio( for task in tasks: filename = task["storage_filename"].split("/")[-1] - - project.create_annotation( - task["id"], result=labels_dict[filename], ground_truth=True - ) + try: + project.create_annotation( + task["id"], result=labels_dict[filename], ground_truth=True + ) + except KeyError: + logger.info(f"No labels found for {filename}. 
Skipping ...") diff --git a/end-to-end-computer-vision/utils/dataset_utils.py b/end-to-end-computer-vision/utils/dataset_utils.py index 1da26e09..20e38a78 100644 --- a/end-to-end-computer-vision/utils/dataset_utils.py +++ b/end-to-end-computer-vision/utils/dataset_utils.py @@ -16,15 +16,20 @@ # import os import tempfile +from typing import Any, Dict, List, Tuple +import numpy as np from PIL import Image from zenml.io import fileio +from zenml.logger import get_logger -from materializers.label_studio_yolo_dataset_materializer import ( - LabelStudioYOLODataset, +from materializers.label_studio_export_materializer import ( + LabelStudioAnnotationExport, ) from utils.split_data import generate_yaml, split_dataset, unzip_dataset +logger = get_logger(__name__) + def load_images_from_folder(folder): images = [] @@ -41,7 +46,7 @@ def load_images_from_folder(folder): def load_and_split_data( - dataset: LabelStudioYOLODataset, data_source: str + dataset: LabelStudioAnnotationExport, data_source: str ) -> str: """Load data from dataset into file system and split into train/val. @@ -70,11 +75,189 @@ def load_and_split_data( images_folder = os.path.join(extract_location, "images") os.makedirs(images_folder, exist_ok=True) - for filename in filenames: + total_images = len(filenames) + logger.info(f"Downloading images from {data_source}") + for index, filename in enumerate(filenames): src_path = f"{data_source}/{filename}.png" dst_path = os.path.join(images_folder, f"{filename}.png") fileio.copy(src_path, dst_path) + if (index + 1) % 100 == 0 or index == total_images - 1: + logger.info( + f"{index + 1} of {total_images} images have been downloaded..." + ) split_dataset(extract_location, ratio=(0.7, 0.15, 0.15), seed=42) yaml_path = generate_yaml(extract_location) return yaml_path + + +def convert_bbox_for_ls(x1, x2, y1, y2, width, height) -> Dict[str, Any]: + """This function converts the bounding box coordinates to the label studio format. + + Parameters: + x1, x2, y1, y2 (int): The initial bounding box coordinates in pixels. + width, height (int): The original dimensions of the image. + + Returns: + A dictionary approximating the label studio. + """ + x = x1 / width + y = y1 / height + w = (x2 - x1) / width + h = (y2 - y1) / height + print(f"Converting bbox to results.") + return { + "original_width": width, + "original_height": height, + "image_rotation": 0, + "value": { + "x": x * 100, + "y": y * 100, + "width": w * 100, + "height": h * 100, + "rotation": 0, + "rectanglelabels": ["ship"], + }, + "from_name": "label", + "to_name": "image", + "type": "rectanglelabels", + "origin": "manual", + } + + +def crop_and_adjust_bboxes( + image: np.array, bboxes: List[List[int]], crop_coordinates: Tuple[int] +): + """Crops an image and adjust the bboxes that are within the croped portion. 
+ + Args: + image: Image as np.array + bboxes: List of bboxes [[x1, y1, x2, y2], [x1, y1, x2, y2]] + crop_coordinates: Coordinates of the crop, format (x1, y1, x2, y2) + + Returns: + Tuple containing the cropped portion of the images and all bboxes + within the crop area + """ + x_crop_min, y_crop_min, x_crop_max, y_crop_max = crop_coordinates + cropped_image = image[y_crop_min:y_crop_max, x_crop_min:x_crop_max] + + adjusted_bboxes = [] + for bbox in bboxes: + xmin, ymin, xmax, ymax = bbox + + # Adjust bounding box coordinates + xmin_new = max(0, xmin - x_crop_min) + ymin_new = max(0, ymin - y_crop_min) + xmax_new = min(x_crop_max - x_crop_min, xmax - x_crop_min) + ymax_new = min(y_crop_max - y_crop_min, ymax - y_crop_min) + + # Check if the adjusted bounding box is still within the cropping area + if xmin_new < xmax_new and ymin_new < ymax_new: + adjusted_bboxes.append([xmin_new, ymin_new, xmax_new, ymax_new]) + + return cropped_image, adjusted_bboxes + + +def split_image_into_tiles( + d: Any, + img_name: str, + output_dir: str, + all_images: Dict[str, Any], + data_source: str, + max_tile_size: int = 500, +): + """Some images are too large to be useful, this splits them into multiple tiles. + + Args: + d: One hf dataset entry - needs to contain d['image'] and + d['objects']['bbox'] + img_name: Name of the image (excluding any suffix) + output_dir: Where to locally save the image + all_images: Dictionary containing the image_name<->bboxes mapping + max_tile_size: Maximum tile size + """ + tile_id = 0 + img = d["image"] + bboxes = d["objects"]["bbox"] + width, height = d["image"].size + + logger.info(f"Processing {img_name} ...") + for x in range(0, width, max_tile_size): + print(f"increased x={x}") + for y in range(0, height, max_tile_size): + + print(f"increased y={y}") + if x + max_tile_size <= width: + x1 = x + x2 = min(x + max_tile_size, width) + else: + # The last tile of the row also stays 500 pixels wide by + # cropping from max width to the left + x1 = max(0, width - max_tile_size) + x2 = width + if y + max_tile_size <= height: + y1 = y + y2 = min(y + max_tile_size, height) + else: + # The last tile of the row also stays 500 pixels high by + # cropping from max height up + y1 = max(0, height - max_tile_size) + y2 = height + cropped_img, adjusted_bboxes = crop_and_adjust_bboxes( + image=np.array(img), + bboxes=bboxes, + crop_coordinates=(x1, y1, x2, y2), + ) + + # store this tile + new_img_name = img_name + "_" + str(tile_id) + img_path = f"{output_dir}/{new_img_name}.png" + + print(f"Storing tile {tile_id} of {img_name} at {img_path} ...") + export_to_gcp( + data_source=data_source, + img=Image.fromarray(cropped_img), + img_name=new_img_name, + img_path=img_path, + ) + + logger.info(f"Calculating new bboxes for tile {img_path}") + tile_width = x2 - x1 + tile_height = y2 - y1 + + results = [] + for bbox in adjusted_bboxes: + results.append( + convert_bbox_for_ls( + x1=bbox[0], + x2=bbox[2], + y1=bbox[1], + y2=bbox[3], + width=tile_width, + height=tile_height, + ) + ) + + all_images[f"{new_img_name}.png"] = results + tile_id += 1 + + +def export_to_gcp(data_source: str, img: Image, img_name: str, img_path: str): + """ + Saves a given image locally, then copies it to a Google Cloud Storage bucket. + + Parameters: + data_source: The name of source directory or bucket where image needs to be + stored. + img: The image to be stored. This should be an instance of the PIL Image + class. + img_name: The name to be used when saving the image in the GCP bucket. 
Should not contain '.png' + img_path: The local path where the image will be temporarily saved before + being copied to the bucket. + """ + logger.info(f"Storing image to {img_path}.") + img.save(img_path) + bucket_path = os.path.join(data_source, f"{img_name}.png") + logger.info(f"Copying into gcp bucket {bucket_path}") + fileio.copy(img_path, bucket_path, overwrite=True)