diff --git a/classifier-e2e/pipelines/__init__.py b/classifier-e2e/pipelines/__init__.py
index 0faf2902..a3ed214d 100644
--- a/classifier-e2e/pipelines/__init__.py
+++ b/classifier-e2e/pipelines/__init__.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
+from .deploy import deploy
 from .feature_engineering import feature_engineering
 from .inference import inference
 from .training import training
-from .deploy import deploy
\ No newline at end of file
diff --git a/classifier-e2e/pipelines/deploy.py b/classifier-e2e/pipelines/deploy.py
index f9b77fc1..fd9138d8 100644
--- a/classifier-e2e/pipelines/deploy.py
+++ b/classifier-e2e/pipelines/deploy.py
@@ -1,7 +1,13 @@
-from zenml import pipeline, get_pipeline_context
-from steps import data_loader, inference_preprocessor
 import random
-from steps import deploy_endpoint, predict_on_endpoint, shutdown_endpoint
+
+from steps import (
+    data_loader,
+    deploy_endpoint,
+    inference_preprocessor,
+    predict_on_endpoint,
+    shutdown_endpoint,
+)
+from zenml import get_pipeline_context, pipeline
 
 
 @pipeline
diff --git a/classifier-e2e/pipelines/feature_engineering.py b/classifier-e2e/pipelines/feature_engineering.py
index 0dcdbb74..5cc712df 100644
--- a/classifier-e2e/pipelines/feature_engineering.py
+++ b/classifier-e2e/pipelines/feature_engineering.py
@@ -15,15 +15,14 @@
 # limitations under the License.
 #
 
-from typing import List, Optional
 import random
+from typing import List, Optional
 
 from steps import (
     data_loader,
     data_preprocessor,
     data_splitter,
 )
-
 from zenml import pipeline
 from zenml.logger import get_logger
 
@@ -59,7 +58,7 @@ def feature_engineering(
     # Link all the steps together by calling them and passing the output
     # of one step as the input of the next step.
     if random_state is None:
-        random_state = random.randint(0,1000)
+        random_state = random.randint(0, 1000)
     raw_data = data_loader(random_state=random_state, target=target)
     dataset_trn, dataset_tst = data_splitter(
         dataset=raw_data,
diff --git a/classifier-e2e/pipelines/inference.py b/classifier-e2e/pipelines/inference.py
index 46620c80..fb0fdb82 100644
--- a/classifier-e2e/pipelines/inference.py
+++ b/classifier-e2e/pipelines/inference.py
@@ -20,7 +20,6 @@
     inference_predict,
     inference_preprocessor,
 )
-
 from zenml import get_pipeline_context, pipeline
 from zenml.logger import get_logger
 
@@ -41,7 +40,9 @@ def inference(random_state: str, target: str):
         target: Name of target column in dataset.
     """
     # Get the production model artifact
-    model = get_pipeline_context().model.get_artifact("breast_cancer_classifier")
+    model = get_pipeline_context().model.get_artifact(
+        "breast_cancer_classifier"
+    )
 
     # Get the preprocess pipeline artifact associated with this version
     preprocess_pipeline = get_pipeline_context().model.get_artifact(
diff --git a/classifier-e2e/pipelines/training.py b/classifier-e2e/pipelines/training.py
index be95df32..da52374b 100644
--- a/classifier-e2e/pipelines/training.py
+++ b/classifier-e2e/pipelines/training.py
@@ -19,13 +19,13 @@
 from uuid import UUID
 
 from steps import model_evaluator, model_promoter, model_trainer
+from zenml import pipeline
+from zenml.client import Client
+from zenml.logger import get_logger
 
 from pipelines import (
     feature_engineering,
 )
-from zenml import pipeline
-from zenml.client import Client
-from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
@@ -58,7 +58,9 @@ def training(
 
     # Execute Feature Engineering Pipeline
     if train_dataset_id is None or test_dataset_id is None:
-        dataset_trn, dataset_tst = feature_engineering(random_state=random_state)
+        dataset_trn, dataset_tst = feature_engineering(
+            random_state=random_state
+        )
     else:
         client = Client()
         dataset_trn = client.get_artifact_version(
diff --git a/classifier-e2e/run.py b/classifier-e2e/run.py
index 68033d98..b91526c9 100644
--- a/classifier-e2e/run.py
+++ b/classifier-e2e/run.py
@@ -26,7 +26,6 @@
     inference,
     training,
 )
-
 from zenml.client import Client
 from zenml.logger import get_logger
 
@@ -204,7 +203,7 @@ def main(
             "test_dataset_id"
         ] = test_dataset_artifact_version.id
 
-    run_args_train["random_state"] = random.randint(0,1000)
+    run_args_train["random_state"] = random.randint(0, 1000)
 
     # Run the SGD pipeline
     pipeline_args = {}
@@ -224,7 +223,9 @@ def main(
             config_folder, f"training_xgboost{custom_training_suffix}.yaml"
         )
         training.with_options(**pipeline_args)(**run_args_train)
-        logger.info("Training pipeline with XGBoost finished successfully!\n\n")
+        logger.info(
+            "Training pipeline with XGBoost finished successfully!\n\n"
+        )
 
     if inference_pipeline:
         run_args_inference = {}
diff --git a/classifier-e2e/steps/__init__.py b/classifier-e2e/steps/__init__.py
index b17987b9..9d24cfc8 100644
--- a/classifier-e2e/steps/__init__.py
+++ b/classifier-e2e/steps/__init__.py
@@ -9,20 +9,20 @@
 from .data_splitter import (
     data_splitter,
 )
+from .deploy_endpoint import deploy_endpoint
 from .inference_predict import (
     inference_predict,
 )
 from .inference_preprocessor import (
     inference_preprocessor,
 )
+from .misc_endpoint import predict_on_endpoint, shutdown_endpoint
 from .model_evaluator import (
     model_evaluator,
 )
-from .model_trainer import (
-    model_trainer,
-)
 from .model_promoter import (
     model_promoter,
 )
-from .deploy_endpoint import deploy_endpoint
-from .misc_endpoint import predict_on_endpoint, shutdown_endpoint
\ No newline at end of file
+from .model_trainer import (
+    model_trainer,
+)
diff --git a/classifier-e2e/steps/data_loader.py b/classifier-e2e/steps/data_loader.py
index 1934baa8..0275afe4 100644
--- a/classifier-e2e/steps/data_loader.py
+++ b/classifier-e2e/steps/data_loader.py
@@ -18,7 +18,6 @@
 import pandas as pd
 from sklearn.datasets import load_breast_cancer
 from typing_extensions import Annotated
-
 from zenml import step
 from zenml.logger import get_logger
 
diff --git a/classifier-e2e/steps/data_preprocessor.py b/classifier-e2e/steps/data_preprocessor.py
index 1bf99025..2987f835 100644
--- a/classifier-e2e/steps/data_preprocessor.py
+++ b/classifier-e2e/steps/data_preprocessor.py
@@ -22,7 +22,6 @@
 from sklearn.preprocessing import MinMaxScaler
 from typing_extensions import Annotated
 from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper
-
 from zenml import log_artifact_metadata, step
 
 
diff --git a/classifier-e2e/steps/data_splitter.py b/classifier-e2e/steps/data_splitter.py
index bb0e9bd2..7ba58bea 100644
--- a/classifier-e2e/steps/data_splitter.py
+++ b/classifier-e2e/steps/data_splitter.py
@@ -20,7 +20,6 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from typing_extensions import Annotated
-
 from zenml import step
 
 
diff --git a/classifier-e2e/steps/deploy_endpoint.py b/classifier-e2e/steps/deploy_endpoint.py
index 7f5ad977..ad166525 100644
--- a/classifier-e2e/steps/deploy_endpoint.py
+++ b/classifier-e2e/steps/deploy_endpoint.py
@@ -1,24 +1,24 @@
-from typing_extensions import Annotated
+from datetime import datetime
 
 import sagemaker
-from sagemaker.image_uris import retrieve
 from sagemaker import Predictor
-
-from zenml import step, get_step_context, ArtifactConfig, log_artifact_metadata
-from datetime import datetime
-
+from sagemaker.image_uris import retrieve
+from typing_extensions import Annotated
 from utils.aws import get_aws_config
 from utils.sagemaker_materializer import SagemakerPredictorMaterializer
+from zenml import ArtifactConfig, get_step_context, log_artifact_metadata, step
 
 
 @step(
     enable_cache=False,
     output_materializers=[SagemakerPredictorMaterializer],
 )
-def deploy_endpoint() -> Annotated[
-    Predictor,
-    ArtifactConfig(name="sagemaker_endpoint", is_deployment_artifact=True),
-]:
+def deploy_endpoint() -> (
+    Annotated[
+        Predictor,
+        ArtifactConfig(name="sagemaker_endpoint", is_deployment_artifact=True),
+    ]
+):
     role, session, region = get_aws_config()
 
     model = get_step_context().model._get_model_version()
diff --git a/classifier-e2e/steps/inference_predict.py b/classifier-e2e/steps/inference_predict.py
index cd1d2921..60f9267f 100644
--- a/classifier-e2e/steps/inference_predict.py
+++ b/classifier-e2e/steps/inference_predict.py
@@ -19,7 +19,6 @@
 
 import pandas as pd
 from typing_extensions import Annotated
-
 from zenml import step
 from zenml.logger import get_logger
 
diff --git a/classifier-e2e/steps/inference_preprocessor.py b/classifier-e2e/steps/inference_preprocessor.py
index d484433e..d12247e0 100644
--- a/classifier-e2e/steps/inference_preprocessor.py
+++ b/classifier-e2e/steps/inference_preprocessor.py
@@ -18,7 +18,6 @@
 import pandas as pd
 from sklearn.pipeline import Pipeline
 from typing_extensions import Annotated
-
 from zenml import step
 
 
diff --git a/classifier-e2e/steps/misc_endpoint.py b/classifier-e2e/steps/misc_endpoint.py
index aa6c7c8d..9628a71e 100644
--- a/classifier-e2e/steps/misc_endpoint.py
+++ b/classifier-e2e/steps/misc_endpoint.py
@@ -1,9 +1,7 @@
+import pandas as pd
+from sagemaker.predictor import Predictor
 from typing_extensions import Annotated
-
-
 from zenml import step
-from sagemaker.predictor import Predictor
-import pandas as pd
 
 
 @step
diff --git a/classifier-e2e/steps/model_evaluator.py b/classifier-e2e/steps/model_evaluator.py
index db15b689..dd335ae5 100644
--- a/classifier-e2e/steps/model_evaluator.py
+++ b/classifier-e2e/steps/model_evaluator.py
@@ -18,20 +18,18 @@
 from typing import Optional
 
 import pandas as pd
+import wandb
 from sklearn.base import ClassifierMixin
 from sklearn.metrics import confusion_matrix
-
 from zenml import (
+    get_step_context,
     log_artifact_metadata,
-    step,
     log_model_metadata,
-    get_step_context,
+    step,
 )
-from zenml.logger import get_logger
-import wandb
 from zenml.client import Client
 from zenml.exceptions import StepContextError
-
+from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/classifier-e2e/steps/model_promoter.py b/classifier-e2e/steps/model_promoter.py
index aac7083f..5dcd5edb 100644
--- a/classifier-e2e/steps/model_promoter.py
+++ b/classifier-e2e/steps/model_promoter.py
@@ -15,10 +15,10 @@
 # limitations under the License.
 #
 
+from sklearn.metrics import accuracy_score
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
-from sklearn.metrics import accuracy_score
 
 logger = get_logger(__name__)
 
@@ -58,7 +58,9 @@ def model_promoter(accuracy: float, stage: str = "production") -> bool:
     try:
         stage_model = client.get_model_version(current_model.name, stage)
         # We compare their metrics
-        prod_classifier = stage_model.get_artifact("breast_cancer_classifier")
+        prod_classifier = stage_model.get_artifact(
+            "breast_cancer_classifier"
+        )
         if prod_classifier:
             # and recompute metrics for current prod model using current test set
             prod_classifier = prod_classifier.load()
diff --git a/classifier-e2e/steps/model_trainer.py b/classifier-e2e/steps/model_trainer.py
index 3b84362a..aef08e27 100644
--- a/classifier-e2e/steps/model_trainer.py
+++ b/classifier-e2e/steps/model_trainer.py
@@ -21,15 +21,18 @@
 from sklearn.base import ClassifierMixin
 from sklearn.linear_model import SGDClassifier
 from typing_extensions import Annotated
-
+from utils.sagemaker_materializer import SagemakerMaterializer
 from zenml import ArtifactConfig, step
 from zenml.logger import get_logger
-from utils.sagemaker_materializer import SagemakerMaterializer
-
 
 logger = get_logger(__name__)
 
-@step(output_materializers=[SagemakerMaterializer,])
+
+@step(
+    output_materializers=[
+        SagemakerMaterializer,
+    ]
+)
 def model_trainer(
     dataset_trn: pd.DataFrame,
     model_type: str = "sgd",
diff --git a/classifier-e2e/utils/__init__.py b/classifier-e2e/utils/__init__.py
index 8d4e9614..757bd841 100644
--- a/classifier-e2e/utils/__init__.py
+++ b/classifier-e2e/utils/__init__.py
@@ -1,16 +1,16 @@
 # Apache Software License 2.0
-# 
+#
 # Copyright (c) ZenML GmbH 2024. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# 
+#
diff --git a/classifier-e2e/utils/aws.py b/classifier-e2e/utils/aws.py
index c74bb410..cc80ef6d 100644
--- a/classifier-e2e/utils/aws.py
+++ b/classifier-e2e/utils/aws.py
@@ -1,5 +1,5 @@
 import os
-from typing import Tuple, Any
+from typing import Any, Tuple
 
 import boto3
 import sagemaker
diff --git a/classifier-e2e/utils/sagemaker_materializer.py b/classifier-e2e/utils/sagemaker_materializer.py
index aa886af9..8dec8320 100644
--- a/classifier-e2e/utils/sagemaker_materializer.py
+++ b/classifier-e2e/utils/sagemaker_materializer.py
@@ -1,17 +1,18 @@
 import os
+import tarfile
+import tempfile
 from typing import Type, Union
 
+import joblib
+from sagemaker import Predictor
+from sklearn.base import ClassifierMixin
+from sklearn.linear_model import SGDClassifier
+from xgboost import XGBClassifier
 from zenml.enums import ArtifactType
 from zenml.io import fileio
 from zenml.materializers.base_materializer import BaseMaterializer
 from zenml.materializers.built_in_materializer import BuiltInMaterializer
-from sklearn.linear_model import SGDClassifier
-from xgboost import XGBClassifier
-import tarfile
-import tempfile
-import joblib
-from sklearn.base import ClassifierMixin
-from sagemaker import Predictor
+
 
 class SagemakerMaterializer(BaseMaterializer):
     ASSOCIATED_TYPES = (ClassifierMixin,)
@@ -77,9 +78,7 @@ class SagemakerPredictorMaterializer(BaseMaterializer):
     ASSOCIATED_TYPES = (Predictor,)
     ASSOCIATED_ARTIFACT_TYPE = ArtifactType.SERVICE
 
-    def load(
-        self, data_type: Type[Predictor]
-    ) -> Predictor:
+    def load(self, data_type: Type[Predictor]) -> Predictor:
         """Read from artifact store."""
         return Predictor(endpoint_name=BuiltInMaterializer(self.uri).load(str))
 
diff --git a/classifier-e2e/utils/sklearn_inference.py b/classifier-e2e/utils/sklearn_inference.py
index bf7a8d2c..ecae1af3 100644
--- a/classifier-e2e/utils/sklearn_inference.py
+++ b/classifier-e2e/utils/sklearn_inference.py
@@ -1,35 +1,45 @@
-import joblib
 import os
-import pandas as pd
 from io import StringIO
 
+import joblib
+import pandas as pd
+
 """
 Deserialize fitted model
 """
+
+
 def model_fn(model_dir):
     model = joblib.load(os.path.join(model_dir, "sklearn-model"))
     return model
 
+
 """
 input_fn
     request_body: The body of the request sent to the model.
     request_content_type: (string) specifies the format/variable type of the request
 """
+
+
 def input_fn(request_body, request_content_type):
-    if request_content_type == 'text/csv':
+    if request_content_type == "text/csv":
         request = pd.read_csv(StringIO(request_body))
         return request
     else:
        raise ValueError("This model only supports text/csv input")
 
+
 """
 predict_fn
     input_data: returned array from input_fn above
     model (sklearn model) returned model loaded from model_fn above
 """
+
+
 def predict_fn(input_data, model):
     return model.predict(input_data)
 
+
 """
 output_fn
     prediction: the returned value from predict_fn above
@@ -37,5 +47,6 @@ def predict_fn(input_data, model):
 """
+
 def output_fn(prediction, content_type):
-    return pd.Series(prediction).to_csv(index=False,header=False)
\ No newline at end of file
+    return pd.Series(prediction).to_csv(index=False, header=False)
diff --git a/customer-churn/steps/evaluation.py b/customer-churn/steps/evaluation.py
index 0539c2ce..5da06149 100644
--- a/customer-churn/steps/evaluation.py
+++ b/customer-churn/steps/evaluation.py
@@ -9,9 +9,9 @@
 
 
 @step
-def evaluation(
-    model: ClassifierMixin, test: pd.DataFrame
-) -> Output(accuracy=float):
+def evaluation(model: ClassifierMixin, test: pd.DataFrame) -> Output(
+    accuracy=float
+):
     """
     Args:
         model: ClassifierMixin
diff --git a/customer-churn/steps/src/data_processing.py b/customer-churn/steps/src/data_processing.py
index 9a3d2879..24c3d82d 100644
--- a/customer-churn/steps/src/data_processing.py
+++ b/customer-churn/steps/src/data_processing.py
@@ -16,9 +16,9 @@
     def __init__(self) -> None:
         """Initialize the DataProcessor class."""
         pass
 
-    def encode_categorical_columns(
-        self, data: pd.DataFrame
-    ) -> Output(data=pd.DataFrame):
+    def encode_categorical_columns(self, data: pd.DataFrame) -> Output(
+        data=pd.DataFrame
+    ):
         """
         Encode categorical columns to numeric values using LabelEncoder.
 
@@ -69,9 +69,9 @@ def mean_encoding(self, data: pd.DataFrame) -> Output(data=pd.DataFrame):
         except Exception as e:
             logger.error(e)
 
-    def drop_columns(
-        self, data: pd.DataFrame
-    ) -> Output(output_data=pd.DataFrame):
+    def drop_columns(self, data: pd.DataFrame) -> Output(
+        output_data=pd.DataFrame
+    ):
         """
         Drop columns from the dataframe by using several methods.
 
@@ -123,9 +123,9 @@ def single_value_column_remover(self, data: pd.DataFrame) -> pd.DataFrame:
         except Exception as e:
             logger.error(e)
 
-    def handle_missing_values(
-        self, data: pd.DataFrame
-    ) -> Output(data=pd.DataFrame):
+    def handle_missing_values(self, data: pd.DataFrame) -> Output(
+        data=pd.DataFrame
+    ):
         """
         Handle missing values by filling them with mean values.
 
diff --git a/customer-churn/steps/trainer.py b/customer-churn/steps/trainer.py
index b43c92f3..fe55514b 100644
--- a/customer-churn/steps/trainer.py
+++ b/customer-churn/steps/trainer.py
@@ -17,9 +17,9 @@ class ModelNameConfig(BaseParameters):
 
 
 @step
-def model_trainer(
-    train: pd.DataFrame, config: ModelNameConfig
-) -> Output(model=ClassifierMixin):
+def model_trainer(train: pd.DataFrame, config: ModelNameConfig) -> Output(
+    model=ClassifierMixin
+):
     """Trains a specified model."""
     try:
         x_train, x_test, y_train, y_test = train_test_split(
diff --git a/customer-satisfaction/model/model_dev.py b/customer-satisfaction/model/model_dev.py
index 83b9098a..c5548279 100644
--- a/customer-satisfaction/model/model_dev.py
+++ b/customer-satisfaction/model/model_dev.py
@@ -98,7 +98,9 @@ def __init__(
         self.x_test = x_test
         self.y_test = y_test
 
-    def random_forest_trainer(self, fine_tuning: bool = True) -> RegressorMixin:
+    def random_forest_trainer(
+        self, fine_tuning: bool = True
+    ) -> RegressorMixin:
         """
         It trains the random forest model.
 
diff --git a/customer-satisfaction/pipelines/deployment_pipeline.py b/customer-satisfaction/pipelines/deployment_pipeline.py
index bc1be4df..ffc20711 100644
--- a/customer-satisfaction/pipelines/deployment_pipeline.py
+++ b/customer-satisfaction/pipelines/deployment_pipeline.py
@@ -1,22 +1,18 @@
 import os
 
-from zenml.integrations.mlflow.steps import mlflow_model_deployer_step
-
-from zenml import pipeline
-
 from pipelines.training_pipeline import customer_satisfaction_training_pipeline
 from steps import predictor
 from steps.dynamic_importer import dynamic_importer
 from steps.model_loader import model_loader
 from steps.prediction_service_loader import prediction_service_loader
+from zenml import pipeline
+from zenml.integrations.mlflow.steps import mlflow_model_deployer_step
 
 requirements_file = os.path.join(os.path.dirname(__file__), "requirements.txt")
 
 
 @pipeline
-def continuous_deployment_pipeline(
-    model_type: str = "lightgbm"
-):
+def continuous_deployment_pipeline(model_type: str = "lightgbm"):
     """Run a training job and deploy an mlflow model deployment."""
     # Run a training pipeline
     customer_satisfaction_training_pipeline(model_type=model_type)
@@ -24,14 +20,12 @@ def continuous_deployment_pipeline(
 
     # Fetch the production model from the Model Registry
     production_model = model_loader(
         model_name="Customer_Satisfaction_Predictor",
-        after="model_promoter" # Make sure this runs only once the training pipeline is done
+        after="model_promoter",  # Make sure this runs only once the training pipeline is done
     )
 
     # (Re)deploy the production model
     mlflow_model_deployer_step(
-        workers=3,
-        deploy_decision=True,
-        model=production_model
+        workers=3, deploy_decision=True, model=production_model
     )
 
@@ -41,6 +35,6 @@ def inference_pipeline():
     batch_data = dynamic_importer()
     model_deployment_service = prediction_service_loader(
         pipeline_name="continuous_deployment_pipeline",
-        step_name="mlflow_model_deployer_step"
+        step_name="mlflow_model_deployer_step",
     )
     predictor(service=model_deployment_service, input_data=batch_data)
diff --git a/customer-satisfaction/pipelines/training_pipeline.py b/customer-satisfaction/pipelines/training_pipeline.py
index 96a166cf..e48524d8 100644
--- a/customer-satisfaction/pipelines/training_pipeline.py
+++ b/customer-satisfaction/pipelines/training_pipeline.py
@@ -1,17 +1,15 @@
-from typing import Tuple, Annotated
-
-from sklearn.base import RegressorMixin
-from zenml import pipeline
-
 from steps import (
-    ingest_data, clean_data, train_model, evaluation, model_promoter
+    clean_data,
+    evaluation,
+    ingest_data,
+    model_promoter,
+    train_model,
 )
+from zenml import pipeline
 
 
 @pipeline
-def customer_satisfaction_training_pipeline(
-    model_type: str = "lightgbm"
-):
+def customer_satisfaction_training_pipeline(model_type: str = "lightgbm"):
     """Training Pipeline.
 
     Args:
@@ -19,7 +17,13 @@
     """
     df = ingest_data()
     x_train, x_test, y_train, y_test = clean_data(df)
-    model = train_model(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test, model_type=model_type)
+    model = train_model(
+        x_train=x_train,
+        x_test=x_test,
+        y_train=y_train,
+        y_test=y_test,
+        model_type=model_type,
+    )
     mse, rmse = evaluation(model, x_test, y_test)
     is_promoted = model_promoter(mse=mse)
-    return model, is_promoted
\ No newline at end of file
+    return model, is_promoted
diff --git a/customer-satisfaction/pipelines/utils.py b/customer-satisfaction/pipelines/utils.py
index b537bf36..71dc6ecd 100644
--- a/customer-satisfaction/pipelines/utils.py
+++ b/customer-satisfaction/pipelines/utils.py
@@ -1,7 +1,6 @@
 import logging
 
 import pandas as pd
-
 from model.data_cleaning import DataCleaning
 
 
diff --git a/customer-satisfaction/run_deployment.py b/customer-satisfaction/run_deployment.py
index 09f4ca31..018ed0db 100644
--- a/customer-satisfaction/run_deployment.py
+++ b/customer-satisfaction/run_deployment.py
@@ -1,13 +1,12 @@
 import click
-
 from pipelines.deployment_pipeline import (
     continuous_deployment_pipeline,
-    inference_pipeline
+    inference_pipeline,
 )
 from rich import print
 from zenml.integrations.mlflow.mlflow_utils import get_tracking_uri
 from zenml.integrations.mlflow.model_deployers.mlflow_model_deployer import (
-    MLFlowModelDeployer
+    MLFlowModelDeployer,
 )
 
@@ -23,9 +22,13 @@
     "-m",
     type=click.Choice(["lightgbm", "randomforest", "xgboost"]),
     default="xgboost",
-    help="Here you can choose what type of model should be trained."
+    help="Here you can choose what type of model should be trained.",
 )
-def run_main(stop_service: bool, model_type: str, model_name="Customer_Satisfaction_Predictor"):
+def run_main(
+    stop_service: bool,
+    model_type: str,
+    model_name="Customer_Satisfaction_Predictor",
+):
     """Run the mlflow example pipeline"""
     if stop_service:
         # get the MLflow model deployer stack component
@@ -43,7 +46,9 @@ def run_main(stop_service: bool, model_type: str, model_name="Customer_Satisfact
             existing_services[0].stop(timeout=10)
         return
 
-    continuous_deployment_pipeline.with_options(config_path="config.yaml")(model_type=model_type)
+    continuous_deployment_pipeline.with_options(config_path="config.yaml")(
+        model_type=model_type
+    )
 
     model_deployer = MLFlowModelDeployer.get_active_model_deployer()
 
@@ -60,7 +65,7 @@ def run_main(stop_service: bool, model_type: str, model_name="Customer_Satisfact
     # fetch existing services with same pipeline name, step name and model name
     service = model_deployer.find_model_server(
         pipeline_name="continuous_deployment_pipeline",
-        pipeline_step_name="mlflow_model_deployer_step"
+        pipeline_step_name="mlflow_model_deployer_step",
     )
 
     if service[0]:
diff --git a/customer-satisfaction/run_pipeline.py b/customer-satisfaction/run_pipeline.py
index f35f5483..d1e9b909 100644
--- a/customer-satisfaction/run_pipeline.py
+++ b/customer-satisfaction/run_pipeline.py
@@ -1,8 +1,7 @@
+import click
 from pipelines.training_pipeline import customer_satisfaction_training_pipeline
 from zenml.integrations.mlflow.mlflow_utils import get_tracking_uri
 
-import click
-
 
 @click.command()
 @click.option(
@@ -10,12 +9,13 @@
     "-m",
     type=click.Choice(["lightgbm", "randomforest", "xgboost"]),
     default="xgboost",
-    help="Here you can choose what type of model should be trained."
+    help="Here you can choose what type of model should be trained.",
 )
 def main(model_type: str):
     (
-        customer_satisfaction_training_pipeline
-        .with_options(config_path="config.yaml")(model_type)
+        customer_satisfaction_training_pipeline.with_options(
+            config_path="config.yaml"
+        )(model_type)
     )
 
     print(
@@ -29,6 +29,3 @@ def main(model_type: str):
 
 if __name__ == "__main__":
     main()
-
-
-
diff --git a/customer-satisfaction/steps/__init__.py b/customer-satisfaction/steps/__init__.py
index c5a1cea3..570688a8 100644
--- a/customer-satisfaction/steps/__init__.py
+++ b/customer-satisfaction/steps/__init__.py
@@ -7,12 +7,8 @@
 from .ingest_data import (
     ingest_data,
 )
+from .model_promoter import model_promoter
+from .predictor import predictor
 from .train_model import (
     train_model,
 )
-from .predictor import (
-    predictor
-)
-from .model_promoter import (
-    model_promoter
-)
diff --git a/customer-satisfaction/steps/dynamic_importer.py b/customer-satisfaction/steps/dynamic_importer.py
index 129ef361..e902074b 100644
--- a/customer-satisfaction/steps/dynamic_importer.py
+++ b/customer-satisfaction/steps/dynamic_importer.py
@@ -1,8 +1,7 @@
 from typing import Annotated
 
-from zenml import step
-
 from pipelines.utils import get_data_for_test
+from zenml import step
 
 
 @step(enable_cache=False)
diff --git a/customer-satisfaction/steps/evaluation.py b/customer-satisfaction/steps/evaluation.py
index 27b37d25..ec69a93f 100644
--- a/customer-satisfaction/steps/evaluation.py
+++ b/customer-satisfaction/steps/evaluation.py
@@ -1,26 +1,21 @@
 import logging
-from typing import Tuple, Annotated
+from typing import Annotated, Tuple
 
 import mlflow
 import numpy as np
 import pandas as pd
 from model.evaluator import Evaluator
 from sklearn.base import RegressorMixin
+from zenml import get_step_context, log_artifact_metadata, step
 from zenml.client import Client
-from zenml import step, get_step_context, log_artifact_metadata
 
 experiment_tracker = Client().active_stack.experiment_tracker
 
 
 @step(experiment_tracker=experiment_tracker.name)
 def evaluation(
-    model: RegressorMixin,
-    x_test: pd.DataFrame,
-    y_test: pd.Series
-) -> Tuple[
-    Annotated[float, "r2_score"],
-    Annotated[float, "rmse"]
-]:
+    model: RegressorMixin, x_test: pd.DataFrame, y_test: pd.Series
+) -> Tuple[Annotated[float, "r2_score"], Annotated[float, "rmse"]]:
     """Evaluates the Model on the Test Dataset and returns the metrics.
 
     Args:
@@ -52,7 +47,7 @@ def evaluation(
             "metrics": {
                 "r2_score": float(r2_score),
                 "mse": float(mse),
-                "rmse": float(rmse)
+                "rmse": float(rmse),
             }
         },
         artifact_name=artifact.name,
diff --git a/customer-satisfaction/steps/ingest_data.py b/customer-satisfaction/steps/ingest_data.py
index 8dfaf2de..e1541c1c 100644
--- a/customer-satisfaction/steps/ingest_data.py
+++ b/customer-satisfaction/steps/ingest_data.py
@@ -20,7 +20,7 @@ def get_data(self) -> pd.DataFrame:
 
 @step
 def ingest_data() -> pd.DataFrame:
-    """ Ingest Data and return a Dataframe with the whole dataset.
+    """Ingest Data and return a Dataframe with the whole dataset.
 
     Returns:
         df: pd.DataFrame
diff --git a/customer-satisfaction/steps/model_loader.py b/customer-satisfaction/steps/model_loader.py
index 310587e9..939deb26 100644
--- a/customer-satisfaction/steps/model_loader.py
+++ b/customer-satisfaction/steps/model_loader.py
@@ -1,20 +1,14 @@
 from sklearn.base import RegressorMixin
-from zenml import step, Model
-from zenml.client import Client
+from zenml import Model, step
 
 
 @step
-def model_loader(
-    model_name: str
-) -> RegressorMixin:
+def model_loader(model_name: str) -> RegressorMixin:
     """Implements a simple model loader that loads the current production model.
 
     Args:
         model_name: Name of the Model to load
     """
-    model = Model(
-        name=model_name,
-        version="production"
-    )
+    model = Model(name=model_name, version="production")
     model_artifact: RegressorMixin = model.load_artifact("sklearn_regressor")
-    return model_artifact
\ No newline at end of file
+    return model_artifact
diff --git a/customer-satisfaction/steps/model_promoter.py b/customer-satisfaction/steps/model_promoter.py
index c32f8cf1..8276156e 100644
--- a/customer-satisfaction/steps/model_promoter.py
+++ b/customer-satisfaction/steps/model_promoter.py
@@ -1,14 +1,11 @@
-from zenml import get_step_context, step, Model
+from zenml import Model, get_step_context, step
 from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 @step
-def model_promoter(
-    mse: float,
-    stage: str = "production"
-) -> bool:
+def model_promoter(mse: float, stage: str = "production") -> bool:
     """Model promotion step
 
     Step that conditionally promotes a model in case it has an MSE greater than
@@ -26,14 +23,15 @@ def model_promoter(mse: float, stage: str = "production") -> bool:
 
     # Get the previous model version at the production stage
     previous_production_model = Model(
-        name=zenml_model.name,
-        version="production"
+        name=zenml_model.name, version="production"
     )
 
     try:
         # In case there already is a model version at the correct stage
         previous_production_model_mse = float(
-            previous_production_model.get_artifact("sklearn_regressor").run_metadata["metrics"].value["mse"]
+            previous_production_model.get_artifact("sklearn_regressor")
+            .run_metadata["metrics"]
+            .value["mse"]
         )
     except RuntimeError:
         # In case no model version has been promoted before,
diff --git a/customer-satisfaction/steps/predictor.py b/customer-satisfaction/steps/predictor.py
index c4ed192d..fc74f32b 100644
--- a/customer-satisfaction/steps/predictor.py
+++ b/customer-satisfaction/steps/predictor.py
@@ -1,8 +1,9 @@
-from zenml import step
-from zenml.integrations.mlflow.services import MLFlowDeploymentService
-import numpy as np
 import json
+
+import numpy as np
 import pandas as pd
+from zenml import step
+from zenml.integrations.mlflow.services import MLFlowDeploymentService
 
 
 @step(enable_cache=False)
@@ -34,4 +35,4 @@ def predictor(
     json_list = json.loads(json.dumps(list(df.T.to_dict().values())))
     data = np.array(json_list)
     prediction = service.predict(data)
-    return prediction
\ No newline at end of file
+    return prediction
diff --git a/customer-satisfaction/steps/train_model.py b/customer-satisfaction/steps/train_model.py
index 66109f32..98f7b4c0 100644
--- a/customer-satisfaction/steps/train_model.py
+++ b/customer-satisfaction/steps/train_model.py
@@ -3,13 +3,10 @@
 
 import mlflow
 import pandas as pd
-
 from model.model_dev import ModelTrainer
 from sklearn.base import RegressorMixin
+from zenml import ArtifactConfig, step
 from zenml.client import Client
-from zenml import step
-from zenml import ArtifactConfig
-
 
 experiment_tracker = Client().active_stack.experiment_tracker
 
@@ -21,10 +18,10 @@ def train_model(
     y_train: pd.Series,
     y_test: pd.Series,
     model_type: str = "lightgbm",
-    do_fine_tuning: bool = True
+    do_fine_tuning: bool = True,
 ) -> Annotated[
     RegressorMixin,
-    ArtifactConfig(name="sklearn_regressor", is_model_artifact=True)
+    ArtifactConfig(name="sklearn_regressor", is_model_artifact=True),
 ]:
     """
     Args:
diff --git a/customer-satisfaction/streamlit_app.py b/customer-satisfaction/streamlit_app.py
index 5b70fcea..2a6496ae 100644
--- a/customer-satisfaction/streamlit_app.py
+++ b/customer-satisfaction/streamlit_app.py
@@ -4,9 +4,8 @@
 import pandas as pd
 import streamlit as st
 from PIL import Image
-from zenml.integrations.mlflow.model_deployers import MLFlowModelDeployer
-
 from run_deployment import run_main
+from zenml.integrations.mlflow.model_deployers import MLFlowModelDeployer
 
 
 def main():
@@ -69,7 +68,7 @@ def main():
 
         service = model_deployer.find_model_server(
             pipeline_name="continuous_deployment_pipeline",
-            pipeline_step_name="mlflow_model_deployer_step"
+            pipeline_step_name="mlflow_model_deployer_step",
         )[0]
         if service is None:
             st.write(
diff --git a/huggingface-sagemaker/gradio/app.py b/huggingface-sagemaker/gradio/app.py
index 61dbb15f..e481b79f 100644
--- a/huggingface-sagemaker/gradio/app.py
+++ b/huggingface-sagemaker/gradio/app.py
@@ -37,10 +37,14 @@
     "--model_name_or_path", default=None, help="Name or the path of the model."
 )
 @click.option(
-    "--labels", default="Negative,Positive", help="Comma-separated list of labels."
+    "--labels",
+    default="Negative,Positive",
+    help="Comma-separated list of labels.",
 )
 @click.option(
-    "--title", default="ZenML NLP Use-Case", help="Title of the Gradio interface."
+    "--title",
+    default="ZenML NLP Use-Case",
+    help="Title of the Gradio interface.",
 )
 @click.option(
     "--description",
@@ -61,7 +65,7 @@
     "--pipeline_version",
     default="3",
     help="Which version of the deploy pipeline should be deployed.",
-    type=int
+    type=int,
 )
 def sentiment_analysis(
     tokenizer_name_or_path: Optional[str],
@@ -71,7 +75,7 @@ def sentiment_analysis(
     description: Optional[str],
     interpretation: Optional[str],
     pipeline_version: int,
-    examples: Optional[str]
+    examples: Optional[str],
 ):
     """Launches a Gradio interface for sentiment analysis.
 
@@ -119,10 +123,14 @@ def analyze_text(inference_type, text):
             model_path = f"{dirname(__file__)}/{model_name_or_path}/"
             print(f"Loading model from {model_path}")
             if tokenizer_name_or_path:
-                tokenizer_path = f"{dirname(__file__)}/{tokenizer_name_or_path}/"
+                tokenizer_path = (
+                    f"{dirname(__file__)}/{tokenizer_name_or_path}/"
+                )
                 print(f"Loading tokenizer from {tokenizer_path}")
                 tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-            model = AutoModelForSequenceClassification.from_pretrained(model_path)
+            model = AutoModelForSequenceClassification.from_pretrained(
+                model_path
+            )
 
             text = preprocess(text)
             encoded_input = tokenizer(text, return_tensors="pt")
@@ -149,9 +157,15 @@ def analyze_text(inference_type, text):
             )
             res = predictor.predict({"inputs": text})
             if res[0]["label"] == "LABEL_1":
-                scores = {"Negative": 1 - res[0]["score"], "Positive": res[0]["score"]}
+                scores = {
+                    "Negative": 1 - res[0]["score"],
+                    "Positive": res[0]["score"],
+                }
             else:
-                scores = {"Negative": res[0]["score"], "Positive": 1 - res[0]["score"]}
+                scores = {
+                    "Negative": res[0]["score"],
+                    "Positive": 1 - res[0]["score"],
+                }
         return scores
 
@@ -159,7 +173,9 @@ def analyze_text(inference_type, text):
         fn=analyze_text,
         inputs=[
             gr.Dropdown(
-                ["local", "sagemaker"], label="Select inference type", value="sagemaker"
+                ["local", "sagemaker"],
+                label="Select inference type",
+                value="sagemaker",
             ),
             gr.TextArea("Write your text or tweet here", label="Analyze Text"),
         ],
diff --git a/huggingface-sagemaker/model_deployers/hf_sagemaker_client.py b/huggingface-sagemaker/model_deployers/hf_sagemaker_client.py
index aca40b09..0942157b 100644
--- a/huggingface-sagemaker/model_deployers/hf_sagemaker_client.py
+++ b/huggingface-sagemaker/model_deployers/hf_sagemaker_client.py
@@ -271,7 +271,9 @@ def sanitize_tags(tags: Dict[str, str]) -> None:
             # Kubernetes labels must be alphanumeric, no longer than
             # 63 characters, and must begin and end with an alphanumeric
            # character ([a-z0-9A-Z])
-            tags[key] = re.sub(r"[^0-9a-zA-Z-_\.]+", "_", value)[:63].strip("-_.")
+            tags[key] = re.sub(r"[^0-9a-zA-Z-_\.]+", "_", value)[:63].strip(
+                "-_."
+            )
 
     def create_deployment(
         self,
@@ -298,7 +300,9 @@ def create_deployment(
             the deployment.
         """
         try:
-            logger.debug(f"Creating HFSagemakerDeployment resource: {deployment}")
+            logger.debug(
+                f"Creating HFSagemakerDeployment resource: {deployment}"
+            )
 
             # mark the deployment as managed by ZenML, to differentiate
             # between deployments that are created by ZenML and those that
@@ -306,18 +310,21 @@ def create_deployment(
             deployment.mark_as_managed_by_zenml()
 
             body_deploy = deployment.dict(exclude_none=True)
-            response = self._custom_objects_api.create_namespaced_custom_object(
-                group="machinelearning.seldon.io",
-                version="v1",
-                namespace=self._namespace,
-                plural="seldondeployments",
-                body=body_deploy,
-                _request_timeout=poll_timeout or None,
+            response = (
+                self._custom_objects_api.create_namespaced_custom_object(
+                    group="machinelearning.seldon.io",
+                    version="v1",
+                    namespace=self._namespace,
+                    plural="seldondeployments",
+                    body=body_deploy,
+                    _request_timeout=poll_timeout or None,
+                )
             )
             logger.debug("HFSagemaker Core API response: %s", response)
         except k8s_client.rest.ApiException as e:
             logger.error(
-                "Exception when creating HFSagemakerDeployment resource: %s", str(e)
+                "Exception when creating HFSagemakerDeployment resource: %s",
+                str(e),
             )
             if e.status == 409:
                 raise HFSagemakerDeploymentExistsError(
@@ -367,14 +374,16 @@ def delete_deployment(
             # a HFSagemakerDeploymentNotFoundError otherwise
             self.get_deployment(name=name)
 
-            response = self._custom_objects_api.delete_namespaced_custom_object(
-                group="machinelearning.seldon.io",
-                version="v1",
-                namespace=self._namespace,
-                plural="seldondeployments",
-                name=name,
-                _request_timeout=poll_timeout or None,
-                grace_period_seconds=0 if force else None,
+            response = (
+                self._custom_objects_api.delete_namespaced_custom_object(
+                    group="machinelearning.seldon.io",
+                    version="v1",
+                    namespace=self._namespace,
+                    plural="seldondeployments",
+                    name=name,
+                    _request_timeout=poll_timeout or None,
+                    grace_period_seconds=0 if force else None,
+                )
             )
             logger.debug("HFSagemaker Core API response: %s", response)
         except k8s_client.rest.ApiException as e:
@@ -418,7 +427,9 @@ def update_deployment(
             deployment.
         """
         try:
-            logger.debug(f"Updating HFSagemakerDeployment resource: {deployment.name}")
+            logger.debug(
+                f"Updating HFSagemakerDeployment resource: {deployment.name}"
+            )
 
             # mark the deployment as managed by ZenML, to differentiate
             # between deployments that are created by ZenML and those that
@@ -442,7 +453,8 @@ def update_deployment(
             logger.debug("HFSagemaker Core API response: %s", response)
         except k8s_client.rest.ApiException as e:
             logger.error(
-                "Exception when updating HFSagemakerDeployment resource: %s", str(e)
+                "Exception when updating HFSagemakerDeployment resource: %s",
+                str(e),
             )
             raise HFSagemakerClientError(
                 "Exception when creating HFSagemakerDeployment resource"
@@ -526,7 +538,9 @@ def find_deployments(
         # Initialize a list to store the filtered SageMaker endpoints
         filtered_endpoints = []
 
-        def initialize_sagemaker_predictors(filtered_endpoints, sagemaker_session):
+        def initialize_sagemaker_predictors(
+            filtered_endpoints, sagemaker_session
+        ):
             predictors = []
 
             for endpoint in filtered_endpoints:
@@ -558,7 +572,8 @@ def initialize_sagemaker_predictors(filtered_endpoints, sagemaker_session):
                 # Check if all tags in the 'tags' parameter are present in 'endpoint_tags'
                 all_tags_present = all(
                     all(
-                        tag_key in endpoint_tags and endpoint_tags[tag_key] == tag_value
+                        tag_key in endpoint_tags
+                        and endpoint_tags[tag_key] == tag_value
                         for tag_key, tag_value in tags.items()
                     )
                 )
@@ -574,7 +589,9 @@ def initialize_sagemaker_predictors(filtered_endpoints, sagemaker_session):
             # Handle exceptions appropriately
             logger.error(f"An error occurred: {str(e)}")
 
-        return initialize_sagemaker_predictors(filtered_endpoints, sagemaker_session)
+        return initialize_sagemaker_predictors(
+            filtered_endpoints, sagemaker_session
+        )
 
     @staticmethod
     def sanitize_tags(tags: Dict[str, str]) -> None:
@@ -583,4 +600,6 @@ def sanitize_tags(tags: Dict[str, str]) -> None:
         # Kubernetes labels must be alphanumeric, no longer than
         # 63 characters, and must begin and end with an alphanumeric
         # character ([a-z0-9A-Z])
-        tags[key] = re.sub(r"[^0-9a-zA-Z-_\.]+", "_", value)[:63].strip("-_.")
+        tags[key] = re.sub(r"[^0-9a-zA-Z-_\.]+", "_", value)[:63].strip(
+            "-_."
+        )
diff --git a/huggingface-sagemaker/model_deployers/hf_sagemaker_deployment_service.py b/huggingface-sagemaker/model_deployers/hf_sagemaker_deployment_service.py
index 672694c4..11d2c342 100644
--- a/huggingface-sagemaker/model_deployers/hf_sagemaker_deployment_service.py
+++ b/huggingface-sagemaker/model_deployers/hf_sagemaker_deployment_service.py
@@ -129,7 +129,9 @@ def create_from_deployment(
             the expected annotations or it contains an invalid or incompatible
             Seldon Core service configuration.
         """
-        config_data = deployment.metadata.annotations.get("zenml.service_config")
+        config_data = deployment.metadata.annotations.get(
+            "zenml.service_config"
+        )
         if not config_data:
             raise ValueError(
                 f"The given deployment resource does not contain a "
@@ -174,7 +176,9 @@ def get_sagemaker_session(
         self, config: HFSagemakerDeploymentConfig
     ) -> sagemaker.Session:
         """Returns sagemaker session from connector"""
-        session = sagemaker.Session(boto3.Session(**config.sagemaker_session_args))
+        session = sagemaker.Session(
+            boto3.Session(**config.sagemaker_session_args)
+        )
         return session
 
     def check_status(self) -> Tuple[ServiceState, str]:
@@ -201,7 +205,8 @@ def check_status(self) -> Tuple[ServiceState, str]:
         if deployment.is_failed():
             return (
                 ServiceState.ERROR,
-                f"Seldon Core deployment '{name}' failed: " f"{deployment.get_error()}",
+                f"Seldon Core deployment '{name}' failed: "
+                f"{deployment.get_error()}",
             )
 
         pending_message = deployment.get_pending_message() or ""
diff --git a/huggingface-sagemaker/model_deployers/hf_sagemaker_model_deployer.py b/huggingface-sagemaker/model_deployers/hf_sagemaker_model_deployer.py
index ca6510a3..28fb2fc2 100644
--- a/huggingface-sagemaker/model_deployers/hf_sagemaker_model_deployer.py
+++ b/huggingface-sagemaker/model_deployers/hf_sagemaker_model_deployer.py
@@ -47,7 +47,9 @@ class HFSagemakerModelDeployer(BaseModelDeployer):
     """Huggingface Sagemaker model deployer stack component implementation."""
 
     NAME: ClassVar[str] = "Huggingface Sagemaker"
-    FLAVOR: ClassVar[Type[BaseModelDeployerFlavor]] = HFSagemakerModelDeployerFlavor
+    FLAVOR: ClassVar[
+        Type[BaseModelDeployerFlavor]
+    ] = HFSagemakerModelDeployerFlavor
 
     @property
     def config(self) -> HFSagemakerModelDeployerConfig:
@@ -160,7 +162,9 @@ def deploy_model(
         if service:
             # update an equivalent service in place
             service.update(config)
-            logger.info(f"Updating an existing Seldon deployment service: {service}")
+            logger.info(
+                f"Updating an existing Seldon deployment service: {service}"
+            )
         else:
             # create a new service
             service = HFSagemakerDeploymentService(config=config)
diff --git a/huggingface-sagemaker/pipelines/__init__.py b/huggingface-sagemaker/pipelines/__init__.py
index 0f5aa8ca..a515a775 100644
--- a/huggingface-sagemaker/pipelines/__init__.py
+++ b/huggingface-sagemaker/pipelines/__init__.py
@@ -17,6 +17,8 @@
 
 from .deploying import sentinment_analysis_deploy_pipeline
-from .feature_engineering import sentinment_analysis_feature_engineering_pipeline
+from .feature_engineering import (
+    sentinment_analysis_feature_engineering_pipeline,
+)
 from .promoting import sentinment_analysis_promote_pipeline
 from .training import sentinment_analysis_training_pipeline
diff --git a/huggingface-sagemaker/pipelines/deploying.py b/huggingface-sagemaker/pipelines/deploying.py
index 18d7eb9e..608c4782 100644
--- a/huggingface-sagemaker/pipelines/deploying.py
+++ b/huggingface-sagemaker/pipelines/deploying.py
@@ -16,15 +16,14 @@
 #
 
-from zenml import pipeline
-from zenml.client import Client
-from zenml.logger import get_logger
-
 from steps import (
     deploy_hf_to_sagemaker,
     notify_on_failure,
     notify_on_success,
 )
+from zenml import pipeline
+from zenml.client import Client
+from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/huggingface-sagemaker/pipelines/feature_engineering.py b/huggingface-sagemaker/pipelines/feature_engineering.py
index f85ee7cb..d149e34b 100644
--- a/huggingface-sagemaker/pipelines/feature_engineering.py
+++ b/huggingface-sagemaker/pipelines/feature_engineering.py
@@ -18,21 +18,20 @@
 
 from typing import Optional
 
-from zenml import pipeline
-from zenml.logger import get_logger
-
 from steps import (
     data_loader,
+    generate_reference_and_comparison_datasets,
     notify_on_failure,
     tokenization_step,
     tokenizer_loader,
-    generate_reference_and_comparison_datasets,
 )
+from zenml import pipeline
 from zenml.integrations.evidently.metrics import EvidentlyMetricConfig
 from zenml.integrations.evidently.steps import (
     EvidentlyColumnMapping,
     evidently_report_step,
 )
+from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
@@ -71,9 +70,10 @@ def sentinment_analysis_feature_engineering_pipeline(
     dataset = data_loader()
 
     ########## Data Quality stage ##########
-    reference_dataset, comparison_dataset = generate_reference_and_comparison_datasets(
-        dataset
-    )
+    (
+        reference_dataset,
+        comparison_dataset,
+    ) = generate_reference_and_comparison_datasets(dataset)
     text_data_report = evidently_report_step.with_options(
         parameters=dict(
             column_mapping=EvidentlyColumnMapping(
diff --git a/huggingface-sagemaker/pipelines/promoting.py b/huggingface-sagemaker/pipelines/promoting.py
index 8f57a80a..8a27d208 100644
--- a/huggingface-sagemaker/pipelines/promoting.py
+++ b/huggingface-sagemaker/pipelines/promoting.py
@@ -16,15 +16,14 @@
 #
 
-from zenml import pipeline
-from zenml.logger import get_logger
-
 from steps import (
     notify_on_failure,
     notify_on_success,
     promote_get_metrics,
     promote_metric_compare_promoter,
 )
+from zenml import pipeline
+from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/huggingface-sagemaker/pipelines/training.py b/huggingface-sagemaker/pipelines/training.py
index 18807aef..28422e71 100644
--- a/huggingface-sagemaker/pipelines/training.py
+++ b/huggingface-sagemaker/pipelines/training.py
@@ -18,11 +18,6 @@
 from typing import Optional
 from uuid import UUID
 
-from zenml import get_pipeline_context, pipeline
-from zenml.artifacts.external_artifact import ExternalArtifact
-from zenml.logger import get_logger
-
-from pipelines import sentinment_analysis_feature_engineering_pipeline
 from steps import (
     deploy_to_huggingface,
     model_trainer,
@@ -30,6 +25,11 @@
     notify_on_success,
     register_model,
 )
+from zenml import get_pipeline_context, pipeline
+from zenml.artifacts.external_artifact import ExternalArtifact
+from zenml.logger import get_logger
+
+from pipelines import sentinment_analysis_feature_engineering_pipeline
 
 logger = get_logger(__name__)
 
@@ -76,7 +76,10 @@ def sentinment_analysis_training_pipeline(
 
     # Execute Feature Engineering Pipeline
     if dataset_artifact_id is None:
-        tokenizer, tokenized_data = sentinment_analysis_feature_engineering_pipeline(
+        (
+            tokenizer,
+            tokenized_data,
+        ) = sentinment_analysis_feature_engineering_pipeline(
             lower_case=lower_case,
             padding=padding,
             max_seq_length=max_seq_length,
diff --git a/huggingface-sagemaker/run.py b/huggingface-sagemaker/run.py
index 8124d4ce..b43e4737 100644
--- a/huggingface-sagemaker/run.py
+++ b/huggingface-sagemaker/run.py
@@ -20,17 +20,16 @@
 from typing import Optional
 
 import click
-from zenml.client import Client
-from zenml.enums import ModelStages
-from zenml.logger import get_logger
-from zenml import Model
-
 from pipelines import (
     sentinment_analysis_deploy_pipeline,
     sentinment_analysis_feature_engineering_pipeline,
     sentinment_analysis_promote_pipeline,
     sentinment_analysis_training_pipeline,
 )
+from zenml import Model
+from zenml.client import Client
+from zenml.enums import ModelStages
+from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
@@ -197,8 +196,8 @@ def main(
     # all steps in the pipeline in the correct order using the orchestrator
     # stack component that is configured in your active ZenML stack.
     config_folder = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), 
-        "configs", 
+        os.path.dirname(os.path.realpath(__file__)),
+        "configs",
     )
     zenml_model = Model(
         name=zenml_model_name,
@@ -215,21 +214,25 @@ def main(
     # Execute Feature Engineering Pipeline
     if feature_pipeline:
         pipeline_args["model"] = zenml_model
-        pipeline_args["config_path"] = os.path.join(config_folder, "feature_engineering_config.yaml")
+        pipeline_args["config_path"] = os.path.join(
+            config_folder, "feature_engineering_config.yaml"
+        )
         run_args_feature = {
             "max_seq_length": max_seq_length,
         }
         pipeline_args[
             "run_name"
         ] = f"sentinment_analysis_feature_engineering_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}"
-        sentinment_analysis_feature_engineering_pipeline.with_options(**pipeline_args)(
-            **run_args_feature
-        )
+        sentinment_analysis_feature_engineering_pipeline.with_options(
+            **pipeline_args
+        )(**run_args_feature)
         logger.info("Feature Engineering pipeline finished successfully!")
 
     # Execute Training Pipeline
     if training_pipeline:
-        pipeline_args["config_path"] = os.path.join(config_folder, "trainer_config.yaml")
+        pipeline_args["config_path"] = os.path.join(
+            config_folder, "trainer_config.yaml"
+        )
 
         run_args_train = {
             "num_epochs": num_epochs,
@@ -254,8 +257,12 @@ def main(
                 "base_tokenizer", dataset_version_name
             )  # Use versioned artifacts
-            run_args_train["dataset_artifact_id"] = tokenized_dataset_artifact.id
-            run_args_train["tokenizer_artifact_id"] = tokenized_tokenizer_artifact.id
+            run_args_train[
+                "dataset_artifact_id"
+            ] = tokenized_dataset_artifact.id
+            run_args_train[
+                "tokenizer_artifact_id"
+            ] = tokenized_tokenizer_artifact.id
 
         pipeline_args["model"] = zenml_model
 
@@ -276,7 +283,9 @@ def main(
             name=zenml_model_name,
             version=ModelStages.LATEST,
         )
-        pipeline_args["config_path"] = os.path.join(config_folder, "promoting_config.yaml")
+        pipeline_args["config_path"] = os.path.join(
+            config_folder, "promoting_config.yaml"
+        )
 
         pipeline_args["model"] = zenml_model
 
@@ -289,7 +298,9 @@ def main(
         logger.info("Promoting pipeline finished successfully!")
 
     if deploying_pipeline:
-        pipeline_args["config_path"] = os.path.join(config_folder, "deploying_config.yaml")
+        pipeline_args["config_path"] = os.path.join(
+            config_folder, "deploying_config.yaml"
+        )
 
         # Deploying pipeline has new ZenML model config
         zenml_model = Model(
diff --git a/huggingface-sagemaker/run_delete_endpoint.py b/huggingface-sagemaker/run_delete_endpoint.py
index 62f0af23..166a9b9e 100644
--- a/huggingface-sagemaker/run_delete_endpoint.py
+++ b/huggingface-sagemaker/run_delete_endpoint.py
@@ -67,7 +67,9 @@ def main(
     latest_run = client.get_pipeline(
         deployment_pipeline_name, version=deployment_pipeline_version
     ).runs[0]
-    endpoint_name = latest_run.steps[step_name].outputs[step_output_name].load()
+    endpoint_name = (
+        latest_run.steps[step_name].outputs[step_output_name].load()
+    )
 
     logger.info(f"Deleting endpoint with name: {endpoint_name}")
     # Do a `aws sagemaker delete-endpoint --endpoint-name ` on the CLI
diff --git a/huggingface-sagemaker/steps/__init__.py b/huggingface-sagemaker/steps/__init__.py
index d31fbcee..3ad666de 100644
--- a/huggingface-sagemaker/steps/__init__.py
+++ b/huggingface-sagemaker/steps/__init__.py
@@ -28,10 +28,10 @@
 )
 from .promotion import promote_get_metrics, promote_metric_compare_promoter
 from .registerer import register_model
-from .tokenizer_loader import (
-    tokenizer_loader,
-)
 from .tokenization import (
     tokenization_step,
 )
+from .tokenizer_loader import (
+    tokenizer_loader,
+)
 from .training import model_trainer
diff --git a/huggingface-sagemaker/steps/dataset_loader/__init__.py b/huggingface-sagemaker/steps/dataset_loader/__init__.py
index aa373f69..b5442cd6 100644
--- a/huggingface-sagemaker/steps/dataset_loader/__init__.py
+++ b/huggingface-sagemaker/steps/dataset_loader/__init__.py
@@ -16,4 +16,6 @@
 #
 
 from .data_loader import data_loader
-from .generate_reference_and_comparison_datasets import generate_reference_and_comparison_datasets
+from .generate_reference_and_comparison_datasets import (
+    generate_reference_and_comparison_datasets,
+)
diff --git a/huggingface-sagemaker/steps/dataset_loader/data_loader.py b/huggingface-sagemaker/steps/dataset_loader/data_loader.py
index f31110df..61b40e95 100644
--- a/huggingface-sagemaker/steps/dataset_loader/data_loader.py
+++ b/huggingface-sagemaker/steps/dataset_loader/data_loader.py
@@ -55,12 +55,14 @@ def sample_dataset(dataset, sample_rate=0.2):
         sampled_dataset = DatasetDict()
         for split in dataset.keys():
             split_size = len(dataset[split])
-            indices = np.random.choice(split_size, int(split_size * sample_rate), replace=False)
+            indices = np.random.choice(
+                split_size, int(split_size * sample_rate), replace=False
+            )
             sampled_dataset[split] = dataset[split].select(indices)
         return sampled_dataset
 
     dataset = sample_dataset(dataset)
-    
+
     # Log the dataset and sample examples
     logger.info(dataset)
     logger.info(
diff --git a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
index 4ecb04fe..89d7305f 100644
--- a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
+++ b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
@@ -23,7 +23,6 @@
 from zenml.client import Client
 from zenml.logger import get_logger
 
-
 # Initialize logger
 logger = get_logger(__name__)
 
@@ -48,13 +47,13 @@ def deploy_to_huggingface(
         save_model_to_deploy.entrypoint()
 
     logger.info("Model saved locally. Pushing to HuggingFace...")
-    assert (
-        secret
-    ), "No secret found with name 'huggingface_creds'. Please create one with your `token`."
+    assert secret, "No secret found with name 'huggingface_creds'. Please create one with your `token`."
 
     token = secret.secret_values["token"]
     api = HfApi(token=token)
-    hf_repo = api.create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)
+    hf_repo = api.create_repo(
+        repo_id=repo_name, repo_type="model", exist_ok=True
+    )
     zenml_repo_root = Client().root
     if not zenml_repo_root:
         logger.warning(
diff --git a/huggingface-sagemaker/steps/deploying/sagemaker_deployment.py b/huggingface-sagemaker/steps/deploying/sagemaker_deployment.py
index 546917c0..ab1b2ca8 100644
--- a/huggingface-sagemaker/steps/deploying/sagemaker_deployment.py
+++ b/huggingface-sagemaker/steps/deploying/sagemaker_deployment.py
@@ -17,13 +17,12 @@
 
 from typing import Optional
 
+from gradio.aws_helper import get_sagemaker_role, get_sagemaker_session
 from sagemaker.huggingface import HuggingFaceModel
 from typing_extensions import Annotated
 from zenml import get_step_context, step
 from zenml.logger import get_logger
 
-from gradio.aws_helper import get_sagemaker_role, get_sagemaker_session
-
 # Initialize logger
 logger = get_logger(__name__)
 
@@ -50,7 +49,9 @@ def deploy_hf_to_sagemaker(
     if repo_id is None or revision is None:
         context = get_step_context()
         zenml_model = context.model
-        deployment_metadata = zenml_model.get_data_artifact(name="huggingface_url").run_metadata
+        deployment_metadata = zenml_model.get_data_artifact(
+            name="huggingface_url"
+        ).run_metadata
         repo_id = deployment_metadata["repo_id"].value
         revision = deployment_metadata["revision"].value
 
diff --git a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
index 7e97b52c..93cebad1 100644
--- a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
+++ b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
@@ -25,6 +25,7 @@
 
 logger = get_logger(__name__)
 
+
 @step
 def promote_get_metrics() -> (
     Tuple[
@@ -52,7 +53,11 @@ def promote_get_metrics() -> (
 
     # Get current model metric in current run
     current_zenml_model = get_step_context().model
-    current_metrics = current_zenml_model.get_model_artifact("model").run_metadata["metrics"].value
+    current_metrics = (
+        current_zenml_model.get_model_artifact("model")
+        .run_metadata["metrics"]
+        .value
+    )
     logger.info(f"Current model metrics are {current_metrics}")
 
     # Get latest saved model version metric in target environment
@@ -67,7 +72,9 @@ def promote_get_metrics() -> (
         latest_zenml_model = None
     if latest_zenml_model:
         latest_metrics = (
-            latest_zenml_model.get_model_artifact("model").run_metadata["metrics"].value
+            latest_zenml_model.get_model_artifact("model")
+            .run_metadata["metrics"]
+            .value
         )
         logger.info(f"Current model metrics are {latest_metrics}")
     else:
diff --git a/huggingface-sagemaker/steps/promotion/promote_metric_compare_promoter.py b/huggingface-sagemaker/steps/promotion/promote_metric_compare_promoter.py
index 4b38c657..585aff8f 100644
--- a/huggingface-sagemaker/steps/promotion/promote_metric_compare_promoter.py
+++ b/huggingface-sagemaker/steps/promotion/promote_metric_compare_promoter.py
@@ -66,7 +66,10 @@ def promote_metric_compare_promoter(
             f"Latest model metric={latest_metrics[metric_to_compare]:.6f}\n"
             f"Current model metric={current_metrics[metric_to_compare]:.6f}"
         )
-        if latest_metrics[metric_to_compare] < current_metrics[metric_to_compare]:
+        if (
+            latest_metrics[metric_to_compare]
+            < current_metrics[metric_to_compare]
+        ):
             logger.info(
                 "Current model outperformed latest model - promoting current"
             )
diff --git a/huggingface-sagemaker/steps/registerer/model_log_register.py b/huggingface-sagemaker/steps/registerer/model_log_register.py
index 6601d078..29ae5a59 100644
--- a/huggingface-sagemaker/steps/registerer/model_log_register.py
+++ b/huggingface-sagemaker/steps/registerer/model_log_register.py
@@ -24,7 +24,9 @@
 )
 from zenml import step
 from zenml.client import Client
-from zenml.integrations.mlflow.experiment_trackers import MLFlowExperimentTracker
+from zenml.integrations.mlflow.experiment_trackers import (
+    MLFlowExperimentTracker,
+)
 from zenml.logger import get_logger
 
 # Initialize logger
diff --git a/huggingface-sagemaker/steps/tokenization/tokenization.py b/huggingface-sagemaker/steps/tokenization/tokenization.py
index 2be84d1e..5e93e642 100644
--- a/huggingface-sagemaker/steps/tokenization/tokenization.py
+++ b/huggingface-sagemaker/steps/tokenization/tokenization.py
@@ -18,11 +18,10 @@
 from datasets import DatasetDict
 from transformers import PreTrainedTokenizerBase
 from typing_extensions import Annotated
+from utils.misc import find_max_length
 from zenml import step
 from zenml.logger import get_logger
 
-from utils.misc import find_max_length
-
 logger = get_logger(__name__)
 
@@ -56,7 +55,9 @@ def tokenization_step(
     train_max_length = find_max_length(dataset["train"][text_column])
 
     # Depending on the dataset, find the maximum length of text in the validation or test dataset
-    val_or_test_max_length = find_max_length(dataset["validation"][text_column])
+    val_or_test_max_length = find_max_length(
+        dataset["validation"][text_column]
+    )
     max_length = (
         train_max_length
         if train_max_length >= val_or_test_max_length
@@ -93,7 +94,9 @@ def preprocess_function(examples):
 
     # Remove the original text column and rename the label column
     tokenized_datasets = tokenized_datasets.remove_columns([text_column])
-    tokenized_datasets = tokenized_datasets.rename_column(label_column, "labels")
+    tokenized_datasets = tokenized_datasets.rename_column(
+        label_column, "labels"
+    )
 
     # Set the format of the tokenized dataset
     tokenized_datasets.set_format("torch")
diff --git a/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py b/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py
index 1200d7ea..faf80386 100644
--- a/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py
+++ b/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py
@@ -17,7 +17,7 @@
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 from typing_extensions import Annotated
-from zenml import step, ArtifactConfig
+from zenml import ArtifactConfig, step
 from zenml.logger import get_logger
 
 logger = get_logger(__name__)
@@ -27,7 +27,8 @@
 def tokenizer_loader(
     lower_case: bool,
 ) -> Annotated[
-    PreTrainedTokenizerBase, ArtifactConfig(name="base_tokenizer", is_model_artifact=True)
+    PreTrainedTokenizerBase,
+    ArtifactConfig(name="base_tokenizer", is_model_artifact=True),
 ]:
     """Tokenizer selection step.
 
diff --git a/huggingface-sagemaker/steps/training/model_trainer.py b/huggingface-sagemaker/steps/training/model_trainer.py
index 540204b4..88957340 100644
--- a/huggingface-sagemaker/steps/training/model_trainer.py
+++ b/huggingface-sagemaker/steps/training/model_trainer.py
@@ -28,13 +28,13 @@
     TrainingArguments,
 )
 from typing_extensions import Annotated
-from zenml import log_artifact_metadata, step, ArtifactConfig
+from utils.misc import compute_metrics
+from zenml import ArtifactConfig, log_artifact_metadata, step
 from zenml.client import Client
 from zenml.integrations.mlflow.experiment_trackers import (
     MLFlowExperimentTracker,
 )
 from zenml.logger import get_logger
-from utils.misc import compute_metrics
 
 # Initialize logger
 logger = get_logger(__name__)
@@ -64,9 +64,12 @@ def model_trainer(
     eval_batch_size: Optional[int] = 16,
     weight_decay: Optional[float] = 0.01,
 ) -> Tuple[
-    Annotated[PreTrainedModel, ArtifactConfig(name="model", is_model_artifact=True)],
     Annotated[
-        PreTrainedTokenizerBase, ArtifactConfig(name="tokenizer", is_model_artifact=True)
+        PreTrainedModel, ArtifactConfig(name="model", is_model_artifact=True)
+    ],
+    Annotated[
+        PreTrainedTokenizerBase,
+        ArtifactConfig(name="tokenizer", is_model_artifact=True),
     ],
 ]:
     """
diff --git a/label_studio_annotation/pipelines/inference_pipeline.py b/label_studio_annotation/pipelines/inference_pipeline.py
index 17948cfc..0da69a0d 100644
--- a/label_studio_annotation/pipelines/inference_pipeline.py
+++ b/label_studio_annotation/pipelines/inference_pipeline.py
@@ -19,7 +19,6 @@
     predictor,
 )
 from steps.sync_new_data_to_label_studio import data_sync
-
 from zenml import pipeline
 from zenml.logger import get_logger
 
diff --git a/label_studio_annotation/pipelines/training_pipeline.py b/label_studio_annotation/pipelines/training_pipeline.py
index a81d5808..0aa817e7 100644
--- a/label_studio_annotation/pipelines/training_pipeline.py
+++ b/label_studio_annotation/pipelines/training_pipeline.py
@@ -18,7 +18,6 @@
 from steps.pytorch_trainer import (
     pytorch_model_trainer,
 )
-
 from zenml import pipeline
 from zenml.integrations.label_studio.steps.label_studio_standard_steps import (
     get_labeled_data,
diff --git a/label_studio_annotation/steps/convert_annotations_step.py b/label_studio_annotation/steps/convert_annotations_step.py
index ea8bcd57..3a68e10e 100644
--- a/label_studio_annotation/steps/convert_annotations_step.py
+++ b/label_studio_annotation/steps/convert_annotations_step.py
@@ -14,7 +14,6 @@
 from typing import Any, Dict, List, Tuple
 
 from typing_extensions import Annotated
-
 from zenml import step
 from zenml.logger import get_logger
 
diff --git a/label_studio_annotation/steps/load_image_data_step.py b/label_studio_annotation/steps/load_image_data_step.py
index a1fa0639..1386ba47 100644
--- a/label_studio_annotation/steps/load_image_data_step.py
+++ b/label_studio_annotation/steps/load_image_data_step.py
@@ -18,7 +18,6 @@
 from typing import Annotated, Dict, Optional, Tuple
 
 from PIL import Image
-
 from zenml import get_step_context, step
 from zenml.integrations.pillow.materializers.pillow_image_materializer import (
     DEFAULT_IMAGE_FILENAME,
diff --git a/label_studio_annotation/steps/prediction_steps.py b/label_studio_annotation/steps/prediction_steps.py
index 6d7c9f13..fe412fce 100644
--- a/label_studio_annotation/steps/prediction_steps.py
+++ b/label_studio_annotation/steps/prediction_steps.py
@@ -16,7 +16,6 @@
 
 import torch
 from steps.pytorch_trainer import LABEL_MAPPING, load_mobilenetv3_transforms
 from typing_extensions import Annotated
-
from zenml import step from zenml.client import Client diff --git a/label_studio_annotation/steps/pytorch_trainer.py b/label_studio_annotation/steps/pytorch_trainer.py index 8e9b6921..ece79572 100644 --- a/label_studio_annotation/steps/pytorch_trainer.py +++ b/label_studio_annotation/steps/pytorch_trainer.py @@ -23,7 +23,6 @@ from PIL import Image from steps.get_or_create_dataset import LABELS from torchvision import models, transforms - from zenml import get_step_context, step from zenml.client import Client from zenml.integrations.label_studio.label_studio_utils import ( diff --git a/llm-agents/agent/agent_executor_materializer.py b/llm-agents/agent/agent_executor_materializer.py index a77606cf..c4b5a756 100644 --- a/llm-agents/agent/agent_executor_materializer.py +++ b/llm-agents/agent/agent_executor_materializer.py @@ -14,9 +14,8 @@ """Implementation of ZenML's pickle materializer.""" import os -from typing import Any, ClassVar, Tuple, Type - import pickle +from typing import Any, ClassVar, Tuple, Type from zenml.enums import ArtifactType from zenml.environment import Environment diff --git a/llm-agents/run.py b/llm-agents/run.py index 1682487b..94ac3827 100644 --- a/llm-agents/run.py +++ b/llm-agents/run.py @@ -13,9 +13,9 @@ # permissions and limitations under the License. import os -from pipelines.agent_creator import zenml_agent_creation_pipeline import click +from pipelines.agent_creator import zenml_agent_creation_pipeline from zenml.logger import get_logger logger = get_logger(__name__) @@ -51,11 +51,10 @@ def main( ) pipeline_args = {} if config: - pipeline_args["config_path"] = os.path.join( - config_folder, config - ) + pipeline_args["config_path"] = os.path.join(config_folder, config) zenml_agent_creation_pipeline.with_options(**pipeline_args)() - + + if __name__ == "__main__": main() diff --git a/llm-agents/steps/agent_creator.py b/llm-agents/steps/agent_creator.py index 77e53680..22a3c489 100644 --- a/llm-agents/steps/agent_creator.py +++ b/llm-agents/steps/agent_creator.py @@ -1,16 +1,14 @@ from typing import Dict -from typing_extensions import Annotated from agent.agent_executor_materializer import AgentExecutorMaterializer from agent.prompt import PREFIX, SUFFIX -from langchain.agents import ConversationalChatAgent +from langchain.agents import AgentExecutor, ConversationalChatAgent from langchain.chat_models import ChatOpenAI from langchain.schema.vectorstore import VectorStore from langchain.tools.vectorstore.tool import VectorStoreQATool -from langchain.agents import AgentExecutor from pydantic import BaseModel -from zenml import step, ArtifactConfig, log_artifact_metadata - +from typing_extensions import Annotated +from zenml import ArtifactConfig, log_artifact_metadata, step PIPELINE_NAME = "zenml_agent_creation_pipeline" # Choose what character to use for your agent's answers diff --git a/llm-agents/steps/index_generator.py b/llm-agents/steps/index_generator.py index 0f08f369..5632ec6c 100644 --- a/llm-agents/steps/index_generator.py +++ b/llm-agents/steps/index_generator.py @@ -12,17 +12,17 @@ # or implied. See the License for the specific language governing # permissions and limitations under the License. 
-from typing_extensions import Annotated from typing import List from langchain.docstore.document import Document from langchain.embeddings import OpenAIEmbeddings +from langchain.schema.vectorstore import VectorStore from langchain.text_splitter import ( CharacterTextSplitter, ) -from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.faiss import FAISS -from zenml import step, log_artifact_metadata +from typing_extensions import Annotated +from zenml import log_artifact_metadata, step @step diff --git a/llm-agents/steps/url_scraper.py b/llm-agents/steps/url_scraper.py index 0ab4436d..ef5ac8b0 100644 --- a/llm-agents/steps/url_scraper.py +++ b/llm-agents/steps/url_scraper.py @@ -13,9 +13,9 @@ # permissions and limitations under the License. from typing import List + from typing_extensions import Annotated -from steps.url_scraping_utils import get_all_pages, get_nested_readme_urls -from zenml import step, log_artifact_metadata +from zenml import log_artifact_metadata, step @step @@ -35,7 +35,7 @@ def url_scraper( Returns: List of URLs to scrape. """ - + # We comment this out to make this pipeline faster # examples_readme_urls = get_nested_readme_urls(repo_url) # docs_urls = get_all_pages(docs_url) diff --git a/llm-finetuning/huggingface/hf_deployment_base_config.py b/llm-finetuning/huggingface/hf_deployment_base_config.py index 2ecaaeed..2ff7025f 100644 --- a/llm-finetuning/huggingface/hf_deployment_base_config.py +++ b/llm-finetuning/huggingface/hf_deployment_base_config.py @@ -1,6 +1,6 @@ +from typing import Dict, Optional + from pydantic import BaseModel -from typing import Optional, Dict -from zenml.utils.secret_utils import SecretField class HuggingFaceBaseConfig(BaseModel): diff --git a/llm-finetuning/huggingface/hf_deployment_service.py b/llm-finetuning/huggingface/hf_deployment_service.py index 54b297ce..69a86e87 100644 --- a/llm-finetuning/huggingface/hf_deployment_service.py +++ b/llm-finetuning/huggingface/hf_deployment_service.py @@ -1,22 +1,21 @@ """Implementation of the Huggingface Deployment service.""" -from zenml.logger import get_logger -from typing import Generator, Tuple, Optional, Any, List -from zenml.services import ServiceType, ServiceState, ServiceStatus -from zenml.services.service import BaseDeploymentService, ServiceConfig +from typing import Any, Generator, Optional, Tuple + from huggingface_hub import ( InferenceClient, - InferenceEndpointError, InferenceEndpoint, + InferenceEndpointError, InferenceEndpointStatus, -) -from huggingface_hub.utils import HfHubHTTPError -from huggingface_hub import ( create_inference_endpoint, get_inference_endpoint, ) -from huggingface.hf_deployment_base_config import HuggingFaceBaseConfig - +from huggingface_hub.utils import HfHubHTTPError from pydantic import Field +from zenml.logger import get_logger +from zenml.services import ServiceState, ServiceStatus, ServiceType +from zenml.services.service import BaseDeploymentService, ServiceConfig + +from huggingface.hf_deployment_base_config import HuggingFaceBaseConfig logger = get_logger(__name__) diff --git a/llm-finetuning/huggingface/hf_model_deployer.py b/llm-finetuning/huggingface/hf_model_deployer.py index 544f03f3..082a02e1 100644 --- a/llm-finetuning/huggingface/hf_model_deployer.py +++ b/llm-finetuning/huggingface/hf_model_deployer.py @@ -1,27 +1,27 @@ """Implementation of the Huggingface Model Deployer.""" +from typing import ClassVar, Dict, List, Optional, Type, cast from uuid import UUID -from zenml.model_deployers import BaseModelDeployer -from 
huggingface.hf_model_deployer_flavor import HuggingFaceModelDeployerFlavor + +from huggingface_hub import InferenceEndpoint, list_inference_endpoints +from zenml.artifacts.utils import log_artifact_metadata, save_artifact +from zenml.client import Client from zenml.logger import get_logger +from zenml.model_deployers import BaseModelDeployer +from zenml.model_deployers.base_model_deployer import ( + DEFAULT_DEPLOYMENT_START_STOP_TIMEOUT, + BaseModelDeployerFlavor, +) +from zenml.services import BaseService, ServiceConfig, ServiceRegistry -from typing import List, Optional, cast, ClassVar, Type, Dict -from zenml.services import BaseService, ServiceConfig from huggingface.hf_deployment_service import ( HuggingFaceDeploymentService, HuggingFaceServiceConfig, ) from huggingface.hf_model_deployer_flavor import ( - HuggingFaceModelDeployerSettings, HuggingFaceModelDeployerConfig, + HuggingFaceModelDeployerFlavor, + HuggingFaceModelDeployerSettings, ) -from zenml.model_deployers.base_model_deployer import ( - DEFAULT_DEPLOYMENT_START_STOP_TIMEOUT, - BaseModelDeployerFlavor, -) -from huggingface_hub import InferenceEndpoint, list_inference_endpoints -from zenml.client import Client -from zenml.services import ServiceRegistry -from zenml.artifacts.utils import save_artifact, log_artifact_metadata logger = get_logger(__name__) diff --git a/llm-finetuning/huggingface/hf_model_deployer_flavor.py b/llm-finetuning/huggingface/hf_model_deployer_flavor.py index 6a85845a..ccdca7bd 100644 --- a/llm-finetuning/huggingface/hf_model_deployer_flavor.py +++ b/llm-finetuning/huggingface/hf_model_deployer_flavor.py @@ -1,13 +1,15 @@ """Huggingface model deployer flavor.""" -from typing import Optional, Type, TYPE_CHECKING +from typing import TYPE_CHECKING, Optional, Type + +from zenml.config.base_settings import BaseSettings from zenml.model_deployers.base_model_deployer import ( - BaseModelDeployerFlavor, BaseModelDeployerConfig, + BaseModelDeployerFlavor, ) -from zenml.config.base_settings import BaseSettings -from huggingface.hf_deployment_base_config import HuggingFaceBaseConfig from zenml.utils.secret_utils import SecretField +from huggingface.hf_deployment_base_config import HuggingFaceBaseConfig + if TYPE_CHECKING: from huggingface.hf_model_deployer import HuggingFaceModelDeployer diff --git a/llm-finetuning/materializers/__init__.py b/llm-finetuning/materializers/__init__.py index 6e4e80ae..320cbb9f 100644 --- a/llm-finetuning/materializers/__init__.py +++ b/llm-finetuning/materializers/__init__.py @@ -13,4 +13,7 @@ # permissions and limitations under the License. 
"""Implementation of the Huggingface Trainer materializer.""" -from .huggingface_model_materializer import HFTrainerMaterializer, DEFAULT_TRAINER_MODEL_DIR \ No newline at end of file +from .huggingface_model_materializer import ( + DEFAULT_TRAINER_MODEL_DIR, + HFTrainerMaterializer, +) diff --git a/llm-finetuning/materializers/huggingface_model_materializer.py b/llm-finetuning/materializers/huggingface_model_materializer.py index b26f9005..6e78dcd8 100644 --- a/llm-finetuning/materializers/huggingface_model_materializer.py +++ b/llm-finetuning/materializers/huggingface_model_materializer.py @@ -19,11 +19,10 @@ from typing import Any, ClassVar, Dict, Tuple, Type from transformers import ( # type: ignore [import-untyped] - Trainer, AutoConfig, - TrainingArguments + Trainer, + TrainingArguments, ) - from zenml.enums import ArtifactType from zenml.materializers.base_materializer import BaseMaterializer from zenml.metadata.metadata_types import DType, MetadataType @@ -32,6 +31,7 @@ DEFAULT_TRAINER_MODEL_DIR = "hf_model" DEFAULT_ARGS_JSON = "args.json" + class HFTrainerMaterializer(BaseMaterializer): """Materializer to read torch model to and from huggingface pretrained model.""" @@ -76,12 +76,10 @@ def save(self, trainer: Trainer) -> None: os.path.join(self.uri, DEFAULT_TRAINER_MODEL_DIR), ) yaml_utils.write_json( - os.path.join(self.uri, DEFAULT_ARGS_JSON), trainer.args.to_dict() + os.path.join(self.uri, DEFAULT_ARGS_JSON), trainer.args.to_dict() ) - def extract_metadata( - self, trainer: Trainer - ) -> Dict[str, "MetadataType"]: + def extract_metadata(self, trainer: Trainer) -> Dict[str, "MetadataType"]: """Extract metadata from the given `PreTrainedModel` object. Args: diff --git a/llm-finetuning/pipelines/__init__.py b/llm-finetuning/pipelines/__init__.py index 6e272689..79820207 100644 --- a/llm-finetuning/pipelines/__init__.py +++ b/llm-finetuning/pipelines/__init__.py @@ -15,6 +15,6 @@ # limitations under the License. # -from .generate_code_dataset import generate_code_dataset +from .deployment import huggingface_deployment from .finetune import finetune_starcoder -from .deployment import huggingface_deployment \ No newline at end of file +from .generate_code_dataset import generate_code_dataset diff --git a/llm-finetuning/pipelines/_generate_code_dataset_by_gpt4turbo.py b/llm-finetuning/pipelines/_generate_code_dataset_by_gpt4turbo.py index 208f39d7..1e917177 100644 --- a/llm-finetuning/pipelines/_generate_code_dataset_by_gpt4turbo.py +++ b/llm-finetuning/pipelines/_generate_code_dataset_by_gpt4turbo.py @@ -1,19 +1,24 @@ -from zenml.steps import step, Output -from zenml.pipelines import pipeline from zenml.artifacts import DataArtifact +from zenml.pipelines import pipeline +from zenml.steps import Output, step + # Step 1: Fetch the list of repositories from GitHub @step -def fetch_repos(username: str, access_token: str, include_fork: bool = False) -> Output(repos=list): +def fetch_repos( + username: str, access_token: str, include_fork: bool = False +) -> Output(repos=list): # ... (implementation of get_repos function) ... return get_repos(username, access_token, include_fork) + # Step 2: Clone the repositories locally @step def clone_repos(repos: list, mirror_directory: str) -> None: # ... (implementation of mirror_repositories function) ... 
mirror_repositories(repos, mirror_directory) + # Step 3: Read and filter files from the cloned repositories @step def read_and_filter_files(mirror_directory: str) -> DataArtifact: @@ -21,12 +26,16 @@ def read_and_filter_files(mirror_directory: str) -> DataArtifact: df = read_repository_files(mirror_directory) return df + # Step 4: Upload the DataFrame to the Hugging Face Hub @step -def upload_to_hf_hub(df: DataArtifact, dataset_id: str, file_format: str) -> None: +def upload_to_hf_hub( + df: DataArtifact, dataset_id: str, file_format: str +) -> None: # ... (implementation of upload_to_hub function) ... upload_to_hub(df, dataset_id, file_format) + # Define the pipeline @pipeline def github_to_hf_pipeline( @@ -40,13 +49,20 @@ def github_to_hf_pipeline( df = read_and_filter_files_step() upload_to_hf_hub_step(df) + # Run the pipeline if __name__ == "__main__": # Define the steps - fetch_repos_step = fetch_repos(username=ORG, access_token=os.environ["GH_ACCESS_TOKEN"]) + fetch_repos_step = fetch_repos( + username=ORG, access_token=os.environ["GH_ACCESS_TOKEN"] + ) clone_repos_step = clone_repos(mirror_directory=MIRROR_DIRECTORY) - read_and_filter_files_step = read_and_filter_files(mirror_directory=MIRROR_DIRECTORY) - upload_to_hf_hub_step = upload_to_hf_hub(dataset_id=DATASET_ID, file_format=FEATHER_FORMAT) + read_and_filter_files_step = read_and_filter_files( + mirror_directory=MIRROR_DIRECTORY + ) + upload_to_hf_hub_step = upload_to_hf_hub( + dataset_id=DATASET_ID, file_format=FEATHER_FORMAT + ) # Create and run the pipeline pipeline_instance = github_to_hf_pipeline( diff --git a/llm-finetuning/pipelines/deployment.py b/llm-finetuning/pipelines/deployment.py index 1aa11e5a..718c7bd9 100644 --- a/llm-finetuning/pipelines/deployment.py +++ b/llm-finetuning/pipelines/deployment.py @@ -15,10 +15,7 @@ # limitations under the License. # -from steps import ( - deploy_model_to_hf_hub -) - +from steps import deploy_model_to_hf_hub from zenml import pipeline from zenml.logger import get_logger diff --git a/llm-finetuning/pipelines/finetune.py b/llm-finetuning/pipelines/finetune.py index 70c03cff..effa162c 100644 --- a/llm-finetuning/pipelines/finetune.py +++ b/llm-finetuning/pipelines/finetune.py @@ -15,11 +15,7 @@ # limitations under the License. # -from steps import ( - trainer, - merge_and_push -) - +from steps import merge_and_push, trainer from zenml import pipeline from zenml.logger import get_logger @@ -33,5 +29,11 @@ def finetune_starcoder(): """ # Link all the steps together by calling them and passing the output # of one step as the input of the next step. - trainer_obj, tokenizer, output_peft_repo_id, train_dataset, eval_dataset = trainer() + ( + trainer_obj, + tokenizer, + output_peft_repo_id, + train_dataset, + eval_dataset, + ) = trainer() merge_and_push(peft_model_id=output_peft_repo_id) diff --git a/llm-finetuning/pipelines/generate_code_dataset.py b/llm-finetuning/pipelines/generate_code_dataset.py index cdf41411..666230f0 100644 --- a/llm-finetuning/pipelines/generate_code_dataset.py +++ b/llm-finetuning/pipelines/generate_code_dataset.py @@ -16,11 +16,9 @@ # from steps import ( - prepare_dataset, - push_to_hub, mirror_repositories, + prepare_dataset, ) - from zenml import pipeline from zenml.logger import get_logger @@ -28,9 +26,7 @@ @pipeline -def generate_code_dataset( - dataset_id: str -): +def generate_code_dataset(dataset_id: str): """ This pipeline generates the code dataset. 
""" diff --git a/llm-finetuning/steps/__init__.py b/llm-finetuning/steps/__init__.py index 9d9e5a73..ce9dd601 100644 --- a/llm-finetuning/steps/__init__.py +++ b/llm-finetuning/steps/__init__.py @@ -15,8 +15,8 @@ # limitations under the License. # +from .deployment import deploy_model_to_hf_hub from .parallel_clones import mirror_repositories from .prepare_dataset import prepare_dataset from .push_dataset_to_hub import push_to_hub -from .trainer import trainer, merge_and_push -from .deployment import deploy_model_to_hf_hub \ No newline at end of file +from .trainer import merge_and_push, trainer diff --git a/llm-finetuning/steps/deployment.py b/llm-finetuning/steps/deployment.py index 5d6572bc..a9f811e6 100644 --- a/llm-finetuning/steps/deployment.py +++ b/llm-finetuning/steps/deployment.py @@ -1,13 +1,13 @@ -from zenml import step -from zenml import get_step_context -from zenml.client import Client -from typing import Optional, cast, Dict -from zenml.logger import get_logger +from typing import Dict, Optional, cast + from huggingface.hf_deployment_service import ( HuggingFaceDeploymentService, HuggingFaceServiceConfig, ) from huggingface.hf_model_deployer import HuggingFaceModelDeployer +from zenml import get_step_context, step +from zenml.client import Client +from zenml.logger import get_logger logger = get_logger(__name__) @@ -42,9 +42,9 @@ def deploy_model_to_hf_hub(hf_endpoint_cfg: Optional[Dict] = None) -> None: secret = Client().get_secret("huggingface_creds") hf_token = secret.secret_values["token"] - commit_info = get_step_context().model.run_metadata[ - "merged_model_commit_info" - ].value + commit_info = ( + get_step_context().model.run_metadata["merged_model_commit_info"].value + ) model_namespace, repository, revision = parse_huggingface_url(commit_info) diff --git a/llm-finetuning/steps/parallel_clones.py b/llm-finetuning/steps/parallel_clones.py index 896b62d3..8417000d 100644 --- a/llm-finetuning/steps/parallel_clones.py +++ b/llm-finetuning/steps/parallel_clones.py @@ -8,10 +8,11 @@ import os import subprocess from multiprocessing import Pool -from zenml import step -from zenml.client import Client from typing import List + from typing_extensions import Annotated +from zenml import step +from zenml.client import Client ORG = "zenml-io" MIRROR_DIRECTORY = "cloned_public_repos" @@ -27,7 +28,9 @@ def mirror_repository(repository): @step -def mirror_repositories(repositories: List[str]) -> Annotated[str, "mirror_directory"]: +def mirror_repositories( + repositories: List[str], +) -> Annotated[str, "mirror_directory"]: """Locally clones a list of repositories. Args: @@ -44,17 +47,21 @@ def mirror_repositories(repositories: List[str]) -> Annotated[str, "mirror_direc gh_access_token = None gh_access_token = os.getenv("GH_ACCESS_TOKEN", None) client = Client() - + # Try to get the access token from the ZenML client try: - gh_access_token = client.get_secret("GH_ACCESS_TOKEN").secret_values["token"] + gh_access_token = client.get_secret("GH_ACCESS_TOKEN").secret_values[ + "token" + ] except KeyError: pass - + # Raise an error if the access token is not found if gh_access_token is None: - raise ValueError("Please set the GH_ACCESS_TOKEN environment variable.") - + raise ValueError( + "Please set the GH_ACCESS_TOKEN environment variable." 
+ ) + # Get the list of repositories in the organization print(f"Total repositories found: {len(repositories)}.") diff --git a/llm-finetuning/steps/prepare_dataset.py b/llm-finetuning/steps/prepare_dataset.py index 0319d73e..8e042fb2 100644 --- a/llm-finetuning/steps/prepare_dataset.py +++ b/llm-finetuning/steps/prepare_dataset.py @@ -6,12 +6,13 @@ """ import os +from typing import Dict + import pandas as pd -from nbformat import reads, NO_CONVERT -from tqdm import tqdm from datasets import Dataset -from typing import Dict from huggingface_hub import HfApi +from nbformat import NO_CONVERT, reads +from tqdm import tqdm from zenml import step from zenml.client import Client @@ -60,12 +61,15 @@ def upload_to_hub(df: pd.DataFrame, dataset_id: str) -> str: token = secret.secret_values["token"] api = HfApi(token=token) - repo_id = api.create_repo(repo_id=dataset_id, exist_ok=True, repo_type="dataset").repo_id + repo_id = api.create_repo( + repo_id=dataset_id, exist_ok=True, repo_type="dataset" + ).repo_id dataset = Dataset.from_pandas(df) dataset.push_to_hub(dataset_id, token=token) return repo_id + def filter_code_cell(cell) -> bool: """Filters a code cell w.r.t shell commands, etc.""" only_shell = cell["source"].startswith("!") @@ -75,6 +79,7 @@ def filter_code_cell(cell) -> bool: else: return True + def process_file(directory_name: str, file_path: str) -> Dict[str, str]: """Processes a single file.""" try: @@ -115,7 +120,8 @@ def read_repository_files(directory) -> pd.DataFrame: for file in files: file_path = os.path.join(root, file) if not file_path.endswith(ANTI_FORMATS) and all( - k not in file_path for k in [".git", "__pycache__", "xcodeproj"] + k not in file_path + for k in [".git", "__pycache__", "xcodeproj"] ): file_paths.append((os.path.dirname(root), file_path)) @@ -134,7 +140,9 @@ def read_repository_files(directory) -> pd.DataFrame: @step(enable_cache=False) -def prepare_dataset(mirror_directory: str, dataset_id: str = "zenml-codegen-v1"): +def prepare_dataset( + mirror_directory: str, dataset_id: str = "zenml-codegen-v1" +): df = read_repository_files(mirror_directory) repo_id = upload_to_hub(df, dataset_id) return repo_id diff --git a/llm-finetuning/steps/push_dataset_to_hub.py b/llm-finetuning/steps/push_dataset_to_hub.py index 2ae1b542..16de5dcc 100644 --- a/llm-finetuning/steps/push_dataset_to_hub.py +++ b/llm-finetuning/steps/push_dataset_to_hub.py @@ -4,16 +4,18 @@ Based off Sayak Paul (https://github.com/sayakpaul) and Sourab Mangrulkar (https://github.com/pacman100) codebase: https://github.com/pacman100/DHS-LLM-Workshop/tree/main/ All credit to them for their amazing work! """ -from huggingface_hub import HfApi +import glob + +import pandas as pd from datasets import Dataset +from huggingface_hub import HfApi from tqdm import tqdm -import pandas as pd -import glob from zenml import step from zenml.client import Client FEATHER_FORMAT = "*.ftr" + @step def push_to_hub(repo_id: str, dataset_id: str): """Pushes the dataset to the Hugging Face Hub. 
@@ -25,9 +27,11 @@ def push_to_hub(repo_id: str, dataset_id: str): secret = Client().get_secret("huggingface_creds") token = secret.secret_values["token"] api = HfApi(token=token) - + folder_path = api.snapshot_download( - repo_id=repo_id, allow_patterns=f"*.{FEATHER_FORMAT}", repo_type="dataset" + repo_id=repo_id, + allow_patterns=f"*.{FEATHER_FORMAT}", + repo_type="dataset", ) feather_files = glob.glob(f"{folder_path}/raw_csvs/*.{FEATHER_FORMAT}") print(folder_path, len(feather_files)) diff --git a/llm-finetuning/steps/trainer.py b/llm-finetuning/steps/trainer.py index 6d52dd6b..2f053a97 100644 --- a/llm-finetuning/steps/trainer.py +++ b/llm-finetuning/steps/trainer.py @@ -5,56 +5,38 @@ All credit to them for their amazing work! """ -from pydantic import BaseModel -from typing import Optional +import functools import os import random -from zenml import step -from zenml import ArtifactConfig -from typing_extensions import Annotated +from typing import Optional, Tuple + import numpy as np import torch from datasets import load_dataset -from torch.utils.data import IterableDataset -from zenml.client import Client -from zenml import log_model_metadata +from huggingface_hub import login from materializers import HFTrainerMaterializer -from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast -from typing import Tuple -from zenml import save_artifact -from tqdm import tqdm -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - Trainer, - TrainingArguments, - set_seed, - BitsAndBytesConfig, +from peft import ( + LoraConfig, + PeftModel, + get_peft_model, + prepare_model_for_kbit_training, ) -from huggingface_hub import login - -from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from peft.tuners.lora import LoraLayer - -import functools - -import numpy as np -import os - -from dataclasses import dataclass, field -from typing import Optional - -import torch -from datasets import load_dataset -from peft import LoraConfig +from pydantic import BaseModel +from torch.utils.data import IterableDataset +from tqdm import tqdm from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, - AutoTokenizer, + Trainer, TrainingArguments, + set_seed, ) -from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel +from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast +from typing_extensions import Annotated +from zenml import ArtifactConfig, log_model_metadata, save_artifact, step +from zenml.client import Client # this is expensive so we cache it @@ -522,7 +504,7 @@ def run_training(args: Configuration, train_data, val_data, hf_token): except Exception as e: print(str(e)) print("Skipped saving final checkpoint to ZenML") - pass + pass if is_deepspeed_peft_enabled: trainer.accelerator.wait_for_everyone() @@ -542,28 +524,38 @@ def run_training(args: Configuration, train_data, val_data, hf_token): try: if args.push_to_hub: commit_info = trainer.push_to_hub() - log_model_metadata(metadata={"trainer_commit_info": str(commit_info)}) + log_model_metadata( + metadata={"trainer_commit_info": str(commit_info)} + ) else: trainer.save_model(args.output_dir) trainer.accelerator.print(f"Model saved to {args.output_dir}") if args.push_to_hub: - commit_info = trainer.model.push_to_hub(repo_id=args.output_peft_repo_id, token=hf_token) - log_model_metadata(metadata={"model_commit_info": str(commit_info)}) - except Exception as e: + commit_info = trainer.model.push_to_hub( + 
repo_id=args.output_peft_repo_id, token=hf_token + ) + log_model_metadata( + metadata={"model_commit_info": str(commit_info)} + ) + except Exception as e: print("Exception while pushing or saving") print(str(e)) - pass + pass return trainer @step -def merge_and_push(peft_model_id: str, base_model_name: str = "bigcode/starcoder"): +def merge_and_push( + peft_model_id: str, base_model_name: str = "bigcode/starcoder" +): secret = Client().get_secret("huggingface_creds") hf_token = secret.secret_values["token"] login(token=hf_token) os.environ["CUDA_VISIBLE_DEVICES"] = "0" - tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + base_model_name, trust_remote_code=True + ) model = AutoModelForCausalLM.from_pretrained( base_model_name, quantization_config=None, @@ -576,15 +568,21 @@ def merge_and_push(peft_model_id: str, base_model_name: str = "bigcode/starcoder if not hasattr(model, "hf_device_map"): model.cuda() - peft_model = PeftModel.from_pretrained(model, peft_model_id, adapter_name="personal_copilot") - peft_model.add_weighted_adapter(["personal_copilot"], [0.8], "best_personal_copilot") + peft_model = PeftModel.from_pretrained( + model, peft_model_id, adapter_name="personal_copilot" + ) + peft_model.add_weighted_adapter( + ["personal_copilot"], [0.8], "best_personal_copilot" + ) peft_model.set_adapter("best_personal_copilot") final_model = peft_model.merge_and_unload() final_model.eval() model_id_merged = f"{peft_model_id}-merged" commit_info = tokenizer.push_to_hub(model_id_merged, token=hf_token) - log_model_metadata(metadata={"merged_tokenizer_commit_info": str(commit_info)}) + log_model_metadata( + metadata={"merged_tokenizer_commit_info": str(commit_info)} + ) commit_info = final_model.push_to_hub(model_id_merged, token=hf_token) log_model_metadata(metadata={"merged_model_commit_info": str(commit_info)}) @@ -593,8 +591,13 @@ def merge_and_push(peft_model_id: str, base_model_name: str = "bigcode/starcoder def trainer( args: Configuration, ) -> Tuple[ - Annotated[Trainer, ArtifactConfig(name="trainer_obj", is_model_artifact=True)], - Annotated[GPT2TokenizerFast, ArtifactConfig(name="tokenizer_obj", is_model_artifact=True)], + Annotated[ + Trainer, ArtifactConfig(name="trainer_obj", is_model_artifact=True) + ], + Annotated[ + GPT2TokenizerFast, + ArtifactConfig(name="tokenizer_obj", is_model_artifact=True), + ], Annotated[str, "peft_model_id"], Annotated[ConstantLengthDataset, "train_dataset"], Annotated[ConstantLengthDataset, "eval_dataset"], @@ -606,7 +609,7 @@ def trainer( secret = Client().get_secret("huggingface_creds") hf_token = secret.secret_values["token"] login(token=hf_token) - + print("Loading tokenizer...") os.makedirs(args.output_dir, exist_ok=True) tokenizer = AutoTokenizer.from_pretrained( @@ -615,8 +618,14 @@ def trainer( print("Creating a dataset...") train_dataset, eval_dataset = create_datasets(tokenizer, args) - + print("Creating a training...") trainer_obj = run_training(args, train_dataset, eval_dataset, hf_token) - return trainer_obj, tokenizer, args.output_peft_repo_id, train_dataset, eval_dataset + return ( + trainer_obj, + tokenizer, + args.output_peft_repo_id, + train_dataset, + eval_dataset, + ) diff --git a/llm-finetuning/test_starcoder_bigcode.py b/llm-finetuning/test_starcoder_bigcode.py index 2dabea08..bcdd00ad 100644 --- a/llm-finetuning/test_starcoder_bigcode.py +++ b/llm-finetuning/test_starcoder_bigcode.py @@ -14,7 +14,7 @@ def tf_mnist_pipeline(epochs: int = 5, lr: float = 
0.001): """Links all the steps together in a pipeline.""" # Link all the steps together by calling them and passing the output # of one step as the input - + # x_train, x_test, y_train, y_test = RandomSplit(test_size=0.2)( # dataset=iris_data_loader() # ) @@ -25,4 +25,6 @@ def tf_mnist_pipeline(epochs: int = 5, lr: float = 0.001): dataset=iris_data_loader() ) model = TFFeed(epochs=epochs, lr=lr)( - x_train=x_train \ No newline at end of file + x_train=x_train + + diff --git a/llm-finetuning/test_zencoder.py b/llm-finetuning/test_zencoder.py index e3184cad..81703268 100644 --- a/llm-finetuning/test_zencoder.py +++ b/llm-finetuning/test_zencoder.py @@ -1,27 +1,33 @@ -# Write a zenml pipeline that loads sklearn iris dataset and builds a sklearn classifier +# Write a zenml pipeline that loads sklearn iris dataset and builds a sklearn classifier -from zenml import pipeline, step from sklearn.datasets import load_iris from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split +from zenml import pipeline, step + @step def importer() -> pd.DataFrame: """Load the iris dataset.""" - df = load_iris(as_frame=True)["data"] + df = load_iris(as_frame=True)["data"] return df + @step def trainer(df: pd.DataFrame) -> Any: """Train a model on the dataset.""" - X_train, X_test, y_train, y_test = train_test_split( - df.to_numpy()[:, :2], df.to_numpy()[:, 2], test_size=0.2, random_state=42 + X_train, X_test, y_train, y_test = train_test_split( + df.to_numpy()[:, :2], + df.to_numpy()[:, 2], + test_size=0.2, + random_state=42, ) model = RandomForestClassifier(n_estimators=10) model.fit(X_train, y_train) return model + @pipeline def sklearn_pipeline(): df = importer() - model = trainer(df) \ No newline at end of file + model = trainer(df) diff --git a/llm-lora-finetuning/finetune/adapter.py b/llm-lora-finetuning/finetune/adapter.py index ada08236..acf8f6d4 100644 --- a/llm-lora-finetuning/finetune/adapter.py +++ b/llm-lora-finetuning/finetune/adapter.py @@ -35,6 +35,7 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/finetune/adapter_v2.py b/llm-lora-finetuning/finetune/adapter_v2.py index f03aeccd..ac7de327 100644 --- a/llm-lora-finetuning/finetune/adapter_v2.py +++ b/llm-lora-finetuning/finetune/adapter_v2.py @@ -35,6 +35,7 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/finetune/full.py b/llm-lora-finetuning/finetune/full.py index e263323f..02e28a72 100644 --- a/llm-lora-finetuning/finetune/full.py +++ b/llm-lora-finetuning/finetune/full.py @@ -29,6 +29,7 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/finetune/lora.py b/llm-lora-finetuning/finetune/lora.py index 5213190f..39caa06e 100644 --- a/llm-lora-finetuning/finetune/lora.py +++ b/llm-lora-finetuning/finetune/lora.py @@ -35,6 +35,7 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/generate/adapter.py b/llm-lora-finetuning/generate/adapter.py index bc2c425f..3daa8836 100644 --- a/llm-lora-finetuning/generate/adapter.py +++ b/llm-lora-finetuning/generate/adapter.py @@ -22,6 +22,7 @@ get_default_supported_precision, lazy_load, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/generate/adapter_v2.py b/llm-lora-finetuning/generate/adapter_v2.py index e25d0cfb..6f9d76d4 100644 --- 
a/llm-lora-finetuning/generate/adapter_v2.py +++ b/llm-lora-finetuning/generate/adapter_v2.py @@ -22,6 +22,7 @@ get_default_supported_precision, lazy_load, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/generate/full.py b/llm-lora-finetuning/generate/full.py index a9e8d19b..cc1da495 100644 --- a/llm-lora-finetuning/generate/full.py +++ b/llm-lora-finetuning/generate/full.py @@ -21,6 +21,7 @@ get_default_supported_precision, load_checkpoint, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/generate/lora.py b/llm-lora-finetuning/generate/lora.py index f84f39db..0b30b701 100644 --- a/llm-lora-finetuning/generate/lora.py +++ b/llm-lora-finetuning/generate/lora.py @@ -22,6 +22,7 @@ get_default_supported_precision, lazy_load, ) + from scripts.prepare_alpaca import generate_prompt diff --git a/llm-lora-finetuning/lit_gpt/config.py b/llm-lora-finetuning/lit_gpt/config.py index 1dbef24d..dab1523b 100644 --- a/llm-lora-finetuning/lit_gpt/config.py +++ b/llm-lora-finetuning/lit_gpt/config.py @@ -54,9 +54,9 @@ class Config: shared_attention_norm: bool = False _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" norm_eps: float = 1e-5 - _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = ( - "GptNeoxMLP" - ) + _mlp_class: Literal[ + "GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE" + ] = "GptNeoxMLP" gelu_approximate: str = "none" intermediate_size: Optional[int] = None rope_condense_ratio: int = 1 diff --git a/llm-lora-finetuning/lit_gpt/lora.py b/llm-lora-finetuning/lit_gpt/lora.py index 84d42543..6e9274e1 100644 --- a/llm-lora-finetuning/lit_gpt/lora.py +++ b/llm-lora-finetuning/lit_gpt/lora.py @@ -373,9 +373,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: ) # (4096, 256) return result.view( (*x.shape[:-1], self.linear.out_features) - ).transpose( - 0, 1 - ) # (64, 64, 384) + ).transpose(0, 1) # (64, 64, 384) def conv1d( self, input: torch.Tensor, weight: torch.Tensor diff --git a/llm-lora-finetuning/pipelines/evaluate.py b/llm-lora-finetuning/pipelines/evaluate.py index c4c41cb1..d1303944 100644 --- a/llm-lora-finetuning/pipelines/evaluate.py +++ b/llm-lora-finetuning/pipelines/evaluate.py @@ -15,11 +15,10 @@ # limitations under the License. # +from steps import evaluate from zenml import pipeline from zenml.config import DockerSettings -from steps import evaluate - @pipeline( settings={ diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py index e8e0bec8..c08c5e9a 100644 --- a/llm-lora-finetuning/pipelines/feature_engineering.py +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -15,11 +15,10 @@ # limitations under the License. 
# +from steps import feature_engineering from zenml import pipeline from zenml.config import DockerSettings -from steps import feature_engineering - @pipeline( settings={ diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 024c22f4..e61d0a88 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -17,11 +17,10 @@ from typing import Optional +from steps import finetune from zenml import get_pipeline_context, pipeline from zenml.config import DockerSettings -from steps import finetune - @pipeline( settings={ diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index 507e19df..a034c143 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -15,11 +15,10 @@ # limitations under the License. # +from steps import merge from zenml import pipeline from zenml.config import DockerSettings -from steps import merge - @pipeline( settings={ diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py index 18826797..c8faf2a8 100644 --- a/llm-lora-finetuning/run.py +++ b/llm-lora-finetuning/run.py @@ -19,14 +19,13 @@ from typing import Optional import click -from zenml.logger import get_logger - from pipelines import ( llm_lora_evaluation, llm_lora_feature_engineering, llm_lora_finetuning, llm_lora_merging, ) +from zenml.logger import get_logger logger = get_logger(__name__) diff --git a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py index 8dd440ca..1239e7d2 100644 --- a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py @@ -17,6 +17,7 @@ from lit_gpt import Config from lit_gpt.utils import CLI, incremental_save, lazy_load + from scripts.convert_hf_checkpoint import layer_template, load_param diff --git a/llm-lora-finetuning/scripts/prepare_dolly.py b/llm-lora-finetuning/scripts/prepare_dolly.py index 94f6659c..8bb43439 100644 --- a/llm-lora-finetuning/scripts/prepare_dolly.py +++ b/llm-lora-finetuning/scripts/prepare_dolly.py @@ -17,6 +17,7 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing diff --git a/llm-lora-finetuning/scripts/prepare_flan.py b/llm-lora-finetuning/scripts/prepare_flan.py index d8285ab3..1e6b45c5 100644 --- a/llm-lora-finetuning/scripts/prepare_flan.py +++ b/llm-lora-finetuning/scripts/prepare_flan.py @@ -16,6 +16,7 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing diff --git a/llm-lora-finetuning/scripts/prepare_longform.py b/llm-lora-finetuning/scripts/prepare_longform.py index 115021a0..6327bad8 100644 --- a/llm-lora-finetuning/scripts/prepare_longform.py +++ b/llm-lora-finetuning/scripts/prepare_longform.py @@ -16,6 +16,7 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing diff --git a/llm-lora-finetuning/steps/evaluate.py b/llm-lora-finetuning/steps/evaluate.py index 5b86a2e4..6173a575 100644 --- a/llm-lora-finetuning/steps/evaluate.py +++ b/llm-lora-finetuning/steps/evaluate.py @@ -21,13 +21,13 @@ from typing import Any, Dict, List, Literal, Optional import torch +from evaluate.lm_eval_harness import run_eval_harness from huggingface_hub import snapshot_download from pydantic import BaseModel from typing_extensions import 
Annotated from zenml import step from zenml.logger import get_logger -from evaluate.lm_eval_harness import run_eval_harness from scripts.download import download_from_hub from scripts.merge_lora import merge_lora from steps.params import LoraParameters diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index f778e714..b0483216 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -21,12 +21,12 @@ from pathlib import Path from typing import Any, Dict +from lit_gpt import Config +from materializers.directory_materializer import DirectoryMaterializer from pydantic import BaseModel from typing_extensions import Annotated from zenml import log_artifact_metadata, step -from lit_gpt import Config -from materializers.directory_materializer import DirectoryMaterializer from scripts.download import download_from_hub from steps.utils import get_huggingface_access_token diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 8763ddf7..6168c263 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -20,16 +20,16 @@ from typing import Literal, Optional import torch +from finetune.lora import setup from huggingface_hub import upload_folder +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from materializers.directory_materializer import DirectoryMaterializer from pydantic import BaseModel from typing_extensions import Annotated from zenml import get_step_context, log_model_metadata, step from zenml.logger import get_logger from zenml.materializers import BuiltInMaterializer -from finetune.lora import setup -from lit_gpt.args import EvalArgs, IOArgs, TrainArgs -from materializers.directory_materializer import DirectoryMaterializer from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora diff --git a/nba-pipeline/steps/splitter.py b/nba-pipeline/steps/splitter.py index 6aa7cdf1..b07e4079 100644 --- a/nba-pipeline/steps/splitter.py +++ b/nba-pipeline/steps/splitter.py @@ -105,9 +105,9 @@ class SplitConfig(BaseParameters): @step -def date_based_splitter( - dataset: pd.DataFrame, config: SplitConfig -) -> Output(before=pd.DataFrame, after=pd.DataFrame): +def date_based_splitter(dataset: pd.DataFrame, config: SplitConfig) -> Output( + before=pd.DataFrame, after=pd.DataFrame +): """Splits data for drift detection.""" cols = config.columns if config.columns else dataset.columns dataset["GAME_DATE"] = pd.to_datetime(dataset["GAME_DATE"]) diff --git a/pyproject.toml b/pyproject.toml index 55cb4e6c..083e3f39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,36 +8,91 @@ authors = ["ZenML CodeMonkey "] python = ">=3.7.0,<3.9.0" [tool.poetry.dev-dependencies] -black = "^21.9b0" -isort = "^5.9.3" pytest = "^6.2.5" darglint = "1.8.1" -ruff = "^0.0.254" +ruff = ">=0.0.254" pyspelling = "^2.8.2" -mypy = "^1.0.0" +mypy = ">=1.0.0" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -[tool.isort] -profile = "black" -known_third_party = [] -skip_glob = [] -line_length = 79 - -[tool.black] +[tool.ruff] line-length = 79 -include = '\.pyi?$' -exclude = ''' -/( - \.git -| \.hg -| \.mypy_cache -| \.tox -| \.venv -| _build -| buck-out -| build -)/ -''' +# Exclude a variety of commonly ignored directories. 
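+# (entries are gitignore-style glob patterns, resolved relative to the project root)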
+exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + ".test_durations", + "build", + "dist", + "node_modules", + "venv", + "__init__.py", +] +# use Python 3.8 as the minimum version for autofixing +target-version = "py38" + + +[tool.ruff.format] +exclude = [ + "*.git", + "*.hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", +] + +[tool.ruff.lint] +# Disable autofix for unused imports (`F401`). +unfixable = ["F401"] +per-file-ignores = {} +ignore-init-module-imports = true +ignore = [ + "E501", + "F401", + "F403", + "D301", + "D401", + "D403", + "D407", + "D213", + "D203", + "S101", + "S104", + "S105", + "S106", + "S107", +] +select = ["D", "E", "F", "I", "I001", "Q"] + +[tool.ruff.lint.flake8-import-conventions.aliases] +altair = "alt" +"matplotlib.pyplot" = "plt" +numpy = "np" +pandas = "pd" +seaborn = "sns" + +[tool.ruff.lint.mccabe] +max-complexity = 18 + +[tool.ruff.lint.pydocstyle] +# Use Google-style docstrings. +convention = "google" diff --git a/scripts/format.sh b/scripts/format.sh index bc8e09a3..c944f607 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -1,14 +1,23 @@ -#!/bin/sh -e +#!/usr/bin/env bash set -x -SRC=${1:-"."} +# Initialize default source directories +default_src="." +# Collect any source directories passed as arguments +SRC="$*" + +# If no source directories were provided, use the default +if [ -z "$SRC" ]; then + SRC="$default_src" +fi export ZENML_DEBUG=1 export ZENML_ANALYTICS_OPT_IN=false # autoflake replacement: removes unused imports and variables -ruff $SRC --select F401,F841 --fix --exclude "__init__.py" --isolated +ruff check $SRC --select F401,F841 --fix --exclude "__init__.py" --exclude "llm-finetuning/" --exclude "sign-language-detection-yolov5/model.py" --isolated # sorts imports -ruff $SRC --select I --fix --ignore D -black $SRC +ruff check $SRC --exclude "llm-finetuning/" --exclude "sign-language-detection-yolov5/model.py" --select I --fix --ignore D +ruff format $SRC --exclude "sign-language-detection-yolov5/model.py" --exclude "llm-finetuning/" + diff --git a/sign-language-detection-yolov5/model.py b/sign-language-detection-yolov5/model.py index c393a817..bd7982c7 100644 --- a/sign-language-detection-yolov5/model.py +++ b/sign-language-detection-yolov5/model.py @@ -95,9 +95,9 @@ def forward(self, imgs): ) # mypy: ignore p = Path(p) # to Path s += "%gx%g " % im.shape[2:] # print string - torch.tensor(im0.shape)[ # mypy: ignore + torch.tensor(im0.shape)[ [1, 0, 1, 0] ] # mypy: ignore annotator = Annotator(im0, line_width=1, example=str(names)) if len(det): # Rescale boxes from img_size to im0 size
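A quick local sanity check for the reworked scripts/format.sh (a sketch: the directory arguments below are illustrative, and with none given the script falls back to "."):

    ./scripts/format.sh
    ./scripts/format.sh classifier-e2e llm-agents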