diff --git a/.github/workflows/production_run_complete_llm.yml b/.github/workflows/production_run_complete_llm.yml index 354c9f9e..6c8ff248 100644 --- a/.github/workflows/production_run_complete_llm.yml +++ b/.github/workflows/production_run_complete_llm.yml @@ -11,11 +11,11 @@ concurrency: cancel-in-progress: true jobs: - run-staging-workflow: + run-production-workflow: runs-on: ubuntu-latest env: - ZENML_HOST: ${{ secrets.ZENML_PROJECTS_HOST }} - ZENML_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} + ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} + ZENML_STORE_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} ZENML_PRODUCTION_STACK: b3951d43-0fb2-4d32-89c5-3399374e7c7e # Set this to your production stack ID ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} ZENML_GITHUB_URL_PR: ${{ github.event.pull_request._links.html.href }} @@ -38,15 +38,16 @@ jobs: - name: Install requirements working-directory: ./llm-complete-guide run: | - pip3 install -r requirements.txt - pip3 install -r requirements-argilla.txt - zenml integration install gcp -y + pip3 install uv + uv pip install -r requirements.txt --system + uv pip install -r requirements-argilla.txt --system + zenml integration install gcp -y --uv - name: Connect to ZenML server working-directory: ./llm-complete-guide run: | zenml init - zenml connect --url $ZENML_HOST --api-key $ZENML_API_KEY + zenml connect --url $ZENML_STORE_URL --api-key $ZENML_STORE_API_KEY - name: Set stack (Production) working-directory: ./llm-complete-guide @@ -56,4 +57,4 @@ jobs: - name: Run pipeline, create pipeline, configure trigger (Production) working-directory: ./llm-complete-guide run: | - python gh_action_rag.py --no-cache --create-template ----event-source-id --service-account-id ${{ env.ZENML_SERVICE_ACCOUNT_ID }} --action-id ${{ env.ZENML_ACTION_ID }} --config rag_gcp.yaml \ No newline at end of file + python gh_action_rag.py --no-cache --create-template ----event-source-id --service-account-id ${{ 
env.ZENML_SERVICE_ACCOUNT_ID }} --action-id ${{ env.ZENML_ACTION_ID }} --config production/rag.yaml --zenml-model-version production \ No newline at end of file diff --git a/.github/workflows/staging_run_complete_llm.yml b/.github/workflows/staging_run_complete_llm.yml index 57125f84..b43911a1 100644 --- a/.github/workflows/staging_run_complete_llm.yml +++ b/.github/workflows/staging_run_complete_llm.yml @@ -12,8 +12,8 @@ jobs: run-staging-workflow: runs-on: ubuntu-latest env: - ZENML_HOST: ${{ secrets.ZENML_PROJECTS_HOST }} - ZENML_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} + ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} + ZENML_STORE_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} ZENML_STAGING_STACK : 67166d73-a44e-42f9-b67f-011e9afab9b5 # Set this to your staging stack ID ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} ZENML_GITHUB_URL_PR: ${{ github.event.pull_request._links.html.href }} @@ -34,15 +34,16 @@ jobs: - name: Install requirements working-directory: ./llm-complete-guide run: | - pip3 install -r requirements.txt - pip3 install -r requirements-argilla.txt - zenml integration install aws s3 -y + pip3 install uv + uv pip install -r requirements.txt --system + uv pip install -r requirements-argilla.txt --system + zenml integration install aws s3 -y --uv - name: Connect to ZenML server working-directory: ./llm-complete-guide run: | zenml init - zenml connect --url $ZENML_HOST --api-key $ZENML_API_KEY + zenml connect --url $ZENML_STORE_URL --api-key $ZENML_STORE_API_KEY - name: Set stack (Staging) working-directory: ./llm-complete-guide @@ -52,4 +53,4 @@ jobs: - name: Run pipeline (Staging) working-directory: ./llm-complete-guide run: | - python gh_action_rag.py --no-cache --config rag_local_dev.yaml \ No newline at end of file + python gh_action_rag.py --no-cache --config staging/rag.yaml --zenml-model-version staging \ No newline at end of file diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index 
5e5844c4..75f7586e 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -23,7 +23,7 @@ instructions are provided below for how to set that up. ## 📽️ Watch the webinars -We've recently been holding some webinars about this repository and project. Watche the videos below if you want an introduction and context around the code and ideas covered in this project. +We've recently been holding some webinars about this repository and project. Watch the videos below if you want an introduction and context around the code and ideas covered in this project. [![Building and Optimizing RAG Pipelines: Data Preprocessing, Embeddings, and Evaluation with ZenML](https://github.com/user-attachments/assets/1aea2bd4-8079-4ea2-98e1-8da6ba9aeebe)](https://www.youtube.com/watch?v=PazRMY8bo3U) @@ -45,7 +45,7 @@ pip install -r requirements.txt Depending on your hardware you may run into some issues when running the `pip install` command with the `flash_attn` package. In that case running `FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation` -could help you. +could help you. You may also need to install torch separately. In order to use the default LLM for this query, you'll need an account and an API key from OpenAI specified as a ZenML secret: @@ -85,7 +85,7 @@ to run the pipelines in the correct order. You can run the script with the following command: ```shell -python run.py --rag +python run.py rag ``` This will run the basic RAG pipeline, which scrapes the ZenML documentation and @@ -100,7 +100,7 @@ use for the LLM. When you're ready to make the query, run the following command: ```shell -python run.py --query "how do I use a custom materializer inside my own zenml steps? i.e. how do I set it? inside the @step decorator?" --model=gpt4 +python run.py query "how do I use a custom materializer inside my own zenml steps? i.e. how do I set it? inside the @step decorator?" 
--model=gpt4 ``` Alternative options for LLMs to use include: @@ -164,7 +164,7 @@ the RAG pipeline. To run the evaluation pipeline, you can use the following command: ```shell -python run.py --evaluation +python run.py evaluation ``` You'll need to have first run the RAG pipeline to have the necessary assets in @@ -182,7 +182,7 @@ To run the `distilabel` synthetic data generation pipeline, you can use the foll ```shell pip install -r requirements-argilla.txt # special requirements -python run.py --synthetic +python run.py synthetic ``` You will also need to have set up and connected to an Argilla instance for this @@ -221,7 +221,7 @@ commands: ```shell pip install -r requirements-argilla.txt # special requirements -python run.py --embeddings +python run.py embeddings ``` *Credit to Phil Schmid for his [tutorial on embeddings finetuning with Matryoshka diff --git a/llm-complete-guide/ZENML_VERSION.txt b/llm-complete-guide/ZENML_VERSION.txt new file mode 100644 index 00000000..65529192 --- /dev/null +++ b/llm-complete-guide/ZENML_VERSION.txt @@ -0,0 +1 @@ +0.68.1 \ No newline at end of file diff --git a/llm-complete-guide/configs/embeddings.yaml b/llm-complete-guide/configs/dev/embeddings.yaml similarity index 64% rename from llm-complete-guide/configs/embeddings.yaml rename to llm-complete-guide/configs/dev/embeddings.yaml index 6fb2cc20..821b1069 100644 --- a/llm-complete-guide/configs/embeddings.yaml +++ b/llm-complete-guide/configs/dev/embeddings.yaml @@ -3,7 +3,7 @@ # environment configuration settings: docker: - parent_image: "zenmldocker/prepare-release:base-0.68.0" + python_package_installer: "uv" requirements: - langchain-community - ratelimit @@ -27,14 +27,6 @@ settings: - datasets - torch - pygithub + - openai environment: - ZENML_PROJECT_SECRET_NAME: llm_complete - - -# configuration of the Model Control Plane -model: - name: finetuned-zenml-docs-embeddings - version: latest - license: Apache 2.0 - description: Finetuned LLM on ZenML docs - tags: ["rag", 
"finetuned"] \ No newline at end of file + ZENML_PROJECT_SECRET_NAME: llm_complete \ No newline at end of file diff --git a/llm-complete-guide/configs/rag_local_dev.yaml b/llm-complete-guide/configs/dev/rag.yaml similarity index 65% rename from llm-complete-guide/configs/rag_local_dev.yaml rename to llm-complete-guide/configs/dev/rag.yaml index 334044b5..bfeaebaa 100644 --- a/llm-complete-guide/configs/rag_local_dev.yaml +++ b/llm-complete-guide/configs/dev/rag.yaml @@ -19,16 +19,9 @@ settings: ZENML_PROJECT_SECRET_NAME: llm_complete ZENML_ENABLE_RICH_TRACEBACK: FALSE ZENML_LOGGING_VERBOSITY: INFO - - -# configuration of the Model Control Plane -model: - name: finetuned-zenml-docs-embeddings - license: Apache 2.0 - description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] - + python_package_installer: "uv" steps: url_scraper: parameters: - docs_url: https://docs.zenml.io/stack-components/orchestrators + docs_url: https://docs.zenml.io/ + use_dev_set: true diff --git a/llm-complete-guide/configs/rag_eval.yaml b/llm-complete-guide/configs/dev/rag_eval.yaml similarity index 58% rename from llm-complete-guide/configs/rag_eval.yaml rename to llm-complete-guide/configs/dev/rag_eval.yaml index 6116f3bc..4ac3c94b 100644 --- a/llm-complete-guide/configs/rag_eval.yaml +++ b/llm-complete-guide/configs/dev/rag_eval.yaml @@ -13,10 +13,4 @@ settings: - psycopg2-binary - tiktoken - pygithub - -# configuration of the Model Control Plane -model: - name: finetuned-zenml-docs-embeddings - license: Apache 2.0 - description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] \ No newline at end of file + python_package_installer: "uv" diff --git a/llm-complete-guide/configs/synthetic.yaml b/llm-complete-guide/configs/dev/synthetic.yaml similarity index 74% rename from llm-complete-guide/configs/synthetic.yaml rename to llm-complete-guide/configs/dev/synthetic.yaml index 6b052429..5dafdfcc 100644 --- a/llm-complete-guide/configs/synthetic.yaml +++ 
b/llm-complete-guide/configs/dev/synthetic.yaml @@ -25,14 +25,7 @@ settings: - torch - distilabel - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete - - -# configuration of the Model Control Plane -model: - name: finetuned-zenml-docs-embeddings - version: latest - license: Apache 2.0 - description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] + python_package_installer: "uv" \ No newline at end of file diff --git a/llm-complete-guide/configs/production/embeddings.yaml b/llm-complete-guide/configs/production/embeddings.yaml new file mode 100644 index 00000000..0862735c --- /dev/null +++ b/llm-complete-guide/configs/production/embeddings.yaml @@ -0,0 +1,37 @@ +# enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - ratelimit + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers[torch]==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - pygithub + - openai + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + python_package_installer: "uv" + +steps: + finetune: + step_operator: "gcp_a100" + settings: + step_operator.vertex: + accelerator_count: 1 + accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file diff --git a/llm-complete-guide/configs/production/eval.yaml b/llm-complete-guide/configs/production/eval.yaml new file mode 100644 index 00000000..1786b3b8 --- /dev/null +++ b/llm-complete-guide/configs/production/eval.yaml @@ -0,0 +1,24 @@ +enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers[flashrank] + - matplotlib + - pillow + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: 
FALSE + ZENML_LOGGING_VERBOSITY: INFO + python_package_installer: "uv" diff --git a/llm-complete-guide/configs/rag_gcp.yaml b/llm-complete-guide/configs/production/rag.yaml similarity index 56% rename from llm-complete-guide/configs/rag_gcp.yaml rename to llm-complete-guide/configs/production/rag.yaml index 462b5790..3a8940b2 100644 --- a/llm-complete-guide/configs/rag_gcp.yaml +++ b/llm-complete-guide/configs/production/rag.yaml @@ -1,3 +1,5 @@ +enable_cache: True + # environment configuration settings: docker: @@ -17,25 +19,16 @@ settings: ZENML_PROJECT_SECRET_NAME: llm_complete ZENML_ENABLE_RICH_TRACEBACK: FALSE ZENML_LOGGING_VERBOSITY: INFO - + python_package_installer: "uv" steps: url_scraper: parameters: docs_url: https://docs.zenml.io - repo_url: https://github.com/zenml-io/zenml - website_url: https://zenml.io - + use_dev_set: false + enable_cache: true # generate_embeddings: -# step_operator: "terraform-gcp-6c0fd52233ca" +# step_operator: "sagemaker" # settings: -# step_operator.vertex: -# accelerator_type: "NVIDIA_TESLA_P100" +# step_operator.sagemaker: # accelerator_count: 1 -# machine_type: "n1-standard-8" - -# configuration of the Model Control Plane -model: - name: finetuned-zenml-docs-embeddings - license: Apache 2.0 - description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] \ No newline at end of file +# accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file diff --git a/llm-complete-guide/configs/production/synthetic.yaml b/llm-complete-guide/configs/production/synthetic.yaml new file mode 100644 index 00000000..98b0c506 --- /dev/null +++ b/llm-complete-guide/configs/production/synthetic.yaml @@ -0,0 +1,29 @@ +# environment configuration +settings: + docker: + requirements: + - ratelimit + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - 
rerankers[flashrank] + - datasets + - torch + - distilabel + - argilla + - pygithub + - openai + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + python_package_installer: "uv" diff --git a/llm-complete-guide/configs/staging/embeddings.yaml b/llm-complete-guide/configs/staging/embeddings.yaml new file mode 100644 index 00000000..3c191335 --- /dev/null +++ b/llm-complete-guide/configs/staging/embeddings.yaml @@ -0,0 +1,29 @@ +# enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - ratelimit + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers[torch]==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - pygithub + - openai + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + python_package_installer: "uv" diff --git a/llm-complete-guide/configs/staging/eval.yaml b/llm-complete-guide/configs/staging/eval.yaml new file mode 100644 index 00000000..1786b3b8 --- /dev/null +++ b/llm-complete-guide/configs/staging/eval.yaml @@ -0,0 +1,24 @@ +enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers[flashrank] + - matplotlib + - pillow + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: FALSE + ZENML_LOGGING_VERBOSITY: INFO + python_package_installer: "uv" diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml new file mode 100644 index 00000000..c5383fb0 --- /dev/null +++ b/llm-complete-guide/configs/staging/rag.yaml @@ -0,0 +1,30 @@ +enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - 
datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: FALSE + ZENML_LOGGING_VERBOSITY: INFO + python_package_installer: "uv" + parent_image: "339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml:llm_index_and_evaluate-orchestrator" + skip_build: true +steps: + url_scraper: + parameters: + docs_url: https://docs.zenml.io + use_dev_set: false + enable_cache: true diff --git a/llm-complete-guide/configs/staging/synthetic.yaml b/llm-complete-guide/configs/staging/synthetic.yaml new file mode 100644 index 00000000..98b0c506 --- /dev/null +++ b/llm-complete-guide/configs/staging/synthetic.yaml @@ -0,0 +1,29 @@ +# environment configuration +settings: + docker: + requirements: + - ratelimit + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - distilabel + - argilla + - pygithub + - openai + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + python_package_installer: "uv" diff --git a/llm-complete-guide/gh_action_rag.py b/llm-complete-guide/gh_action_rag.py index 4828b57d..bfd3dbd3 100644 --- a/llm-complete-guide/gh_action_rag.py +++ b/llm-complete-guide/gh_action_rag.py @@ -21,8 +21,9 @@ import click import yaml -from pipelines.llm_basic_rag import llm_basic_rag +from pipelines.llm_index_and_evaluate import llm_index_and_evaluate from zenml.client import Client +from zenml import Model from zenml.exceptions import ZenKeyError @@ -63,14 +64,30 @@ default=None, help="Specify an event source ID", ) +@click.option( + "--zenml-model-name", + "zenml_model_name", + default="zenml-docs-qa-chatbot", + help="Specify a ZenML model name", + required=False, +) +@click.option( + "--zenml-model-version", + 
"zenml_model_version", + default=None, + help="Specify a ZenML model version", + required=False, +) def main( no_cache: bool = False, config: Optional[str] = "rag_local_dev.yaml", create_template: bool = False, service_account_id: Optional[str] = None, event_source_id: Optional[str] = None, + zenml_model_name: Optional[str] = "zenml-docs-qa-rag", + zenml_model_version: Optional[str] = None, ): - """ + """ Executes the pipeline to train a basic RAG model. Args: @@ -80,6 +97,8 @@ def main( action_id (str): The action ID. service_account_id (str): The service account ID. event_source_id (str): The event source ID. + zenml_model_name (str): The ZenML model name. + zenml_model_version (str): The ZenML model version. """ client = Client() config_path = Path(__file__).parent / "configs" / config @@ -87,14 +106,46 @@ def main( with open(config_path, "r") as file: config = yaml.safe_load(file) + # Read the model version from a file in the root of the repo + # called "ZENML_VERSION.txt". + if zenml_model_version == "staging": + postfix = "-rc0" + elif zenml_model_version == "production": + postfix = "" + else: + postfix = "-dev" + + if Path("ZENML_VERSION.txt").exists(): + with open("ZENML_VERSION.txt", "r") as file: + zenml_model_version = file.read().strip() + zenml_model_version += postfix + else: + raise RuntimeError( + "No model version file found. Please create a file called ZENML_VERSION.txt in the root of the repo with the model version." + ) + + zenml_model = Model( + name=zenml_model_name, + version=zenml_model_version, + license="Apache 2.0", + description="RAG application for ZenML docs", + tags=["rag", "finetuned", "chatbot"], + limitations="Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split.", + trade_offs="Focused on a specific RAG retrieval use case. 
Not generalizable to other domains.", + audience="ZenML users", + use_cases="RAG retrieval", + ) + if create_template: # run pipeline - run = llm_basic_rag.with_options( - config_path=str(config_path), enable_cache=not no_cache + run = llm_index_and_evaluate.with_options( + model=zenml_model, + config_path=str(config_path), + enable_cache=not no_cache, )() # create new run template rt = client.create_run_template( - name=f"production-llm-complete-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}", + name=f"zenml-docs-qa-chatbot-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}", deployment_id=run.deployment_id, ) @@ -144,8 +195,10 @@ def main( ) else: - llm_basic_rag.with_options( - config_path=str(config_path), enable_cache=not no_cache + llm_index_and_evaluate.with_options( + model=zenml_model, + config_path=str(config_path), + enable_cache=not no_cache, )() diff --git a/llm-complete-guide/notebooks/reranking.ipynb b/llm-complete-guide/notebooks/reranking.ipynb index 94342811..80f8507a 100644 --- a/llm-complete-guide/notebooks/reranking.ipynb +++ b/llm-complete-guide/notebooks/reranking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -12,128 +12,9 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading TransformerRanker model mixedbread-ai/mxbai-rerank-large-v1\n", - "No device set\n", - "Using device cuda\n", - "No dtype set\n", - "Using dtype torch.float16\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "867edac78ccb49aea85b6e96c03c201b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/970 [00:00RankedResults(\n", - " results=[\n", - " Result(doc_id=0, text='I like to play soccer', score=-1.2607421875, rank=1),\n", - " Result(doc_id=2, text='I like to 
play basketball', score=-1.2890625, rank=2),\n", - " Result(doc_id=1, text='I like to play football', score=-1.9384765625, rank=3),\n", - " Result(doc_id=3, text='I love dogs', score=-5.12109375, rank=4),\n", - " Result(doc_id=4, text='Catcher in the Rye is a great book', score=-6.19140625, rank=5)\n", - " ],\n", - " query=\"What's your favorite sport?\",\n", - " has_scores=True\n", - ")\n", - "\n" - ], - "text/plain": [ - "\u001b[1;35mRankedResults\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mresults\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play soccer'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.2607421875\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play basketball'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.2890625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m2\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play football'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.9384765625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I love dogs'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.12109375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m4\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'Catcher in the Rye is a great book'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-6.19140625\u001b[0m, 
\u001b[33mrank\u001b[0m=\u001b[1;36m5\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mquery\u001b[0m=\u001b[32m\"What\u001b[0m\u001b[32m's your favorite sport?\"\u001b[0m,\n", - " \u001b[33mhas_scores\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results = ranker.rank(query=\"What's your favorite sport?\", docs=texts)\n", "\n", @@ -207,35 +56,9 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[\n",
-       "    'I like to play soccer',\n",
-       "    'I like to play basketball',\n",
-       "    'I like to play football',\n",
-       "    'I love dogs',\n",
-       "    'Catcher in the Rye is a great book'\n",
-       "]\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[32m'I like to play soccer'\u001b[0m,\n", - " \u001b[32m'I like to play basketball'\u001b[0m,\n", - " \u001b[32m'I like to play football'\u001b[0m,\n", - " \u001b[32m'I love dogs'\u001b[0m,\n", - " \u001b[32m'Catcher in the Rye is a great book'\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print([document.text for document in results.results])" ] @@ -256,36 +79,24 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "attempted relative import with no known parent package", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[46], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msteps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_retrieval\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m query_similar_docs\n", - "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" - ] - } - ], + "outputs": [], "source": [ "embedded_question = get_embeddings(question)\n", - " db_conn = get_db_conn()\n", - " num_docs = 20 if use_reranking else 5\n", - " # get (content, url) tuples for the top n similar documents\n", - " top_similar_docs = get_topn_similar_docs(\n", - " embedded_question, db_conn, n=num_docs, include_metadata=True\n", - " )\n", - "\n", - " if use_reranking:\n", - " urls = rerank_documents(question, top_similar_docs)[:5]\n", - " else:\n", - " urls = [doc[1] for doc in top_similar_docs] # Unpacking URLs\n", - "\n", - " return (question, url_ending, urls)\n" + 
"db_conn = get_db_conn()\n", + "num_docs = 20 if use_reranking else 5\n", + "# get (content, url) tuples for the top n similar documents\n", + "top_similar_docs = get_topn_similar_docs(\n", + " embedded_question, db_conn, n=num_docs, include_metadata=True\n", + ")\n", + "\n", + "if use_reranking:\n", + " urls = rerank_documents(question, top_similar_docs)[:5]\n", + "else:\n", + " urls = [doc[1] for doc in top_similar_docs] # Unpacking URLs\n", + "\n", + "return (question, url_ending, urls)" ] }, { @@ -297,32 +108,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ranked documents for query: quick brown fox\n", - "Document: A quick brown fox jumps over the lazy dog\n", - "Score: 0.6937165451385258\n", - "\n", - "Document: The quick brown fox jumps over the lazy dog\n", - "Score: 0.6928630071635998\n", - "\n", - "Document: The quick brown fox is quick and brown\n", - "Score: 0.6868308019742143\n", - "\n", - "Document: The quick brown fox is different from the lazy dog\n", - "Score: 0.6802242759508812\n", - "\n", - "Document: The lazy dog is lazy and sleepy\n", - "Score: 0.5727275080137214\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", @@ -338,7 +126,11 @@ "]\n", "\n", "# Toy queries and their corresponding relevant document indices\n", - "queries = [(\"quick fox\", [0, 1, 2]), (\"lazy dog\", [3, 4]), (\"brown fox\", [0, 1, 2, 4])]\n", + "queries = [\n", + " (\"quick fox\", [0, 1, 2]),\n", + " (\"lazy dog\", [3, 4]),\n", + " (\"brown fox\", [0, 1, 2, 4]),\n", + "]\n", "\n", "# Create TF-IDF vectorizer\n", "vectorizer = TfidfVectorizer()\n", @@ -355,7 +147,9 @@ " query_vector = vectorizer.transform([query])\n", " for doc_idx, doc_vector in enumerate(document_vectors):\n", " X_train.append(\n", - " np.concatenate((query_vector.toarray()[0], 
doc_vector.toarray()[0]))\n", + " np.concatenate(\n", + " (query_vector.toarray()[0], doc_vector.toarray()[0])\n", + " )\n", " )\n", " y_train.append(1 if doc_idx in relevant_docs else 0)\n", "\n", @@ -367,7 +161,9 @@ "scores = []\n", "\n", "for doc_vector in document_vectors:\n", - " input_vector = np.concatenate((query_vector.toarray()[0], doc_vector.toarray()[0]))\n", + " input_vector = np.concatenate(\n", + " (query_vector.toarray()[0], doc_vector.toarray()[0])\n", + " )\n", " score = reranker.predict_proba([input_vector])[0][1]\n", " scores.append(score)\n", "\n", @@ -381,28 +177,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading default cross-encoder model for language en\n", - "Warning: Model type could not be auto-mapped with the defaults list. Defaulting to TransformerRanker.\n", - "If your model is NOT intended to be ran as a one-label cross-encoder, please reload it and specify the model_type! Otherwise, you may ignore this warning. You may specify `model_type='cross-encoder'` to suppress this warning in the future.\n", - "Default Model: mixedbread-ai/mxbai-rerank-base-v1\n", - "Loading TransformerRanker model mixedbread-ai/mxbai-rerank-base-v1\n", - "No device set\n", - "Using device cuda\n", - "No dtype set\n", - "Using dtype torch.float16\n", - "Loaded model mixedbread-ai/mxbai-rerank-base-v1\n", - "Using device cuda.\n", - "Using dtype torch.float16.\n" - ] - } - ], + "outputs": [], "source": [ "from rerankers import Reranker\n", "\n", @@ -422,45 +199,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RankedResults(\n",
-       "    results=[\n",
-       "        Result(doc_id=5, text='I like to play basketball', score=-0.46533203125, rank=1),\n",
-       "        Result(doc_id=0, text='I like to play soccer', score=-0.7353515625, rank=2),\n",
-       "        Result(doc_id=1, text='I like to play football', score=-0.9677734375, rank=3),\n",
-       "        Result(doc_id=2, text='War and Peace is a great book', score=-5.40234375, rank=4),\n",
-       "        Result(doc_id=3, text='I love dogs', score=-5.5859375, rank=5),\n",
-       "        Result(doc_id=4, text=\"Ginger cats aren't very smart\", score=-5.94921875, rank=6)\n",
-       "    ],\n",
-       "    query=\"What's your favorite sport?\",\n",
-       "    has_scores=True\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRankedResults\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mresults\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m5\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play basketball'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.46533203125\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play soccer'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.7353515625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m2\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play football'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.9677734375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'War and Peace is a great book'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.40234375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I love dogs'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.5859375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m5\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m4\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m\"Ginger\u001b[0m\u001b[32m cats aren't very smart\"\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.94921875\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m6\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - 
" \u001b[33mquery\u001b[0m=\u001b[32m\"What\u001b[0m\u001b[32m's your favorite sport?\"\u001b[0m,\n", - " \u001b[33mhas_scores\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(results)" ] @@ -475,7 +216,7 @@ ], "metadata": { "kernelspec": { - "display_name": "new-rag", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -489,9 +230,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.9" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/llm-complete-guide/pipelines/__init__.py b/llm-complete-guide/pipelines/__init__.py index e93f1401..ae127fa3 100644 --- a/llm-complete-guide/pipelines/__init__.py +++ b/llm-complete-guide/pipelines/__init__.py @@ -20,3 +20,4 @@ from pipelines.llm_basic_rag import llm_basic_rag from pipelines.llm_eval import llm_eval from pipelines.rag_deployment import rag_deployment +from pipelines.llm_index_and_evaluate import llm_index_and_evaluate \ No newline at end of file diff --git a/llm-complete-guide/pipelines/llm_basic_rag.py b/llm-complete-guide/pipelines/llm_basic_rag.py index 895c4df3..82a97b21 100644 --- a/llm-complete-guide/pipelines/llm_basic_rag.py +++ b/llm-complete-guide/pipelines/llm_basic_rag.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from litellm import config_path from steps.populate_index import ( generate_embeddings, diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py index 8f604dac..0a83c5b1 100644 --- a/llm-complete-guide/pipelines/llm_eval.py +++ b/llm-complete-guide/pipelines/llm_eval.py @@ -29,16 +29,16 @@ @pipeline(enable_cache=False) -def llm_eval() -> None: +def llm_eval(after: Optional[str] = None) -> None: """Executes the pipeline to evaluate a RAG pipeline.""" # Retrieval evals - failure_rate_retrieval = retrieval_evaluation_small() - full_retrieval_answers = retrieval_evaluation_full() + failure_rate_retrieval = retrieval_evaluation_small(after=after) + full_retrieval_answers = retrieval_evaluation_full(after=after) failure_rate_retrieval_reranking = ( - retrieval_evaluation_small_with_reranking() + retrieval_evaluation_small_with_reranking(after=after) ) full_retrieval_answers_reranking = ( - retrieval_evaluation_full_with_reranking() + retrieval_evaluation_full_with_reranking(after=after) ) # E2E evals @@ -46,13 +46,13 @@ def llm_eval() -> None: failure_rate_bad_answers, failure_rate_bad_immediate_responses, failure_rate_good_responses, - ) = e2e_evaluation() + ) = e2e_evaluation(after=after) ( average_toxicity_score, average_faithfulness_score, average_helpfulness_score, average_relevance_score, - ) = e2e_evaluation_llm_judged() + ) = e2e_evaluation_llm_judged(after=after) visualize_evaluation_results( failure_rate_retrieval, diff --git a/llm-complete-guide/pipelines/llm_index_and_evaluate.py b/llm-complete-guide/pipelines/llm_index_and_evaluate.py new file mode 100644 index 00000000..16423867 --- /dev/null +++ b/llm-complete-guide/pipelines/llm_index_and_evaluate.py @@ -0,0 +1,35 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pipelines import llm_basic_rag, llm_eval +from zenml import pipeline + + +@pipeline +def llm_index_and_evaluate() -> None: + """Executes the pipeline to train a basic RAG model. + + This function performs the following steps: + 1. Scrapes URLs using the url_scraper function. + 2. Loads documents from the scraped URLs using the web_url_loader function. + 3. Preprocesses the loaded documents using the preprocess_documents function. + 4. Generates embeddings for the preprocessed documents using the generate_embeddings function. + 5. Generates an index for the embeddings and documents using the index_generator function. + 6. Evaluates the RAG pipeline using the llm_eval pipeline. + """ + llm_basic_rag() + llm_eval(after="index_generator") diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index 2152fda4..a2ba1f94 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os import warnings +from pathlib import Path # Suppress the specific FutureWarning from huggingface_hub warnings.filterwarnings( @@ -48,9 +48,11 @@ llm_basic_rag, llm_eval, rag_deployment, + llm_index_and_evaluate, ) from structures import Document from zenml.materializers.materializer_registry import materializer_registry +from zenml import Model logger = get_logger(__name__) @@ -62,33 +64,21 @@ Run the ZenML LLM RAG complete guide project pipelines. 
""" ) -@click.option( - "--rag", - "rag", - is_flag=True, - default=False, - help="Whether to run the pipeline that creates the dataset.", -) -@click.option( - "--deploy", - "deploy", - is_flag=True, - default=False, - help="Whether to deploy a Gradio app to serve the RAG functionality.", -) -@click.option( - "--evaluation", - "evaluation", - is_flag=True, - default=False, - help="Whether to run the evaluation pipeline.", -) -@click.option( - "--query", - "query", - type=str, - required=False, - help="Query the RAG model.", +@click.argument( + "pipeline", + type=click.Choice( + [ + "rag", + "deploy", + "evaluation", + "query", + "synthetic", + "embeddings", + "chunks", + "basic_rag", + ] + ), + required=True, ) @click.option( "--model", @@ -106,47 +96,40 @@ help="The model to use for the completion.", ) @click.option( - "--no-cache", - "no_cache", - is_flag=True, - default=False, - help="Disable cache.", + "--zenml-model-name", + "zenml_model_name", + default="zenml-docs-qa-chatbot", + required=False, + help="The name of the ZenML model to use.", ) @click.option( - "--synthetic", - "synthetic", - is_flag=True, - default=False, - help="Run the synthetic data pipeline.", + "--zenml-model-version", + "zenml_model_version", + required=False, + default=None, + help="The name of the ZenML model version to use.", ) @click.option( - "--embeddings", - "embeddings", + "--no-cache", + "no_cache", is_flag=True, default=False, - help="Fine-tunes embeddings.", + help="Disable cache.", ) @click.option( "--argilla", - "argilla", + "use_argilla", is_flag=True, default=False, help="Uses Argilla annotations.", ) @click.option( "--reranked", - "reranked", + "use_reranker", is_flag=True, default=False, help="Whether to use the reranker.", ) -@click.option( - "--chunks", - "chunks", - is_flag=True, - default=False, - help="Generate chunks for Hugging Face dataset", -) @click.option( "--config", "config", @@ -154,108 +137,140 @@ help="Path to config", ) def main( - rag: bool = False, - 
deploy: bool = False, - evaluation: bool = False, - query: Optional[str] = None, + pipeline: str, + query_text: Optional[str] = None, model: str = OPENAI_MODEL, + zenml_model_name: str = "zenml-docs-qa-chatbot", + zenml_model_version: str = None, no_cache: bool = False, - synthetic: bool = False, - embeddings: bool = False, - argilla: bool = False, - reranked: bool = False, - chunks: bool = False, - config: str = None, + use_argilla: bool = False, + use_reranker: bool = False, + config: Optional[str] = None, ): """Main entry point for the pipeline execution. Args: - rag (bool): If `True`, the basic RAG pipeline will be run. - deploy (bool): If `True`, a Gradio app will be deployed to serve the RAG functionality. - evaluation (bool): If `True`, the evaluation pipeline will be run. - query (Optional[str]): If provided, the RAG model will be queried with this string. - model (str): The model to use for the completion. Default is OPENAI_MODEL. - no_cache (bool): If `True`, cache will be disabled. - synthetic (bool): If `True`, the synthetic data pipeline will be run. - embeddings (bool): If `True`, the embeddings will be fine-tuned. - argilla (bool): If `True`, the Argilla annotations will be used. - chunks (bool): If `True`, the chunks pipeline will be run. - reranked (bool): If `True`, rerankers will be used - config (str): Path to config + pipeline (str): The pipeline to execute (rag, deploy, evaluation, etc.) 
+ query_text (Optional[str]): Query text when using 'query' command + model (str): The model to use for the completion + zenml_model_name (str): The name of the ZenML model to use + zenml_model_version (str): The name of the ZenML model version to use + no_cache (bool): If True, cache will be disabled + use_argilla (bool): If True, Argilla annotations will be used + use_reranker (bool): If True, rerankers will be used + config (Optional[str]): Path to config file """ pipeline_args = {"enable_cache": not no_cache} embeddings_finetune_args = { "enable_cache": not no_cache, "steps": { "prepare_load_data": { - "parameters": {"use_argilla_annotations": argilla} + "parameters": {"use_argilla_annotations": use_argilla} } }, } + + # Read the model version from a file in the root of the repo + # called "ZENML_VERSION.txt". + if zenml_model_version == "staging": + postfix = "-rc0" + elif zenml_model_version == "production": + postfix = "" + else: + postfix = "-dev" - if query: - response = process_input_with_retrieval( - query, model=model, use_reranking=reranked + if Path("ZENML_VERSION.txt").exists(): + with open("ZENML_VERSION.txt", "r") as file: + zenml_model_version = file.read().strip() + zenml_model_version += postfix + else: + raise RuntimeError( + "No model version file found. Please create a file called ZENML_VERSION.txt in the root of the repo with the model version." ) - # print rich markdown to the console - console = Console() - md = Markdown(response) - console.print(md) + # Create ZenML model + zenml_model = Model( + name=zenml_model_name, + version=zenml_model_version, + license="Apache 2.0", + description="RAG application for ZenML docs", + tags=["rag", "finetuned", "chatbot"], + limitations="Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split.", + trade_offs="Focused on a specific RAG retrieval use case. 
Not generalizable to other domains.", + audience="ZenML users", + use_cases="RAG retrieval", + ) + # Handle config path config_path = None if config: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - config, - ) + config_path = Path(__file__).parent / "configs" / config + + # Set default config paths based on pipeline + if not config_path: + config_mapping = { + "basic_rag": "dev/rag.yaml", + "rag": "dev/rag.yaml", + "evaluation": "dev/rag_eval.yaml", + "synthetic": "dev/synthetic.yaml", + "embeddings": "dev/embeddings.yaml", + } + if pipeline in config_mapping: + config_path = ( + Path(__file__).parent / "configs" / config_mapping[pipeline] + ) - if rag: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "rag_local_dev.yaml", + # Execute query + if pipeline == "query": + if not query_text: + raise click.UsageError( + "--query-text is required when using 'query' command" ) - llm_basic_rag.with_options(config_path=config_path, **pipeline_args)() - if deploy: + response = process_input_with_retrieval( + query_text, model=model, use_reranking=use_reranker + ) + console = Console() + md = Markdown(response) + console.print(md) + return + + # Execute the appropriate pipeline + if pipeline == "basic_rag": + llm_basic_rag.with_options( + model=zenml_model, config_path=config_path, **pipeline_args + )() + # Also deploy if config is provided + if config: rag_deployment.with_options( config_path=config_path, **pipeline_args )() - if deploy: - rag_deployment.with_options(**pipeline_args)() - if evaluation: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "rag_eval.yaml", - ) + + if pipeline == "rag": + llm_index_and_evaluate.with_options( + model=zenml_model, config_path=config_path, **pipeline_args + )() + + elif pipeline == "deploy": + rag_deployment.with_options(model=zenml_model, **pipeline_args)() + + 
elif pipeline == "evaluation": pipeline_args["enable_cache"] = False - llm_eval.with_options(config_path=config_path)() - if synthetic: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "synthetic.yaml", - ) + llm_eval.with_options(model=zenml_model, config_path=config_path)() + + elif pipeline == "synthetic": generate_synthetic_data.with_options( - config_path=config_path, **pipeline_args + model=zenml_model, config_path=config_path, **pipeline_args )() - if embeddings: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "embeddings.yaml", - ) + + elif pipeline == "embeddings": finetune_embeddings.with_options( - config_path=config_path, **embeddings_finetune_args + model=zenml_model, config_path=config_path, **embeddings_finetune_args + )() + + elif pipeline == "chunks": + generate_chunk_questions.with_options( + model=zenml_model, config_path=config_path, **pipeline_args )() - if chunks: - generate_chunk_questions.with_options(**pipeline_args)() if __name__ == "__main__": diff --git a/llm-complete-guide/steps/eval_visualisation.py b/llm-complete-guide/steps/eval_visualisation.py index 4b7b004b..badd62c1 100644 --- a/llm-complete-guide/steps/eval_visualisation.py +++ b/llm-complete-guide/steps/eval_visualisation.py @@ -65,7 +65,7 @@ def create_image( fontweight="bold", ) else: - bar_color = colors[i] if alternate_colours else "blue" + colors[i] if alternate_colours else "blue" text_color = "white" ax.text( v diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py index 2772ca04..bb17dc94 100644 --- a/llm-complete-guide/steps/populate_index.py +++ b/llm-complete-guide/steps/populate_index.py @@ -35,7 +35,10 @@ from sentence_transformers import SentenceTransformer from structures import Document from utils.llm_utils import get_db_conn, split_documents -from zenml import ArtifactConfig, 
log_artifact_metadata, step +from zenml import ArtifactConfig, log_artifact_metadata, step, log_model_metadata +from zenml.metadata.metadata_types import Uri +from zenml.client import Client +from constants import SECRET_NAME logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -443,26 +446,15 @@ def draw_bar_chart( labels: List[str], title: str, ) -> None: - """Draws a bar chart on the given image. - - Args: - draw: The ImageDraw object to draw on - x: The x coordinate of the top-left corner - y: The y coordinate of the top-left corner - width: The total width of the chart area - height: The total height of the chart area - data: List of values for each bar - labels: List of labels for each bar - title: The title of the chart - - Returns: - None - """ - if label is None: - label = "" - + """Draws a bar chart on the given image.""" + # Ensure labels is a list, even if empty + labels = labels or [] + + # Skip drawing if no data + if not data: + return + max_value = max(data) - bar_width = width // len(data) bar_spacing = 10 @@ -486,14 +478,15 @@ def draw_bar_chart( title_y = y - 30 draw.text((title_x, title_y), title, font=title_font, fill="black") - for i, label in enumerate(labels): - font = ImageFont.load_default(size=10) - bbox = draw.textbbox((0, 0), label, font=font) - label_width = bbox[2] - bbox[0] - label_x = ( - x + i * (bar_width + bar_spacing) + (bar_width - label_width) // 2 - ) - draw.text((label_x, y + height - 15), label, font=font, fill="black") + # Only try to draw labels if they exist + if labels: + for i, label in enumerate(labels): + if label is not None: # Add null check for individual labels + font = ImageFont.load_default(size=10) + bbox = draw.textbbox((0, 0), str(label), font=font) # Convert to string + label_width = bbox[2] - bbox[0] + label_x = x + i * (bar_width + bar_spacing) + (bar_width - label_width) // 2 + draw.text((label_x, y + height - 15), str(label), font=font, fill="black") @step @@ -693,3 +686,47 @@ 
def index_generator( finally: if conn: conn.close() + + # Log the model metadata + prompt = """ + You are a friendly chatbot. \ + You can answer questions about ZenML, its features and its use cases. \ + You respond in a concise, technically credible tone. \ + You ONLY use the context from the ZenML documentation to provide relevant + answers. \ + You do not make up answers or provide opinions that you don't have + information to support. \ + If you are unsure or don't know, just say so. \ + """ + + client = Client() + CONNECTION_DETAILS = { + "user": client.get_secret(SECRET_NAME).secret_values["supabase_user"], + "password": "**********", + "host": client.get_secret(SECRET_NAME).secret_values["supabase_host"], + "port": client.get_secret(SECRET_NAME).secret_values["supabase_port"], + "dbname": "postgres", + } + + log_model_metadata( + metadata={ + "embeddings": { + "model": EMBEDDINGS_MODEL, + "dimensionality": EMBEDDING_DIMENSIONALITY, + "model_url": Uri( + f"https://huggingface.co/{EMBEDDINGS_MODEL}" + ), + }, + "prompt": { + "content": prompt, + }, + "vector_store": { + "name": "pgvector", + "connection_details": CONNECTION_DETAILS, + # TODO: Hard-coded for now + "database_url": Uri( + "https://supabase.com/dashboard/project/rkoiacgkeiwpwceahtlp/editor/29505?schema=public" + ), + }, + }, + ) diff --git a/llm-complete-guide/steps/rag_deployment.py b/llm-complete-guide/steps/rag_deployment.py index a750dde6..99a8c911 100644 --- a/llm-complete-guide/steps/rag_deployment.py +++ b/llm-complete-guide/steps/rag_deployment.py @@ -2,6 +2,8 @@ import webbrowser from huggingface_hub import HfApi + +from utils.hf_utils import get_hf_token from utils.llm_utils import process_input_with_retrieval from zenml import step from zenml.client import Client @@ -9,9 +11,8 @@ secret = Client().get_secret("llm-complete") -ZENML_API_TOKEN = secret.secret_values["zenml_api_token"] -ZENML_STORE_URL = secret.secret_values["zenml_store_url"] -HF_TOKEN = os.getenv("HF_TOKEN") 
+ZENML_API_TOKEN = os.environ.get("ZENML_API_TOKEN") +ZENML_STORE_URL = os.environ.get("ZENML_STORE_URL") SPACE_USERNAME = os.environ.get("ZENML_HF_USERNAME", "zenml") SPACE_NAME = os.environ.get("ZENML_HF_SPACE_NAME", "llm-complete-guide-rag") @@ -50,7 +51,7 @@ def predict(message, history): def upload_files_to_repo( - api, repo_id: str, files_mapping: dict, token: str = HF_TOKEN + api, repo_id: str, files_mapping: dict, token: str ): """Upload multiple files to a Hugging Face repository @@ -89,7 +90,7 @@ def gradio_rag_deployment() -> None: space_sdk="gradio", private=True, exist_ok=True, - token=HF_TOKEN, + token=get_hf_token(), ) api.add_space_secret( repo_id=hf_repo_id, @@ -112,6 +113,6 @@ def gradio_rag_deployment() -> None: hf_repo_requirements: "requirements.txt", } - upload_files_to_repo(api, hf_repo_id, files_to_upload, HF_TOKEN) + upload_files_to_repo(api, hf_repo_id, files_to_upload, get_hf_token()) webbrowser.open(f"https://huggingface.co/spaces/{hf_repo_id}") diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py index f7910e26..9c54563b 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -21,11 +21,12 @@ from steps.url_scraping_utils import get_all_pages -@step(enable_cache=True) +@step def url_scraper( docs_url: str = "https://docs.zenml.io", repo_url: str = "https://github.com/zenml-io/zenml", website_url: str = "https://zenml.io", + use_dev_set: bool = False ) -> Annotated[str, ArtifactConfig(name="urls")]: """Generates a list of relevant URLs to scrape. 
@@ -39,18 +40,20 @@ def url_scraper( """ # We comment this out to make this pipeline faster # examples_readme_urls = get_nested_readme_urls(repo_url) - docs_urls = get_all_pages(docs_url) + use_dev_set = False + if use_dev_set: - # FOR TESTING ONLY - # docs_urls = [ - # "https://docs.zenml.io/getting-started/system-architectures", - # "https://docs.zenml.io/getting-started/core-concepts", - # "https://docs.zenml.io/user-guide/llmops-guide/rag-with-zenml/rag-85-loc", - # "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", - # "https://docs.zenml.io/how-to/debug-and-solve-issues", - # "https://docs.zenml.io/stack-components/step-operators/azureml", - # "https://docs.zenml.io/how-to/interact-with-secrets", - # ] + docs_urls = [ + "https://docs.zenml.io/getting-started/system-architectures", + "https://docs.zenml.io/getting-started/core-concepts", + "https://docs.zenml.io/user-guide/llmops-guide/rag-with-zenml/rag-85-loc", + "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", + "https://docs.zenml.io/how-to/debug-and-solve-issues", + "https://docs.zenml.io/stack-components/step-operators/azureml", + "https://docs.zenml.io/how-to/interact-with-secrets", + ] + else: + docs_urls = get_all_pages(docs_url) # website_urls = get_all_pages(website_url) # all_urls = docs_urls + website_urls + examples_readme_urls diff --git a/llm-complete-guide/steps/url_scraping_utils.py b/llm-complete-guide/steps/url_scraping_utils.py index 5adc42a5..d6367cbf 100644 --- a/llm-complete-guide/steps/url_scraping_utils.py +++ b/llm-complete-guide/steps/url_scraping_utils.py @@ -13,200 +13,36 @@ # permissions and limitations under the License. 
import re -from functools import lru_cache -from logging import getLogger -from time import sleep -from typing import List, Set, Tuple -from urllib.parse import urljoin, urlparse - import requests from bs4 import BeautifulSoup -from constants import RATE_LIMIT -from ratelimit import limits, sleep_and_retry - -logger = getLogger(__name__) - - -def is_valid_url(url: str, base: str) -> bool: - """ - Check if the given URL is valid, has the same base as the provided base, - and does not contain any version-specific paths. - - Args: - url (str): The URL to check. - base (str): The base URL to compare against. - - Returns: - bool: True if the URL is valid, has the same base, and does not contain version-specific paths, False otherwise. - """ - parsed = urlparse(url) - if not bool(parsed.netloc) or parsed.netloc != base: - return False - - # Check if the URL contains a version pattern (e.g., /v/0.x.x/) - version_pattern = r"/v/0\.\d+\.\d+/" - return not re.search(version_pattern, url) - - -def strip_query_params(url: str) -> str: - """Strip query parameters from a URL. - - Args: - url (str): The URL to strip query parameters from. - - Returns: - str: The URL without query parameters. - """ - return url.split("?")[0] - - -def get_all_pages(url: str) -> List[str]: - """ - Retrieve all pages with the same base as the given URL. - - Args: - url (str): The URL to retrieve pages from. - - Returns: - List[str]: A list of all pages with the same base. 
- """ - logger.info(f"Scraping all pages from {url}...") - base_url = urlparse(url).netloc - - # Use a queue-based approach instead of recursion - pages = set() - queue = [url] - while queue: - current_url = queue.pop(0) - if current_url not in pages: - pages.add(current_url) - links = get_all_links(current_url, base_url) - queue.extend(links) - sleep(1 / RATE_LIMIT) # Rate limit the requests - - stripped_pages = [strip_query_params(page) for page in pages] - - logger.info(f"Found {len(stripped_pages)} pages.") - logger.info("Done scraping pages.") - return list(stripped_pages) - - -def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]: - """ - Recursively crawl a URL and its links, retrieving all valid links with the same base. - - Args: - url (str): The URL to crawl. - base (str): The base URL to compare against. - visited (Set[str]): A set of URLs that have been visited. Defaults to None. - - Returns: - Set[str]: A set of all valid links with the same base. - """ - if visited is None: - visited = set() - - visited.add(url) - logger.debug(f"Crawling URL: {url}") - links = get_all_links(url, base) - - for link in links: - if link not in visited: - visited.update(crawl(link, base, visited)) - sleep(1 / RATE_LIMIT) # Rate limit the recursive calls - - return visited - - -@sleep_and_retry -@limits(calls=RATE_LIMIT, period=1) -@lru_cache(maxsize=128) -def get_all_links(url: str, base: str) -> List[str]: - """ - Retrieve all valid links from a given URL with the same base. - - Args: - url (str): The URL to retrieve links from. - base (str): The base URL to compare against. - - Returns: - List[str]: A list of valid links with the same base. 
- """ - logger.debug(f"Retrieving links from {url}") - response = requests.get(url) - soup = BeautifulSoup(response.text, "html.parser") - links = [] - - for link in soup.find_all("a", href=True): - href = link["href"] - full_url = urljoin(url, href) - parsed_url = urlparse(full_url) - cleaned_url = parsed_url._replace(fragment="").geturl() - if is_valid_url(cleaned_url, base): - print(cleaned_url) - links.append(cleaned_url) - - logger.debug(f"Found {len(links)} valid links from {url}") - return links - - -@sleep_and_retry -@limits(calls=RATE_LIMIT, period=1) -@lru_cache(maxsize=128) -def get_readme_urls(repo_url: str) -> Tuple[List[str], List[str]]: - """ - Retrieve folder and README links from a GitHub repository. - - Args: - repo_url (str): The URL of the GitHub repository. - - Returns: - Tuple[List[str], List[str]]: A tuple containing two lists: folder links and README links. - """ - logger.debug(f"Retrieving README links from {repo_url}") - headers = {"Accept": "application/vnd.github+json"} - r = requests.get(repo_url, headers=headers) - soup = BeautifulSoup(r.text, "html.parser") - - folder_links = [] - readme_links = [] - - for link in soup.find_all("a", class_="js-navigation-open Link--primary"): - href = link["href"] - full_url = f"https://github.com{href}" - if "tree" in href: - folder_links.append(full_url) - elif "README.md" in href: - readme_links.append(full_url) +from typing import List +from logging import getLogger - logger.debug( - f"Found {len(folder_links)} folder links and {len(readme_links)} README links from {repo_url}" - ) - return folder_links, readme_links +logger = getLogger(__name__) -def get_nested_readme_urls(repo_url: str) -> List[str]: +def get_all_pages(base_url: str = "https://docs.zenml.io") -> List[str]: """ - Retrieve all nested README links from a GitHub repository. + Retrieve all pages from the ZenML documentation sitemap. Args: - repo_url (str): The URL of the GitHub repository. 
+ base_url (str): The base URL of the documentation. Defaults to "https://docs.zenml.io" Returns: - List[str]: A list of all nested README links. + List[str]: A list of all documentation page URLs. """ - logger.info(f"Retrieving nested README links from {repo_url}...") - folder_links, readme_links = get_readme_urls(repo_url) - - for folder_link in folder_links: - _, nested_readme_links = get_readme_urls(folder_link) - readme_links.extend(nested_readme_links) - - logger.info( - f"Found {len(readme_links)} nested README links from {repo_url}" - ) - return readme_links - + logger.info("Fetching sitemap from docs.zenml.io...") + + # Fetch the sitemap + sitemap_url = f"{base_url}/sitemap.xml" + response = requests.get(sitemap_url) + soup = BeautifulSoup(response.text, "xml") + + # Extract all URLs from the sitemap + urls = [loc.text for loc in soup.find_all("loc")] + + logger.info(f"Found {len(urls)} pages in the sitemap.") + return urls def extract_parent_section(url: str) -> str: """ diff --git a/llm-complete-guide/utils/hf_utils.py b/llm-complete-guide/utils/hf_utils.py new file mode 100644 index 00000000..2de954fa --- /dev/null +++ b/llm-complete-guide/utils/hf_utils.py @@ -0,0 +1,8 @@ +from constants import SECRET_NAME +from zenml.client import Client + + +def get_hf_token() -> str: + api_key = Client().get_secret(SECRET_NAME).secret_values["hf_token"] + + return api_key diff --git a/llm-complete-guide/utils/openai_utils.py b/llm-complete-guide/utils/openai_utils.py index e67ba5f9..15b84cc5 100644 --- a/llm-complete-guide/utils/openai_utils.py +++ b/llm-complete-guide/utils/openai_utils.py @@ -2,7 +2,7 @@ from zenml.client import Client -def get_openai_api_key(): +def get_openai_api_key() -> str: api_key = Client().get_secret(SECRET_NAME).secret_values["openai_api_key"] return api_key