Merge pull request #1020 from nkwangleiGIT/main
Add ray-serve-vllm deployment sample
nkwangleiGIT authored Apr 29, 2024
2 parents 594fd59 + 179e6f9 commit dd1e60c
Showing 10 changed files with 278 additions and 7 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.app
@@ -0,0 +1,5 @@
# the base image is built from Dockerfile.vllm.ray
FROM vllm/vllm-openai:ray-2.11.0-py3.10.12-patched

# Copy the packaged python application
COPY llm-serving-app.zip /vllm-workspace/
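
The PR does not show how llm-serving-app.zip is produced. Below is a minimal packaging sketch, assuming the sources added in this PR sit under deploy/ray-serve-vllm/llm-serving-app/ (llm-serving.py plus templates/) and that a zip used as a Ray Serve working_dir should unpack to a single top-level directory:

# Hypothetical packaging step (not part of this PR): build llm-serving-app.zip
# so that Dockerfile.app can COPY it into /vllm-workspace/.
import shutil

shutil.make_archive(
    base_name="llm-serving-app",       # writes llm-serving-app.zip to the current directory
    format="zip",
    root_dir="deploy/ray-serve-vllm",  # assumed location of the app sources in this repo
    base_dir="llm-serving-app",        # keep llm-serving-app/ as the single top-level entry
)

The resulting image is presumably tagged vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app, the tag referenced by raycluster-serve-llm.yaml further down.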
10 changes: 10 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.vllm.ray
@@ -0,0 +1,10 @@
# Use vLLM 0.4.1; the matching Ray version is 2.11.0 for now.
# The Python version is 3.10.12.
FROM vllm/vllm-openai:v0.4.1

# curl/wget for health checks, and ray[default] for the standard Ray components
RUN apt-get update && apt-get install -y curl wget && pip install 'ray[default]' -i https://pypi.mirrors.ustc.edu.cn/simple/

# Patch for vLLM; can be removed once https://github.com/vllm-project/vllm/issues/2683 is fixed
COPY vllm-patched/serving_chat.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_chat.py
COPY vllm-patched/serving_engine.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_engine.py
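
The two patched files overwrite modules inside the installed vLLM package. A quick sanity check, assuming it is run with the Python interpreter inside the resulting image:

# Hypothetical sanity check (not part of this PR): confirm the pinned vLLM version
# and that the patched OpenAI entrypoint modules still import cleanly.
import vllm
import vllm.entrypoints.openai.serving_chat
import vllm.entrypoints.openai.serving_engine

assert vllm.__version__ == "0.4.1", vllm.__version__
print("patched serving modules import OK")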
141 changes: 141 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/llm-serving.py
@@ -0,0 +1,141 @@
import json
import logging
from typing import AsyncGenerator

import ray
import fastapi
# from huggingface_hub import login
from ray import serve

from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext


TIMEOUT_KEEP_ALIVE = 5 # seconds

logger = logging.getLogger("ray.serve")

app = fastapi.FastAPI()

# Modified based on https://github.com/vllm-project/vllm/blob/v0.4.1/vllm/entrypoints/openai/api_server.py

@serve.deployment(num_replicas=1)
@serve.ingress(app)
class VLLMPredictDeployment():
def __init__(self, **kwargs):
"""
Construct a VLLM deployment.
Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
for the full list of arguments.
Args:
            model: name or path of the Hugging Face model to use.
            download_dir: directory to download and load the weights;
                defaults to the Hugging Face cache directory.
            use_np_weights: save a numpy copy of the model weights for
                faster loading. This can increase disk usage by up to 2x.
            use_dummy_weights: use dummy values for model weights.
            dtype: data type for model weights and activations.
                The "auto" option uses FP16 precision for FP32 and FP16
                models, and BF16 precision for BF16 models.
            seed: random seed.
            worker_use_ray: use Ray for distributed serving; set
                automatically when more than one GPU is used.
            pipeline_parallel_size: number of pipeline stages.
            tensor_parallel_size: number of tensor parallel replicas.
            block_size: token block size.
            swap_space: CPU swap space size (GiB) per GPU.
            gpu_memory_utilization: the fraction of GPU memory to be used
                for the model executor.
            max_num_batched_tokens: maximum number of batched tokens per
                iteration.
            max_num_seqs: maximum number of sequences per iteration.
            disable_log_stats: disable logging statistics.
            engine_use_ray: use Ray to start the LLM engine in a separate
                process from the server process.
            disable_log_requests: disable logging requests.
"""
        # Hard-coded engine settings for this sample: single GPU, local Qwen1.5-7B-Chat path.
        kwargs = {**kwargs, 'tensor_parallel_size': 1, 'gpu_memory_utilization': 0.9,
                  'model': '/data/models/qwen1.5-7b-chat', 'trust_remote_code': True,
                  'worker_use_ray': True, 'max_model_len': 6000}

logger.info(f"vLLM API server version {vllm.__version__}")
logger.info(f"kwargs: {kwargs}")

args = AsyncEngineArgs(**kwargs)
logger.info(f"args: {args}")
served_model = args.model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
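        # The OpenAI serving layer below normally receives these values from CLI flags;
        # here they are attached to the args object directly: an empty response role,
        # no LoRA modules, and the Qwen chat template shipped in the packaged app.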
args.response_role = ""
args.lora_modules = ""
args.chat_template = "./templates/chat-template-qwen.jinja2"
self.openai_serving_chat = OpenAIServingChat(engine, served_model,
args.response_role,
args.lora_modules,
args.chat_template)
self.openai_serving_completion = OpenAIServingCompletion(
engine, served_model, args.lora_modules)


@app.get("/health")
async def health(self) -> Response:
"""Health check."""
await self.openai_serving_chat.engine.check_health()
return Response(status_code=200)


@app.get("/v1/models")
async def show_available_models(self):
models = await self.openai_serving_chat.show_available_models()
return JSONResponse(content=models.model_dump())


@app.get("/version")
async def show_version(self):
ver = {"version": vllm.__version__}
return JSONResponse(content=ver)


@app.post("/v1/chat/completions")
async def create_chat_completion(self, request: ChatCompletionRequest,
raw_request: Request):
generator = await self.openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())


@app.post("/v1/completions")
async def create_completion(self, request: CompletionRequest, raw_request: Request):
generator = await self.openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())

deployment = VLLMPredictDeployment.bind()
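
For a quick check outside Kubernetes, the bound application can be run with Ray Serve directly. A minimal sketch, assuming a GPU machine where the model is present at the /data/models/qwen1.5-7b-chat path hard-coded above and the app directory is on disk:

# Hypothetical local smoke test (not part of this PR).
import importlib
import sys

import requests
from ray import serve

sys.path.insert(0, "deploy/ray-serve-vllm/llm-serving-app")   # directory containing llm-serving.py
llm_serving = importlib.import_module("llm-serving")          # same module name as the Serve import_path
serve.run(llm_serving.deployment)                             # deploys VLLMPredictDeployment, HTTP on port 8000
print(requests.get("http://127.0.0.1:8000/health", timeout=60).status_code)  # expect 200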
4 changes: 4 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/templates/chat-template-qwen.jinja2
@@ -0,0 +1,4 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}
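
The template above emits Qwen's ChatML-style markup; the literal line breaks inside the quoted strings are intentional. A small rendering check, reproducing the template inline for illustration:

# Hypothetical check (not part of this PR): render the chat template for a single user turn.
from jinja2 import Template

chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

print(Template(chat_template).render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
))
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant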
118 changes: 118 additions & 0 deletions deploy/ray-serve-vllm/raycluster-serve-llm.yaml
@@ -0,0 +1,118 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
name: rayservice-sample
namespace: kuberay-system
spec:
# serveConfigV2 takes a yaml multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
serviceUnhealthySecondThreshold: 900
deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: llm-serving-app
        import_path: llm-serving:deployment
        route_prefix: /
        runtime_env:
          working_dir: file:///vllm-workspace/llm-serving-app.zip
        deployments:
          - name: VLLMPredictDeployment
            num_replicas: 1
rayClusterConfig:
rayVersion: '2.11.0' # should match the Ray version in the image of the containers
######################headGroupSpecs#################################
# Ray head pod template.
headGroupSpec:
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4}"'
dashboard-host: '0.0.0.0'
#pod template
template:
spec:
volumes:
# mount the model from hostPath
- name: model-data
hostPath:
path: /data/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
containers:
- name: ray-head
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
#image: rayproject/ray-ml:2.10.0-py310-zip
resources:
limits:
cpu: 4
memory: 16Gi
requests:
cpu: 2
memory: 2Gi
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265 # Ray dashboard
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
workerGroupSpecs:
    # The number of worker pod replicas in this group.
- replicas: 1
minReplicas: 0
maxReplicas: 5
    # Logical group name; here it is called small-group, but any descriptive name can be used.
groupName: small-group
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4, \"accelerator_type_3090\": 1}"'
#pod template
template:
spec:
volumes:
- name: model-data
hostPath:
path: /NFS/125_bakup/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
- name: dshm
emptyDir:
medium: Memory
sizeLimit: "5.24Gi"
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
#image: rayproject/ray-ml:2.10.0-py310-zip
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "4"
memory: "16Gi"
nvidia.com/gpu: 1
requests:
cpu: "2"
memory: "2Gi"
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
- mountPath: /dev/shm
name: dshm
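
Once the RayService reports healthy, the Serve proxy listens on the serve port (8000) declared on the head pod. A minimal client sketch; the 127.0.0.1:8000 address assumes something like a kubectl port-forward to the head service, which this manifest does not create:

# Hypothetical client call (not part of this PR): exercise the OpenAI-compatible chat endpoint.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "/data/models/qwen1.5-7b-chat",  # must match the served model name in llm-serving.py
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])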
7 changes: 0 additions & 7 deletions deploy/ray/Dockerfile

This file was deleted.
