Merge pull request #1020 from nkwangleiGIT/main
Add ray-serve-vllm deployment sample
nkwangleiGIT authored Apr 29, 2024
2 parents 594fd59 + 179e6f9 commit dd1e60c
Showing 10 changed files with 278 additions and 7 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.app
@@ -0,0 +1,5 @@
# the base image is built from Dockerfile.vllm.ray
FROM vllm/vllm-openai:ray-2.11.0-py3.10.12-patched

# Copy the packaged python application
COPY llm-serving-app.zip /vllm-workspace/
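
The PR does not show how llm-serving-app.zip is produced. Below is a minimal packaging sketch, assuming the sources added in this PR sit under deploy/ray-serve-vllm/llm-serving-app/ (llm-serving.py plus templates/) and that a zip used as a Ray Serve working_dir should unpack to a single top-level directory:

# Hypothetical packaging step (not part of this PR): build llm-serving-app.zip
# so that Dockerfile.app can COPY it into /vllm-workspace/.
import shutil

shutil.make_archive(
    base_name="llm-serving-app",       # writes llm-serving-app.zip to the current directory
    format="zip",
    root_dir="deploy/ray-serve-vllm",  # assumed location of the app sources in this repo
    base_dir="llm-serving-app",        # keep llm-serving-app/ as the single top-level entry
)

The resulting image is presumably tagged vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app, the tag referenced by raycluster-serve-llm.yaml further down.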
10 changes: 10 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.vllm.ray
@@ -0,0 +1,10 @@
# Use vLLM 0.4.1; the matching Ray version is 2.11.0 for now.
# The Python version is 3.10.12.
FROM vllm/vllm-openai:v0.4.1

# curl/wget for health checks, and ray[default] for the standard Ray components
RUN apt-get update && apt-get install -y curl wget && pip install 'ray[default]' -i https://pypi.mirrors.ustc.edu.cn/simple/

# Patch for vLLM; can be removed once https://github.com/vllm-project/vllm/issues/2683 is fixed
COPY vllm-patched/serving_chat.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_chat.py
COPY vllm-patched/serving_engine.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_engine.py
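
The two patched files overwrite modules inside the installed vLLM package. A quick sanity check, assuming it is run with the Python interpreter inside the resulting image:

# Hypothetical sanity check (not part of this PR): confirm the pinned vLLM version
# and that the patched OpenAI entrypoint modules still import cleanly.
import vllm
import vllm.entrypoints.openai.serving_chat
import vllm.entrypoints.openai.serving_engine

assert vllm.__version__ == "0.4.1", vllm.__version__
print("patched serving modules import OK")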
141 changes: 141 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/llm-serving.py
@@ -0,0 +1,141 @@
import json
import logging
from typing import AsyncGenerator

import ray
import fastapi
# from huggingface_hub import login
from ray import serve

from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext


TIMEOUT_KEEP_ALIVE = 5 # seconds

logger = logging.getLogger("ray.serve")

app = fastapi.FastAPI()

# Modified based on https://github.com/vllm-project/vllm/blob/v0.4.1/vllm/entrypoints/openai/api_server.py

@serve.deployment(num_replicas=1)
@serve.ingress(app)
class VLLMPredictDeployment():
def __init__(self, **kwargs):
"""
Construct a VLLM deployment.
Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
for the full list of arguments.
Args:
            model: name or path of the Hugging Face model to use.
            download_dir: directory to download and load the weights;
                defaults to the Hugging Face cache directory.
            use_np_weights: save a numpy copy of the model weights for
                faster loading. This can increase disk usage by up to 2x.
            use_dummy_weights: use dummy values for model weights.
            dtype: data type for model weights and activations.
                The "auto" option uses FP16 precision for FP32 and FP16
                models, and BF16 precision for BF16 models.
            seed: random seed.
            worker_use_ray: use Ray for distributed serving; set
                automatically when more than one GPU is used.
            pipeline_parallel_size: number of pipeline stages.
            tensor_parallel_size: number of tensor parallel replicas.
            block_size: token block size.
            swap_space: CPU swap space size (GiB) per GPU.
            gpu_memory_utilization: the fraction of GPU memory to be used
                for the model executor.
            max_num_batched_tokens: maximum number of batched tokens per
                iteration.
            max_num_seqs: maximum number of sequences per iteration.
            disable_log_stats: disable logging statistics.
            engine_use_ray: use Ray to start the LLM engine in a separate
                process from the server process.
            disable_log_requests: disable logging requests.
"""
        # Hard-coded engine settings for this sample: single GPU, local Qwen1.5-7B-Chat path.
        kwargs = {**kwargs, 'tensor_parallel_size': 1, 'gpu_memory_utilization': 0.9,
                  'model': '/data/models/qwen1.5-7b-chat', 'trust_remote_code': True,
                  'worker_use_ray': True, 'max_model_len': 6000}

logger.info(f"vLLM API server version {vllm.__version__}")
logger.info(f"kwargs: {kwargs}")

args = AsyncEngineArgs(**kwargs)
logger.info(f"args: {args}")
served_model = args.model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
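        # The OpenAI serving layer below normally receives these values from CLI flags;
        # here they are attached to the args object directly: an empty response role,
        # no LoRA modules, and the Qwen chat template shipped in the packaged app.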
args.response_role = ""
args.lora_modules = ""
args.chat_template = "./templates/chat-template-qwen.jinja2"
self.openai_serving_chat = OpenAIServingChat(engine, served_model,
args.response_role,
args.lora_modules,
args.chat_template)
self.openai_serving_completion = OpenAIServingCompletion(
engine, served_model, args.lora_modules)


@app.get("/health")
async def health(self) -> Response:
"""Health check."""
await self.openai_serving_chat.engine.check_health()
return Response(status_code=200)


@app.get("/v1/models")
async def show_available_models(self):
models = await self.openai_serving_chat.show_available_models()
return JSONResponse(content=models.model_dump())


@app.get("/version")
async def show_version(self):
ver = {"version": vllm.__version__}
return JSONResponse(content=ver)


@app.post("/v1/chat/completions")
async def create_chat_completion(self, request: ChatCompletionRequest,
raw_request: Request):
generator = await self.openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())


@app.post("/v1/completions")
async def create_completion(self, request: CompletionRequest, raw_request: Request):
generator = await self.openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())

deployment = VLLMPredictDeployment.bind()
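
For a quick check outside Kubernetes, the bound application can be run with Ray Serve directly. A minimal sketch, assuming a GPU machine where the model is present at the /data/models/qwen1.5-7b-chat path hard-coded above and the app directory is on disk:

# Hypothetical local smoke test (not part of this PR).
import importlib
import sys

import requests
from ray import serve

sys.path.insert(0, "deploy/ray-serve-vllm/llm-serving-app")   # directory containing llm-serving.py
llm_serving = importlib.import_module("llm-serving")          # same module name as the Serve import_path
serve.run(llm_serving.deployment)                             # deploys VLLMPredictDeployment, HTTP on port 8000
print(requests.get("http://127.0.0.1:8000/health", timeout=60).status_code)  # expect 200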
4 changes: 4 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/templates/chat-template-qwen.jinja2
@@ -0,0 +1,4 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}
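
The template above emits Qwen's ChatML-style markup; the literal line breaks inside the quoted strings are intentional. A small rendering check, reproducing the template inline for illustration:

# Hypothetical check (not part of this PR): render the chat template for a single user turn.
from jinja2 import Template

chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

print(Template(chat_template).render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
))
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant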
118 changes: 118 additions & 0 deletions deploy/ray-serve-vllm/raycluster-serve-llm.yaml
@@ -0,0 +1,118 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
name: rayservice-sample
namespace: kuberay-system
spec:
# serveConfigV2 takes a yaml multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
serviceUnhealthySecondThreshold: 900
deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: llm-serving-app
        import_path: llm-serving:deployment
        route_prefix: /
        runtime_env:
          working_dir: file:///vllm-workspace/llm-serving-app.zip
        deployments:
          - name: VLLMPredictDeployment
            num_replicas: 1
rayClusterConfig:
rayVersion: '2.11.0' # should match the Ray version in the image of the containers
######################headGroupSpecs#################################
# Ray head pod template.
headGroupSpec:
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4}"'
dashboard-host: '0.0.0.0'
#pod template
template:
spec:
volumes:
# mount the model from hostPath
- name: model-data
hostPath:
path: /data/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
containers:
- name: ray-head
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
#image: rayproject/ray-ml:2.10.0-py310-zip
resources:
limits:
cpu: 4
memory: 16Gi
requests:
cpu: 2
memory: 2Gi
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265 # Ray dashboard
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
workerGroupSpecs:
    # The number of worker pod replicas in this group.
- replicas: 1
minReplicas: 0
maxReplicas: 5
    # Logical group name; here it is called small-group, but any descriptive name can be used.
groupName: small-group
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4, \"accelerator_type_3090\": 1}"'
#pod template
template:
spec:
volumes:
- name: model-data
hostPath:
path: /NFS/125_bakup/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
- name: dshm
emptyDir:
medium: Memory
sizeLimit: "5.24Gi"
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
#image: rayproject/ray-ml:2.10.0-py310-zip
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "4"
memory: "16Gi"
nvidia.com/gpu: 1
requests:
cpu: "2"
memory: "2Gi"
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
- mountPath: /dev/shm
name: dshm
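
Once the RayService reports healthy, the Serve proxy listens on the serve port (8000) declared on the head pod. A minimal client sketch; the 127.0.0.1:8000 address assumes something like a kubectl port-forward to the head service, which this manifest does not create:

# Hypothetical client call (not part of this PR): exercise the OpenAI-compatible chat endpoint.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "/data/models/qwen1.5-7b-chat",  # must match the served model name in llm-serving.py
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])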
7 changes: 0 additions & 7 deletions deploy/ray/Dockerfile

This file was deleted.
