Merge pull request #1020 from nkwangleiGIT/main
Add ray-serve-vllm deployment sample
Showing 10 changed files with 278 additions and 7 deletions.
4 files renamed without changes.
@@ -0,0 +1,5 @@
# The base image is built from Dockerfile.vllm.ray
FROM vllm/vllm-openai:ray-2.11.0-py3.10.12-patched

# Copy the packaged Python application
COPY llm-serving-app.zip /vllm-workspace/
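A minimal sketch of how llm-serving-app.zip might be packaged before this image is built; the local source directory name is an assumption, not something shown in the commit.

# Sketch: package the Serve app and its templates/ folder into llm-serving-app.zip.
# "llm-serving-app" as the source directory is an assumed layout; adjust as needed.
import shutil

shutil.make_archive("llm-serving-app", "zip", root_dir="llm-serving-app")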
@@ -0,0 +1,10 @@
# Use vllm 0.4.1; ray will be 2.11.0 for now
# python version is py3.10.12
FROM vllm/vllm-openai:v0.4.1

# curl/wget for health checks, and ray[default] for the default Ray packages
RUN apt-get install -y curl wget && pip install 'ray[default]' -i https://pypi.mirrors.ustc.edu.cn/simple/

# Patch for vllm; can be removed once https://github.com/vllm-project/vllm/issues/2683 is fixed
COPY vllm-patched/serving_chat.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_chat.py
COPY vllm-patched/serving_engine.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_engine.py
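A sketch of a possible build sequence: first the patched base image from Dockerfile.vllm.ray above, then the app image with the tag referenced by the RayService manifest below. The filename of the app Dockerfile is not shown in this diff, so the second build assumes it is the default Dockerfile in the build context.

# Sketch: build the two images in order; the tags match those used elsewhere in this sample.
import subprocess

subprocess.run(
    ["docker", "build", "-f", "Dockerfile.vllm.ray",
     "-t", "vllm/vllm-openai:ray-2.11.0-py3.10.12-patched", "."],
    check=True,
)
# Assumes the app Dockerfile (the one that copies llm-serving-app.zip) is ./Dockerfile.
subprocess.run(
    ["docker", "build",
     "-t", "vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app", "."],
    check=True,
)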
@@ -0,0 +1,141 @@
import json
import logging
from typing import AsyncGenerator

import ray
import fastapi
# from huggingface_hub import login
from ray import serve

from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext


TIMEOUT_KEEP_ALIVE = 5  # seconds

logger = logging.getLogger("ray.serve")

app = fastapi.FastAPI()


# Modified based on https://github.com/vllm-project/vllm/blob/v0.4.1/vllm/entrypoints/openai/api_server.py
@serve.deployment(num_replicas=1)
@serve.ingress(app)
class VLLMPredictDeployment:
    def __init__(self, **kwargs):
        """
        Construct a vLLM deployment.

        Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
        for the full list of arguments.

        Args:
            model: name or path of the Hugging Face model to use.
            download_dir: directory to download and load the weights;
                defaults to the Hugging Face cache dir.
            use_np_weights: save a numpy copy of the model weights for
                faster loading. This can increase disk usage by up to 2x.
            use_dummy_weights: use dummy values for model weights.
            dtype: data type for model weights and activations.
                The "auto" option uses FP16 precision for FP32 and FP16
                models, and BF16 precision for BF16 models.
            seed: random seed.
            worker_use_ray: use Ray for distributed serving; set
                automatically when using more than 1 GPU.
            pipeline_parallel_size: number of pipeline stages.
            tensor_parallel_size: number of tensor parallel replicas.
            block_size: token block size.
            swap_space: CPU swap space size (GiB) per GPU.
            gpu_memory_utilization: the fraction of GPU memory to be used
                for the model executor.
            max_num_batched_tokens: maximum number of batched tokens per iteration.
            max_num_seqs: maximum number of sequences per iteration.
            disable_log_stats: disable logging statistics.
            engine_use_ray: use Ray to start the LLM engine in a process
                separate from the server process.
            disable_log_requests: disable logging requests.
        """
        # Hard-coded settings for this sample: single-GPU tensor parallelism and a
        # locally mounted Qwen1.5-7B-Chat model.
        kwargs = {**kwargs,
                  'tensor_parallel_size': 1,
                  'gpu_memory_utilization': 0.9,
                  'model': '/data/models/qwen1.5-7b-chat',
                  'trust_remote_code': 'true',
                  'worker_use_ray': 'true',
                  'max_model_len': 6000}

        logger.info(f"vLLM API server version {vllm.__version__}")
        logger.info(f"kwargs: {kwargs}")

        args = AsyncEngineArgs(**kwargs)
        logger.info(f"args: {args}")
        served_model = args.model
        engine_args = AsyncEngineArgs.from_cli_args(args)
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
        args.response_role = ""
        args.lora_modules = ""
        args.chat_template = "./templates/chat-template-qwen.jinja2"
        self.openai_serving_chat = OpenAIServingChat(engine, served_model,
                                                     args.response_role,
                                                     args.lora_modules,
                                                     args.chat_template)
        self.openai_serving_completion = OpenAIServingCompletion(
            engine, served_model, args.lora_modules)

    @app.get("/health")
    async def health(self) -> Response:
        """Health check."""
        await self.openai_serving_chat.engine.check_health()
        return Response(status_code=200)

    @app.get("/v1/models")
    async def show_available_models(self):
        models = await self.openai_serving_chat.show_available_models()
        return JSONResponse(content=models.model_dump())

    @app.get("/version")
    async def show_version(self):
        ver = {"version": vllm.__version__}
        return JSONResponse(content=ver)

    @app.post("/v1/chat/completions")
    async def create_chat_completion(self, request: ChatCompletionRequest,
                                     raw_request: Request):
        generator = await self.openai_serving_chat.create_chat_completion(
            request, raw_request)
        if isinstance(generator, ErrorResponse):
            return JSONResponse(content=generator.model_dump(),
                                status_code=generator.code)
        if request.stream:
            return StreamingResponse(content=generator,
                                     media_type="text/event-stream")
        else:
            return JSONResponse(content=generator.model_dump())

    @app.post("/v1/completions")
    async def create_completion(self, request: CompletionRequest,
                                raw_request: Request):
        generator = await self.openai_serving_completion.create_completion(
            request, raw_request)
        if isinstance(generator, ErrorResponse):
            return JSONResponse(content=generator.model_dump(),
                                status_code=generator.code)
        if request.stream:
            return StreamingResponse(content=generator,
                                     media_type="text/event-stream")
        else:
            return JSONResponse(content=generator.model_dump())


deployment = VLLMPredictDeployment.bind()
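A minimal local smoke-test sketch for the application above, assuming a GPU machine with the model available at the configured path and the module importable (the module name used here is hypothetical; `serve run <module>:deployment` from the Ray Serve CLI would do the same).

# Sketch: start the bound application locally and probe the health and models endpoints.
import requests
from ray import serve

from llm_serving import deployment  # hypothetical module name for the file above

serve.run(deployment)  # Serve listens on http://127.0.0.1:8000 by default

print(requests.get("http://127.0.0.1:8000/health", timeout=30).status_code)
print(requests.get("http://127.0.0.1:8000/v1/models", timeout=30).json())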
deploy/ray-serve-vllm/llm-serving-app/templates/chat-template-qwen.jinja2 (4 additions & 0 deletions)
@@ -0,0 +1,4 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}
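A quick sketch of the prompt this template produces, rendering it directly with Jinja2; the relative path assumes it is run from the llm-serving-app directory.

# Sketch: render the Qwen chat template for a single user message.
from jinja2 import Template

with open("templates/chat-template-qwen.jinja2") as f:
    template = Template(f.read())

prompt = template.render(
    messages=[{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
)
print(prompt)
# Produces roughly:
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant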
@@ -0,0 +1,118 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: rayservice-sample
  namespace: kuberay-system
spec:
  # serveConfigV2 takes a YAML multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: llm-serving-app
        import_path: llm-serving:deployment
        route_prefix: /
        runtime_env:
          working_dir: FILE:///vllm-workspace/llm-app.zip
        deployments:
          - name: VLLMPredictDeployment
            num_replicas: 1
  rayClusterConfig:
    rayVersion: '2.11.0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # Ray head pod template.
    headGroupSpec:
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams:
        resources: '"{\"accelerator_type_cpu\": 4}"'
        dashboard-host: '0.0.0.0'
      # Pod template
      template:
        spec:
          volumes:
            # Mount the model from a hostPath
            - name: model-data
              hostPath:
                path: /data/models
                type: Directory
            - name: tz-config
              hostPath:
                path: /etc/localtime
          containers:
            - name: ray-head
              image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
              #image: rayproject/ray-ml:2.10.0-py310-zip
              resources:
                limits:
                  cpu: 4
                  memory: 16Gi
                requests:
                  cpu: 2
                  memory: 2Gi
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /data/models
                  name: model-data
                - name: tz-config
                  mountPath: /etc/localtime
    workerGroupSpecs:
      # The pod replicas in this worker group
      - replicas: 1
        minReplicas: 0
        maxReplicas: 5
        # Logical group name; here it is called small-group, but it can also be functional
        groupName: small-group
        # The `rayStartParams` are used to configure the `ray start` command.
        # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
        # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
        rayStartParams:
          resources: '"{\"accelerator_type_cpu\": 4, \"accelerator_type_3090\": 1}"'
        # Pod template
        template:
          spec:
            volumes:
              - name: model-data
                hostPath:
                  path: /NFS/125_bakup/models
                  type: Directory
              - name: tz-config
                hostPath:
                  path: /etc/localtime
              - name: dshm
                emptyDir:
                  medium: Memory
                  sizeLimit: "5.24Gi"
            containers:
              - name: ray-worker # must consist of lower-case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
                #image: rayproject/ray-ml:2.10.0-py310-zip
                image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
                    cpu: "4"
                    memory: "16Gi"
                    nvidia.com/gpu: 1
                  requests:
                    cpu: "2"
                    memory: "2Gi"
                    nvidia.com/gpu: 1
                volumeMounts:
                  - mountPath: /data/models
                    name: model-data
                  - name: tz-config
                    mountPath: /etc/localtime
                  - mountPath: /dev/shm
                    name: dshm
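Once the RayService is healthy, the OpenAI-compatible endpoint can be exercised through the Serve port (8000), for example after forwarding it locally with kubectl port-forward. A minimal sketch; the host, port, and forwarding method are assumptions.

# Sketch: send a chat completion request to the deployed sample.
import requests

BASE = "http://127.0.0.1:8000"  # placeholder; point this at the forwarded Serve port

payload = {
    "model": "/data/models/qwen1.5-7b-chat",  # model path configured in the deployment
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "stream": False,
}
resp = requests.post(f"{BASE}/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])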
This file was deleted.