Skip to content

Commit

Permalink
[ci] try to add multi-node tests (vllm-project#6280)
Browse files Browse the repository at this point in the history
Signed-off-by: Muralidhar Andoorveedu <[email protected]>
Co-authored-by: Muralidhar Andoorveedu <[email protected]>
  • Loading branch information
2 people authored and dtrifiro committed Jul 17, 2024
1 parent 3b406fe commit 3bb03de
Show file tree
Hide file tree
Showing 13 changed files with 230 additions and 275 deletions.
52 changes: 40 additions & 12 deletions .buildkite/run-multi-node-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@

set -euox pipefail

if [[ $# -lt 3 ]]; then
echo "Please provide the number of nodes and GPU per node."
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4

shift 3
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
Expand Down Expand Up @@ -40,13 +41,40 @@ start_nodes() {
fi
done
GPU_DEVICES+='"'
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null

# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"

# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done

# wait for the cluster to be ready
sleep 10

# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}

run_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
Expand All @@ -57,10 +85,10 @@ run_nodes() {
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -lt $(($NUM_NODES - 1)) ]; then
docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
Expand Down
16 changes: 15 additions & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@ steps:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
Expand Down Expand Up @@ -213,7 +224,10 @@ steps:

- label: Tensorizer Test
#mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
commands:
- apt-get install curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader

- label: Metrics Test
mirror_hardwares: [amd]
Expand Down
37 changes: 14 additions & 23 deletions tests/async_engine/test_openapi_server_ray.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,26 @@
import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray

from ..utils import VLLM_PATH, RemoteOpenAIServer
from ..utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()


@pytest.fixture(scope="module")
def server(ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--enforce-eager",
"--engine-use-ray"
])
def server():
with RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--enforce-eager",
"--engine-use-ray"
]) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
Expand Down
17 changes: 4 additions & 13 deletions tests/distributed/test_pipeline_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@

import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray

from ..utils import VLLM_PATH, RemoteOpenAIServer
from ..utils import RemoteOpenAIServer

# downloading lora to test lora requests

Expand All @@ -21,14 +18,7 @@


@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()


@pytest.fixture(scope="module")
def server(ray_ctx):
def server():
args = [
"--model",
MODEL_NAME,
Expand All @@ -50,7 +40,8 @@ def server(ray_ctx):
args += [
"--enforce-eager",
]
return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
with RemoteOpenAIServer(args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
Expand Down
1 change: 1 addition & 0 deletions tests/distributed/test_same_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
57 changes: 24 additions & 33 deletions tests/entrypoints/openai/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,12 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
import torch
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

from ...utils import VLLM_PATH, RemoteOpenAIServer
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
Expand All @@ -29,35 +26,29 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()


@pytest.fixture(scope="module")
def server(zephyr_lora_files, ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
])
def server(zephyr_lora_files):
with RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
]) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
Expand Down
57 changes: 24 additions & 33 deletions tests/entrypoints/openai/test_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,14 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import VLLM_PATH, RemoteOpenAIServer
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
Expand All @@ -31,35 +28,29 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()


@pytest.fixture(scope="module")
def server(zephyr_lora_files, ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
])
def server(zephyr_lora_files):
with RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
]) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
Expand Down
Loading

0 comments on commit 3bb03de

Please sign in to comment.