Commit 22bfb03: Merge branch 'vllm-project:main' into cpu-only

adityagoel14 authored Jul 11, 2024
2 parents: f17286c + 2d23b42
Showing 112 changed files with 4,488 additions and 1,538 deletions.
35 changes: 17 additions & 18 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -11,7 +11,7 @@ steps:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100 Benchmark"
- label: "A100"
agents:
queue: A100
plugins:
@@ -42,21 +42,20 @@ steps:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100: NVIDIA SMI"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# propagate-uid-gid: false
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN
- label: "H100"
agents:
queue: H100
plugins:
- docker#v5.11.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

28 changes: 23 additions & 5 deletions .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -54,7 +54,7 @@ wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl localhost:8000/v1/completions; do
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
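The readiness probe now issues a POST instead of a bare GET, since /v1/completions is a POST-only route. A rough Python equivalent of this loop, as a hedged sketch under the same localhost:8000 and 1200-second assumptions (not part of the commit):

import time
import urllib.error
import urllib.request

def wait_for_server(url: str = "http://localhost:8000/v1/completions",
                    timeout_s: int = 1200) -> bool:
    # Poll with POST until the server answers; any HTTP response, even a
    # 4xx for the empty body, means the server is up, mirroring curl's
    # exit-status behaviour in the script above.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        req = urllib.request.Request(url, data=b"{}", method="POST")
        try:
            urllib.request.urlopen(req, timeout=5)
            return True
        except urllib.error.HTTPError:
            return True
        except (urllib.error.URLError, OSError):
            time.sleep(1)
    return False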
@@ -73,8 +73,17 @@ kill_gpu_processes() {
echo "All GPU processes have been killed."
fi

# Sometimes killing by PID does not work properly, so we also kill every process running python or python3,
# since we are inside a container anyway.
pkill -9 -f python
pkill -9 -f python3

# waiting for GPU processes to be fully killed
sleep 10
# loop while nvidia-smi returns any processes
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
sleep 1
echo "Waiting for GPU processes to be killed"
done

# remove vllm config file
rm -rf ~/.config/vllm
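Waiting until nvidia-smi lists no compute processes, instead of relying only on a fixed sleep, is the pattern this hunk introduces. A rough Python rendering of the same polling idea (the helper name is illustrative, not from the script):

import subprocess
import time

def wait_for_gpu_processes_to_exit(poll_s: float = 1.0) -> None:
    # Keep polling nvidia-smi until it reports no compute apps, matching
    # the `while [ -n "$(nvidia-smi ...)" ]` loop above.
    while True:
        out = subprocess.run(
            ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"],
            capture_output=True, text=True, check=True).stdout.strip()
        if not out:
            return
        print("Waiting for GPU processes to be killed")
        time.sleep(poll_s)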
@@ -90,12 +99,19 @@ upload_to_buildkite() {
# upload the benchmarking results to buildkite

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
if command -v buildkite-agent >/dev/null 2>&1; then
BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -f /workspace/buildkite-agent ]; then
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"

# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
server_pid=$!

# wait until the server is alive
wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
done

# clean up
kill -9 $server_pid
kill_gpu_processes
done
}
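Earlier in this file, upload_to_buildkite now prefers a buildkite-agent found on PATH and only falls back to the fixed /workspace/buildkite-agent path, skipping the upload when neither exists. The same lookup pattern in Python, as a hedged sketch (the function name and the None return convention are illustrative):

import os
import shutil

def find_buildkite_agent():
    # Prefer an agent binary on PATH, then the fixed /workspace location;
    # return None so the caller can skip uploading results.
    agent = shutil.which("buildkite-agent")
    if agent:
        return agent
    if os.path.isfile("/workspace/buildkite-agent"):
        return "/workspace/buildkite-agent"
    return None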
77 changes: 77 additions & 0 deletions .buildkite/run-multi-node-test.sh
@@ -0,0 +1,77 @@
#!/bin/bash

set -euox pipefail

if [[ $# -lt 3 ]]; then
echo "Please provide the number of nodes and GPU per node."
exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3

shift 3
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done

start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}

start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
done
}

run_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -lt $(($NUM_NODES - 1)) ]; then
docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes
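start_nodes and run_nodes both hand node i the contiguous GPU block from i*NUM_GPUS up to (i+1)*NUM_GPUS-1, rendered as the quoted device string passed to `docker run --gpus`. A small Python check of that arithmetic (illustrative only, not part of the script):

def gpu_devices(node: int, num_gpus: int) -> str:
    # Mirrors the GPU_DEVICES string built in start_nodes/run_nodes,
    # e.g. node 1 with 4 GPUs per node maps to devices 4-7.
    ids = range(node * num_gpus, (node + 1) * num_gpus)
    return '"device=' + ",".join(str(i) for i in ids) + '"'

assert gpu_devices(0, 2) == '"device=0,1"'
assert gpu_devices(1, 4) == '"device=4,5,6,7"'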

18 changes: 10 additions & 8 deletions .github/workflows/mypy.yaml
@@ -32,20 +32,22 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
mypy tests --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/inputs --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/platforms --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy tests --config-file pyproject.toml
4 changes: 2 additions & 2 deletions README.md
@@ -56,10 +56,10 @@ vLLM is flexible and easy to use with:

- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support

18 changes: 9 additions & 9 deletions benchmarks/backend_request_func.py
@@ -390,17 +390,17 @@ def remove_prefix(text: str, prefix: str) -> str:
return text


def get_model(pretrained_model_name_or_path: str):
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
else:
from huggingface_hub import snapshot_download

model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path

model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

return model_path
return pretrained_model_name_or_path


def get_tokenizer(
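With this change, get_model only performs a snapshot download when VLLM_USE_MODELSCOPE is enabled; otherwise it returns the model name untouched and leaves resolution to the Hugging Face loader. A hedged usage sketch, assuming get_model is importable from benchmarks/backend_request_func.py and using an example model name:

import os

from backend_request_func import get_model  # import path assumed

# With ModelScope disabled, the helper is now a pass-through.
os.environ["VLLM_USE_MODELSCOPE"] = "False"
assert get_model("facebook/opt-125m") == "facebook/opt-125m"

# With VLLM_USE_MODELSCOPE=true it would instead return the local path
# produced by modelscope.snapshot_download (network access required).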
10 changes: 6 additions & 4 deletions benchmarks/kernels/benchmark_marlin.py
@@ -5,14 +5,16 @@
from benchmark_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import FlexibleArgumentParser
1 change: 1 addition & 0 deletions docs/source/_templates/sections/header.html
@@ -5,6 +5,7 @@
justify-content: center;
align-items: center;
font-size: 16px;
padding: 0 6px 0 6px;
}
.notification-bar p {
margin: 0;
17 changes: 17 additions & 0 deletions docs/source/dev/multimodal/adding_multimodal_plugin.rst
@@ -0,0 +1,17 @@
.. _adding_multimodal_plugin:

Adding a Multimodal Plugin
==========================

This document teaches you how to add a new modality to vLLM.

Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.

The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.

.. note::
This article is a work in progress.

..
TODO: Add more instructions on how to add new plugins once embeddings is in.
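The page names the two pieces a new modality needs: a MultiModalPlugin subclass and a call to MultiModalRegistry.register_plugin. A minimal, hedged Python sketch of that flow; the modality name, import path, and method shown are assumptions for illustration, not taken from the page:

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalPlugin  # import path assumed


class AudioPlugin(MultiModalPlugin):
    """Hypothetical plugin registering an 'audio' modality."""

    def get_data_key(self) -> str:
        # Key expected under multi_modal_data; the method name is an
        # assumption about the MultiModalPlugin interface.
        return "audio"
    # NOTE: the real base class likely requires further methods
    # (e.g. an input mapper); consult the class reference.


# Make vLLM aware of the new modality, as described above.
MULTIMODAL_REGISTRY.register_plugin(AudioPlugin())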
24 changes: 16 additions & 8 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -7,17 +7,21 @@ Multi-Modality

vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.

.. note::
``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
by following :ref:`this guide <adding_multimodal_plugin>`.

To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.
Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.

..
TODO: Add more instructions on how to add new plugins once embeddings is in.
Guides
++++++

.. toctree::
:maxdepth: 1

adding_multimodal_plugin

Module Contents
+++++++++++++++
@@ -36,10 +40,14 @@ Registry
Base Classes
------------

.. autoclass:: vllm.multimodal.MultiModalDataDict
.. autodata:: vllm.multimodal.BatchedTensors

.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:

.. autodata:: vllm.multimodal.MultiModalDataDict

.. autoclass:: vllm.multimodal.MultiModalInputs
:members:
:show-inheritance:
2 changes: 1 addition & 1 deletion docs/source/getting_started/cpu-installation.rst
@@ -20,7 +20,7 @@ Requirements

* OS: Linux
* Compiler: gcc/g++>=12.3.0 (optional, recommended)
* Instruction set architecture (ISA) requirement: AVX512 is required.
* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)

.. _cpu_backend_quick_start_dockerfile:

2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:

* Seamless integration with popular HuggingFace models
* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
* Tensor parallelism support for distributed inference
* Tensor parallelism and pipeline parallelism support for distributed inference
* Streaming outputs
* OpenAI-compatible API server
* Support NVIDIA GPUs and AMD GPUs