diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a4eca078568fd..87d08c8c7fdcb 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference.py" + python3 examples/offline_inference/offline_inference.py" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 4fc6d089cc666..1e5ff77895a38 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference.py + python3 examples/offline_inference/offline_inference.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index fa4f74fca7a11..a50570ab53438 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index aa29c434e7cfb..52d485939b1d0 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6b12f424fd828..380f7a44a429a 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 770dad6ffa3a1..13605a3e97142 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. 
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index e0a12afbe7320..160e10aa3bb9b 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference.py - python3 examples/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/offline_inference_cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dcfe228ce8eae..b7178b94f481a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,19 +187,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference.py - - python3 cpu_offload.py - - python3 offline_inference_chat.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_classification.py - - python3 offline_inference_embedding.py - - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/offline_inference.py + - python3 offline_inference/cpu_offload.py + - python3 offline_inference/offline_inference_chat.py + - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/offline_inference_vision_language.py + - python3 offline_inference/offline_inference_vision_language_multi_image.py + - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/offline_inference_encoder_decoder.py + - python3 
offline_inference/offline_inference_classification.py + - python3 offline_inference/offline_inference_embedding.py + - python3 offline_inference/offline_inference_scoring.py + - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d2060d..ee768db63c96c 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -27,7 +27,7 @@ jobs: version: v3.10.1 - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm + run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - name: Setup minio run: | @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | diff --git a/Dockerfile b/Dockerfile index 088314eb38dbe..808cf675acf4d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/sagemaker-entrypoint.sh . +COPY examples/online_serving/sagemaker-entrypoint.sh . 
RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 46210957c19ec..97de40ff469f1 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index f02a943026922..657e7f2bc72cc 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -61,7 +61,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -321,7 +321,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 645dc60807dd3..efa2efc66192e 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. ## Benchmarks diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index cf06916d70f44..b00d05147bb32 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
## Serving LoRA Adapters diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index c02fbf0605a8c..3679595e3d4d0 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index f200c722d1d42..50edaf81fddd3 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -28,7 +28,7 @@ Here is an example of how to enable this feature: ```python # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. +# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own. from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=1.3, top_p=0.8) diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 26c09bb0d8a0c..ccd9a6a1b1a14 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +Full example: ## Experimental Automatic Parsing (OpenAI API) @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 32bb86c469c78..aaa13d0fb6d3f 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -12,6 +12,7 @@ def fix_case(text: str) -> str: subs = { "api": "API", + "Cli": "CLI", "cpu": "CPU", "llm": "LLM", "tpu": "TPU", @@ -58,7 +59,7 @@ def generate(self) -> str: content = f"# {self.title}\n\n{self.description}\n\n" content += "```{toctree}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(sorted(self.documents)) + "\n```\n" + content += "\n".join(self.documents) + "\n```\n" return content @@ -131,11 +132,14 @@ def generate(self) -> str: ROOT_DIR) content = f"Source .\n\n" - if self.main_file.suffix == ".py": - content += f"# {self.title}\n\n" include = "include" if self.main_file.suffix == ".md" else \ "literalinclude" - content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n" + if include == "literalinclude": + content += f"# {self.title}\n\n" + content += f":::{{{include}}} {make_relative(self.main_file)}\n" + if include == "literalinclude": + content += f":language: {self.main_file.suffix[1:]}\n" + content += ":::\n\n" if not self.other_files: return content @@ -163,14 +167,16 @@ def generate_examples(): description= "A collection of examples demonstrating usage of vLLM.\nAll documented examples are 
autogenerated using from examples found in .", # noqa: E501 caption="Examples", - maxdepth=1) # TODO change to 2 when examples start being categorised + maxdepth=2) + # Category indices stored in reverse order because they are inserted into + # examples_index.documents at index 0 in order category_indices = { - "offline_inference": + "other": Index( - path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", - title="Offline Inference", + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", description= - "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 caption="Examples", ), "online_serving": @@ -181,31 +187,30 @@ def generate_examples(): "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 caption="Examples", ), - "other": + "offline_inference": Index( - path=EXAMPLE_DOC_DIR / "examples_other_index.md", - title="Other", + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", description= - "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 caption="Examples", ), } examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] # Find categorised examples for category in category_indices: category_dir = EXAMPLE_DIR / category - py = category_dir.glob("*.py") - md = category_dir.glob("*.md") - for path in itertools.chain(py, md): + globs = [category_dir.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path, category)) # Find examples in subdirectories for path in category_dir.glob("*/*.md"): examples.append(Example(path.parent, category)) # Find uncategorised examples - py = EXAMPLE_DIR.glob("*.py") - md = EXAMPLE_DIR.glob("*.md") - for path in itertools.chain(py, md): + globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path)) # Find examples in subdirectories for path in EXAMPLE_DIR.glob("*/*.md"): @@ -215,7 +220,7 @@ def generate_examples(): examples.append(Example(path.parent)) # Generate the example documentation - for example in examples: + for example in sorted(examples, key=lambda e: e.path.stem): doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: f.write(example.generate()) diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bbb2d1872ef39..bb046dd0fd9dc 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ find / -name *libtcmalloc* # find the dynamic link library path $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference.py # run vLLM +$ python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving 
framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference.py +$ python examples/offline_inference/offline_inference.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index be4e3b9bd1bc5..c1ab5478eb652 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ $ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 3f9556165ece4..6b56918ce5638 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 5a0310da0f2cb..f5efe0bef7506 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index 42ed5c795dd27..ae17e3437bca6 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). ```{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 383299d61b5dd..6228c7c2ac957 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 12ded68eb30b5..3e4407cfdc233 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -65,7 +65,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +80,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,7 +102,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Inference diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index b1703249d7224..4e0a9ef6ecf7d 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. 
It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0efa09f2869ca..9f5e1b908d786 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. @@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. 
``` -Full example: +Full example: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1e5ea6357d202..022dd3ae8a237 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`. Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters @@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters @@ -255,7 +255,7 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters @@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= ### Score API @@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -Code example: +Code example: #### Single inference diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py similarity index 100% rename from examples/aqlm_example.py rename to examples/offline_inference/aqlm_example.py diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py similarity index 100% rename from examples/cpu_offload.py rename to examples/offline_inference/cpu_offload.py diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py similarity index 92% rename from examples/florence2_inference.py rename to examples/offline_inference/florence2_inference.py index b58ac2e1f7ed4..49dd2c331db5a 100644 --- a/examples/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,8 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference_vision_language.py after porting vision backbone +# Move to offline_inference/offline_inference_vision_language.py +# after porting vision backbone from vllm import LLM, SamplingParams dtype = "float" diff --git a/examples/gguf_inference.py b/examples/offline_inference/gguf_inference.py similarity index 100% rename from examples/gguf_inference.py rename to examples/offline_inference/gguf_inference.py diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py similarity index 100% rename from examples/llm_engine_example.py rename to examples/offline_inference/llm_engine_example.py diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py similarity index 100% rename from examples/lora_with_quantization_inference.py rename to 
examples/offline_inference/lora_with_quantization_inference.py diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py similarity index 100% rename from examples/multilora_inference.py rename to examples/offline_inference/multilora_inference.py diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/offline_chat_with_tools.py similarity index 100% rename from examples/offline_chat_with_tools.py rename to examples/offline_inference/offline_chat_with_tools.py diff --git a/examples/offline_inference.py b/examples/offline_inference/offline_inference.py similarity index 100% rename from examples/offline_inference.py rename to examples/offline_inference/offline_inference.py diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/offline_inference_arctic.py similarity index 100% rename from examples/offline_inference_arctic.py rename to examples/offline_inference/offline_inference_arctic.py diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/offline_inference_audio_language.py similarity index 100% rename from examples/offline_inference_audio_language.py rename to examples/offline_inference/offline_inference_audio_language.py diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/offline_inference_chat.py similarity index 100% rename from examples/offline_inference_chat.py rename to examples/offline_inference/offline_inference_chat.py diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/offline_inference_classification.py similarity index 100% rename from examples/offline_inference_classification.py rename to examples/offline_inference/offline_inference_classification.py diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/offline_inference_cli.py similarity index 100% rename from examples/offline_inference_cli.py rename to examples/offline_inference/offline_inference_cli.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/offline_inference_distributed.py similarity index 100% rename from examples/offline_inference_distributed.py rename to examples/offline_inference/offline_inference_distributed.py diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/offline_inference_embedding.py similarity index 100% rename from examples/offline_inference_embedding.py rename to examples/offline_inference/offline_inference_embedding.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/offline_inference_encoder_decoder.py similarity index 100% rename from examples/offline_inference_encoder_decoder.py rename to examples/offline_inference/offline_inference_encoder_decoder.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/offline_inference_mlpspeculator.py similarity index 100% rename from examples/offline_inference_mlpspeculator.py rename to examples/offline_inference/offline_inference_mlpspeculator.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/offline_inference_neuron.py similarity index 100% rename from examples/offline_inference_neuron.py rename to examples/offline_inference/offline_inference_neuron.py diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/offline_inference_neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference_neuron_int8_quantization.py rename to 
examples/offline_inference/offline_inference_neuron_int8_quantization.py diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md similarity index 90% rename from examples/offline_inference_openai.md rename to examples/offline_inference/offline_inference_openai/offline_inference_openai.md index 2436417cb543a..6278a1943fe4a 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
``` -aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl similarity index 100% rename from examples/openai_example_batch.jsonl rename to examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/offline_inference_pixtral.py similarity index 100% rename from examples/offline_inference_pixtral.py rename to examples/offline_inference/offline_inference_pixtral.py diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/offline_inference_scoring.py similarity index 100% rename from examples/offline_inference_scoring.py rename to examples/offline_inference/offline_inference_scoring.py diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/offline_inference_structured_outputs.py similarity index 100% rename from examples/offline_inference_structured_outputs.py rename to examples/offline_inference/offline_inference_structured_outputs.py diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/offline_inference_tpu.py similarity index 100% rename from examples/offline_inference_tpu.py rename to examples/offline_inference/offline_inference_tpu.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/offline_inference_vision_language.py similarity index 100% rename from examples/offline_inference_vision_language.py rename to examples/offline_inference/offline_inference_vision_language.py diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/offline_inference_vision_language_embedding.py similarity index 100% rename from examples/offline_inference_vision_language_embedding.py rename to examples/offline_inference/offline_inference_vision_language_embedding.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/offline_inference_vision_language_multi_image.py similarity index 100% rename from examples/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/offline_inference_vision_language_multi_image.py diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/offline_inference_whisper.py similarity index 100% rename from examples/offline_inference_whisper.py rename to examples/offline_inference/offline_inference_whisper.py diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/offline_inference_with_default_generation_config.py similarity index 100% rename from examples/offline_inference_with_default_generation_config.py rename to examples/offline_inference/offline_inference_with_default_generation_config.py diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/offline_inference_with_prefix.py similarity index 100% rename from examples/offline_inference_with_prefix.py rename to examples/offline_inference/offline_inference_with_prefix.py diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/offline_inference_with_profiler.py similarity index 100% rename from examples/offline_inference_with_profiler.py rename to 
examples/offline_inference/offline_inference_with_profiler.py diff --git a/examples/offline_profile.py b/examples/offline_inference/offline_profile.py similarity index 99% rename from examples/offline_profile.py rename to examples/offline_inference/offline_profile.py index 46afe8aa2604b..187a05e4d70a2 100644 --- a/examples/offline_profile.py +++ b/examples/offline_inference/offline_profile.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_profile.py \\ + python examples/offline_inference/offline_profile.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py similarity index 100% rename from examples/save_sharded_state.py rename to examples/offline_inference/save_sharded_state.py diff --git a/examples/api_client.py b/examples/online_serving/api_client.py similarity index 100% rename from examples/api_client.py rename to examples/online_serving/api_client.py diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore similarity index 100% rename from examples/chart-helm/.helmignore rename to examples/online_serving/chart-helm/.helmignore diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml similarity index 100% rename from examples/chart-helm/Chart.yaml rename to examples/online_serving/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md new file mode 100644 index 0000000000000..6aa126d4fd22c --- /dev/null +++ b/examples/online_serving/chart-helm/README.md @@ -0,0 +1,21 @@ +# Helm Charts + +This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. + +## Files + +- Chart.yaml: Defines the chart metadata including name, version, and maintainers. +- ct.yaml: Configuration for chart testing. +- lintconf.yaml: Linting rules for YAML files. +- values.schema.json: JSON schema for validating values.yaml. +- values.yaml: Default values for the Helm chart. +- templates/_helpers.tpl: Helper templates for defining common configurations. +- templates/configmap.yaml: Template for creating ConfigMaps. +- templates/custom-objects.yaml: Template for custom Kubernetes objects. +- templates/deployment.yaml: Template for creating Deployments. +- templates/hpa.yaml: Template for Horizontal Pod Autoscaler. +- templates/job.yaml: Template for Kubernetes Jobs. +- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. +- templates/pvc.yaml: Template for Persistent Volume Claims. +- templates/secrets.yaml: Template for Kubernetes Secrets. +- templates/service.yaml: Template for creating Services. 
\ No newline at end of file diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml similarity index 100% rename from examples/chart-helm/ct.yaml rename to examples/online_serving/chart-helm/ct.yaml diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml similarity index 100% rename from examples/chart-helm/lintconf.yaml rename to examples/online_serving/chart-helm/lintconf.yaml diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/chart-helm/templates/_helpers.tpl rename to examples/online_serving/chart-helm/templates/_helpers.tpl diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/chart-helm/templates/configmap.yaml rename to examples/online_serving/chart-helm/templates/configmap.yaml diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/chart-helm/templates/custom-objects.yaml rename to examples/online_serving/chart-helm/templates/custom-objects.yaml diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/chart-helm/templates/deployment.yaml rename to examples/online_serving/chart-helm/templates/deployment.yaml diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/chart-helm/templates/hpa.yaml rename to examples/online_serving/chart-helm/templates/hpa.yaml diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml similarity index 100% rename from examples/chart-helm/templates/job.yaml rename to examples/online_serving/chart-helm/templates/job.yaml diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/chart-helm/templates/poddisruptionbudget.yaml rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/chart-helm/templates/pvc.yaml rename to examples/online_serving/chart-helm/templates/pvc.yaml diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/chart-helm/templates/secrets.yaml rename to examples/online_serving/chart-helm/templates/secrets.yaml diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml similarity index 100% rename from examples/chart-helm/templates/service.yaml rename to examples/online_serving/chart-helm/templates/service.yaml diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json similarity index 100% rename from examples/chart-helm/values.schema.json rename to examples/online_serving/chart-helm/values.schema.json diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml similarity index 100% rename from examples/chart-helm/values.yaml rename to 
examples/online_serving/chart-helm/values.yaml diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh similarity index 100% rename from examples/disaggregated_prefill.sh rename to examples/online_serving/disaggregated_prefill.sh diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py similarity index 100% rename from examples/gradio_openai_chatbot_webserver.py rename to examples/online_serving/gradio_openai_chatbot_webserver.py diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py similarity index 100% rename from examples/gradio_webserver.py rename to examples/online_serving/gradio_webserver.py diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py similarity index 100% rename from examples/openai_chat_completion_client.py rename to examples/online_serving/openai_chat_completion_client.py diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_completion_client_for_multimodal.py rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py similarity index 100% rename from examples/openai_chat_completion_client_with_tools.py rename to examples/online_serving/openai_chat_completion_client_with_tools.py diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py similarity index 100% rename from examples/openai_chat_completion_structured_outputs.py rename to examples/online_serving/openai_chat_completion_structured_outputs.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py similarity index 100% rename from examples/openai_completion_client.py rename to examples/online_serving/openai_completion_client.py diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py similarity index 100% rename from examples/openai_cross_encoder_score.py rename to examples/online_serving/openai_cross_encoder_score.py diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py similarity index 100% rename from examples/openai_embedding_client.py rename to examples/online_serving/openai_embedding_client.py diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py similarity index 100% rename from examples/openai_pooling_client.py rename to examples/online_serving/openai_pooling_client.py diff --git a/examples/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/Otel.md similarity index 100% rename from examples/opentelemetry/Otel.md rename to examples/online_serving/opentelemetry/Otel.md diff --git a/examples/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py similarity index 100% rename from 
examples/opentelemetry/dummy_client.py rename to examples/online_serving/opentelemetry/dummy_client.py diff --git a/examples/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md similarity index 100% rename from examples/prometheus_grafana/README.md rename to examples/online_serving/prometheus_grafana/README.md diff --git a/examples/prometheus_grafana/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/prometheus_grafana/docker-compose.yaml rename to examples/online_serving/prometheus_grafana/docker-compose.yaml diff --git a/examples/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json similarity index 100% rename from examples/prometheus_grafana/grafana.json rename to examples/online_serving/prometheus_grafana/grafana.json diff --git a/examples/prometheus_grafana/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/prometheus_grafana/prometheus.yaml rename to examples/online_serving/prometheus_grafana/prometheus.yaml diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh similarity index 100% rename from examples/run_cluster.sh rename to examples/online_serving/run_cluster.sh diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh similarity index 100% rename from examples/sagemaker-entrypoint.sh rename to examples/online_serving/sagemaker-entrypoint.sh diff --git a/examples/fp8/README.md b/examples/other/fp8/README.md similarity index 88% rename from examples/fp8/README.md rename to examples/other/fp8/README.md index 5492872cae93a..4e8031d954113 100644 --- a/examples/fp8/README.md +++ b/examples/other/fp8/README.md @@ -20,12 +20,12 @@ Before incorporating the FP8 datatype for inference workloads, you must adhere t ### 2. Convert HF model into a quantized HF model. Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). -`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). +`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). -The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`. +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`. ### 3. Extract KV Cache Scaling Factors from quantized HF model. -`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: +`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: 1. 
**File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. @@ -35,7 +35,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found a ```python # prerequisites: # - Quantized HF LLaMa 2 model -python3 examples/fp8/extract_scales.py --help +python3 examples/other/fp8/extract_scales.py --help Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] KV Scale Extraction Example @@ -52,7 +52,7 @@ Optional arguments: ``` ```python Example: -python3 examples/fp8/extract_scales.py --quantized_model --tp_size --output_dir +python3 examples/other/fp8/extract_scales.py --quantized_model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. diff --git a/examples/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py similarity index 100% rename from examples/fp8/extract_scales.py rename to examples/other/fp8/extract_scales.py diff --git a/examples/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md similarity index 100% rename from examples/fp8/quantizer/README.md rename to examples/other/fp8/quantizer/README.md diff --git a/examples/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py similarity index 100% rename from examples/fp8/quantizer/quantize.py rename to examples/other/fp8/quantizer/quantize.py diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md similarity index 100% rename from examples/logging_configuration.md rename to examples/other/logging_configuration.md diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py similarity index 96% rename from examples/tensorize_vllm_model.py rename to examples/other/tensorize_vllm_model.py index dd77a4ad0c6b7..5fff1fdf502c9 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -25,7 +25,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.other.tensorize_vllm_model \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -45,7 +45,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.other.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -63,11 +63,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.tensorize_vllm_model serialize --help`.
+`python -m examples.other.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.tensorize_vllm_model deserialize --help`. +`python -m examples.other.tensorize_vllm_model deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -88,7 +88,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.tensorize_vllm_model deserialize --help` +`python -m examples.other.tensorize_vllm_model deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/pyproject.toml b/pyproject.toml index 45fa4bff4e680..0ac3f39ef7a5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta" line-length = 80 exclude = [ # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py" ] [tool.ruff.lint.per-file-ignores] diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 0d27cf9f152e0..57518bd3e8299 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -5,7 +5,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference.py") + "examples", "offline_inference/offline_inference.py") runpy.run_path(example_file) # check if the plugin is loaded correctly diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 0b0792b6b845f..bf409d2d97aa1 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -163,8 +163,8 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): multilora_inference = import_from_path( - "examples.multilora_inference", - EXAMPLES_PATH / "multilora_inference.py", + "examples.offline_inference.multilora_inference", + EXAMPLES_PATH / "offline_inference/multilora_inference.py", ) model_ref = "meta-llama/Llama-2-7b-hf" diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 394ca8663e189..49366abc7fb56 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0): type=str, required=True, help="json trace file output by " - "examples/offline_profile.py") + "examples/offline_inference/offline_profile.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index da7a28da15c19..fa88ed4204d8f 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -534,11 +534,11 @@ def make_plot_title_suffix(profile_json: dict) -> str: if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--json-trace", - type=str, - required=True, - help="json trace file output by examples/offline_profile.py") + parser.add_argument("--json-trace", + type=str, + required=True, + help="json trace file output by \ + examples/offline_inference/offline_profile.py")
parser.add_argument("--output-directory", type=str, required=False, diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index dab2d10c4c9d0..e20c992a381a3 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution ## Disaggregated prefilling -The example usage is in [this file](../../../examples/disaggregated_prefill.sh). +The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh). Here is the diagram of how we run disaggretgated prefilling. diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a9c1fa7221217..0033fbff0e9ac 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -452,9 +452,9 @@ def _load_model_serialized_cpu( """Load a serialized model with tensorizer to the CPU. This is only necessary when the model isn't vLLM-tensorized (see - examples/tensorize_vllm_model.py) This should still be faster than - default HuggingFace loading, but will be slower than loading a - vLLM-tensorized model. + examples/other/tensorize_vllm_model.py) This should still + be faster than default HuggingFace loading, but will be slower than + loading a vLLM-tensorized model. """ device_config = vllm_config.device_config model_config = vllm_config.model_config @@ -472,7 +472,7 @@ def _load_model_serialized( """Load a serialized model with tensorizer. Expects a vLLM-tensorized model. See the - examples/tensorize_vllm_model.py example script + examples/other/tensorize_vllm_model.py example script for serializing vLLM models.""" device_config = vllm_config.device_config @@ -529,7 +529,8 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/save_sharded_state.py` for creating a sharded checkpoint. + `examples/offline_inference/save_sharded_state.py` for creating a sharded + checkpoint. """ DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 8b929f299c8d8..fbd4937112e11 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -155,7 +155,7 @@ class TensorizerArgs: encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/tensorize_vllm_model.py. + examples/other/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -363,12 +363,12 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" ) -> Generator[Tuple[str, torch.Tensor], None, None]: - logger.warning( - "Deserializing HuggingFace models is not optimized for " - "loading on vLLM, as tensorizer is forced to load to CPU. " - "Consider deserializing a vLLM model instead for faster " - "load times. 
See the examples/tensorize_vllm_model.py example " - "script for serializing vLLM models.") + logger.warning("Deserializing HuggingFace models is not optimized for " + "loading on vLLM, as tensorizer is forced to load to CPU. " + "Consider deserializing a vLLM model instead for faster " + "load times. See the " + "examples/other/tensorize_vllm_model.py example script " + "for serializing vLLM models.") deserializer_args = tensorizer_args.deserializer_params stream_params = tensorizer_args.stream_params diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8aa0c98df70d2..a2c991cfdb74e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -503,7 +503,8 @@ def kv_cache_scales_loader( KV cache scaling factors. The serialization should represent a dictionary whose keys are the TP ranks and values are another dictionary mapping layers to their KV cache scaling factors. - Keep this function in sync with the output of examples/fp8/extract_scales.py + Keep this function in sync with the output of + examples/other/fp8/extract_scales.py """ try: with open(filename) as f:
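The kv_cache_scales_loader hunk directly above describes the expected serialization: a dictionary keyed by TP rank whose values map layers to their KV cache scaling factors. A minimal sketch of a file with that shape follows; the numeric values are invented, and the file actually written by examples/other/fp8/extract_scales.py may wrap this mapping in extra metadata, so treat the exact keys as assumptions.

```python
import json

# Shape described in the kv_cache_scales_loader docstring:
# TP rank -> {layer index -> KV cache scaling factor}.
# All values below are placeholders for illustration only.
kv_cache_scales = {
    0: {0: 0.0213, 1: 0.0197},
    1: {0: 0.0221, 1: 0.0205},
}

with open("kv_cache_scales.json", "w") as f:
    json.dump(kv_cache_scales, f, indent=2)
```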
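Further up, the tensorize_vllm_model.py docstring notes that a serialized model can be loaded directly through the `LLM` class. A minimal sketch of that load path, assuming a checkpoint was already written by the serialize step (the S3 URI and model name here are placeholders, not paths from this patch):

```python
from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

# Deserialize a vLLM-tensorized checkpoint; the URI below stands in for
# wherever the serialize step wrote model.tensors.
llm = LLM(
    model="facebook/opt-125m",
    load_format="tensorizer",
    model_loader_extra_config=TensorizerConfig(
        tensorizer_uri="s3://my-bucket/vllm/facebook/opt-125m/v1/model.tensors"),
)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)
```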
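Similarly, the ShardedStateLoader docstring in loader.py now points at examples/offline_inference/save_sharded_state.py for writing per-rank shards. A hedged sketch of the matching load call, assuming a checkpoint directory already produced by that script (the path and TP size are illustrative):

```python
from vllm import LLM

# Load shards previously written by
# examples/offline_inference/save_sharded_state.py; each worker reads only
# its own model-rank-*.safetensors part rather than the full checkpoint.
llm = LLM(
    model="/path/to/sharded/checkpoint",
    load_format="sharded_state",
    tensor_parallel_size=2,
)
```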