From c5e350bd641770f343cc59973d48eb1a69c4eb94 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 14:11:30 -0700
Subject: [PATCH 01/12] Cache importlib in ModelRegistry

---
 vllm/model_executor/models/__init__.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 87508a1168e0c..dec26b90e534d 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,3 +1,4 @@
+import functools
 import importlib
 from typing import Dict, List, Optional, Type
 
@@ -97,6 +98,13 @@ class ModelRegistry:
+    @staticmethod
+    @functools.lru_cache(maxsize=128)
+    def _get_model(model_arch: str):
+        module_name, model_cls_name = _MODELS[model_arch]
+        module = importlib.import_module(
+            f"vllm.model_executor.models.{module_name}")
+        return getattr(module, model_cls_name, None)
 
     @staticmethod
     def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
@@ -114,10 +122,7 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
                 "Model architecture %s is partially supported by ROCm: %s",
                 model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
 
-        module_name, model_cls_name = _MODELS[model_arch]
-        module = importlib.import_module(
-            f"vllm.model_executor.models.{module_name}")
-        return getattr(module, model_cls_name, None)
+        return ModelRegistry._get_model(model_arch)
 
     @staticmethod
     def get_supported_archs() -> List[str]:

From f269738751e9cd0bb200830d0dd227e87c37fcf7 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 14:12:14 -0700
Subject: [PATCH 02/12] Fast return for get_common_computed_block_ids

---
 vllm/core/block/prefix_caching_block.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index f272e23ee6088..d102ad4045591 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -552,9 +552,12 @@ def get_common_computed_block_ids(
         # runner.
         # It returns a list of int although type annotation says list of string.
+        if len(computed_seq_block_ids) == 1:
+            return computed_seq_block_ids[0]
+
         return commonprefix([
             ids for ids in computed_seq_block_ids  # type: ignore
-            if ids != []
+            if ids
         ])
 
     def get_num_blocks_touched(self,
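The two patches above are constant-factor wins on hot lookup paths: PATCH 01 memoizes the importlib round-trip behind ModelRegistry, and PATCH 02 returns a lone sequence's block ids directly instead of running commonprefix over a one-element list. For reference, the memoization pattern from PATCH 01 can be reproduced standalone. This is a minimal sketch, not vLLM's code: the _MODELS table below is a toy stand-in filled with stdlib classes so the snippet actually runs.

```python
import functools
import importlib
from typing import Optional, Type

# Toy registry mapping an architecture name to (module name, class name).
# Stand-in for vLLM's _MODELS table; stdlib entries keep the sketch runnable.
_MODELS = {
    "JSONDecoder": ("json", "JSONDecoder"),
    "TarFile": ("tarfile", "TarFile"),
}


@functools.lru_cache(maxsize=128)
def get_model(model_arch: str) -> Optional[Type]:
    # importlib.import_module is idempotent, but it still pays for the
    # sys.modules lookup and string handling on every call; lru_cache turns
    # repeated lookups of the same architecture into a single dict hit.
    module_name, model_cls_name = _MODELS[model_arch]
    module = importlib.import_module(module_name)
    return getattr(module, model_cls_name, None)


assert get_model("JSONDecoder") is get_model("JSONDecoder")
print(get_model.cache_info())  # hits=1, misses=1 after the two calls above
```

The same idea applies unchanged when the cached function is a staticmethod, as in the patch: the lru_cache sits under the staticmethod decorator, so lookups through the class hit the cache.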
""" + # Math below is equivalent to: + # all_token_ids = token_ids + [-1] * num_lookahead_slots + # token_blocks = self._chunk_token_blocks_for_append(all_token_ids) + # return len(token_blocks) - all_token_ids = token_ids + [-1] * num_lookahead_slots - token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - return len(token_blocks) + num_token_ids = len(token_ids) + num_lookahead_slots + first_chunk_size = self._block_size - (self._num_full_slots % + self._block_size) + num_token_blocks = 1 + math.ceil(num_token_ids-first_chunk_size / self._block_size) + return num_token_blocks def _chunk_token_blocks_for_append( self, token_ids: List[int]) -> List[List[int]]: @@ -351,6 +358,6 @@ def _chunk_token_blocks_for_append( """ first_chunk_size = self._block_size - (self._num_full_slots % self._block_size) - token_blocks = [token_ids[:first_chunk_size]] + chunk_list( - token_ids[first_chunk_size:], self._block_size) + token_blocks = [token_ids[:first_chunk_size]] + list(chunk_list( + token_ids[first_chunk_size:], self._block_size)) return token_blocks diff --git a/vllm/utils.py b/vllm/utils.py index 8be1528230b5f..aad5cb94b9d62 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -415,9 +415,10 @@ def init_kmp_env(): os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" -def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]: +def chunk_list(lst: List[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" - return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] + for i in range(0, len(lst), chunk_size): + yield lst[i:i + chunk_size] def cdiv(a: int, b: int) -> int: From 47ce44fcf364af1dfa222af82c977f8844dfdc6f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 17 Jul 2024 14:13:40 -0700 Subject: [PATCH 04/12] Cache _first_seq in SequenceGroup --- vllm/sequence.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 1cebf68d463db..6c12a01bd0b2b 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -457,24 +457,25 @@ def __init__( self.prompt_adapter_request = prompt_adapter_request self.encoder_seq = encoder_seq self.trace_headers = trace_headers + self._first_seq = next(iter(self.seqs_dict.values())) @property def prompt(self) -> Optional[str]: # All sequences in the group should have the same prompt. # We use the prompt of an arbitrary sequence. - return next(iter(self.seqs_dict.values())).prompt + return self._first_seq.prompt @property def prompt_token_ids(self) -> List[int]: # All sequences in the group should have the same prompt. # We use the prompt of an arbitrary sequence. - return next(iter(self.seqs_dict.values())).prompt_token_ids + return self._first_seq.prompt_token_ids @property def multi_modal_data(self) -> "MultiModalDataDict": # All sequences in the group should have the same multi-modal data. # We use the multi-modal data of an arbitrary sequence. 
From 69d73a30564aa6913ccd266e0fc467f68386c827 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 14:16:23 -0700
Subject: [PATCH 05/12] Lint

---
 vllm/core/block/block_table.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index c8cd57cab9801..982d628f24017 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -346,7 +346,9 @@ def get_num_blocks_touched_by_append_slots(
         num_token_ids = len(token_ids) + num_lookahead_slots
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        num_token_blocks = 1 + math.ceil(num_token_ids-first_chunk_size / self._block_size)
+        num_token_blocks = (
+            1 + math.ceil(num_token_ids-first_chunk_size / self._block_size)
+        )
         return num_token_blocks

From bbef0e1025c6cbdf817ea989c2cf418047c02b2d Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 14:16:46 -0700
Subject: [PATCH 06/12] Lint

---
 vllm/core/block/block_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 982d628f24017..5a32c8d0a4df1 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -347,7 +347,7 @@ def get_num_blocks_touched_by_append_slots(
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         num_token_blocks = (
-            1 + math.ceil(num_token_ids-first_chunk_size / self._block_size)
+            1 + math.ceil(num_token_ids - first_chunk_size / self._block_size)
         )
         return num_token_blocks

From 34c30dfba5e7add3c4b77dfb7f952e341f8c1535 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 14:25:46 -0700
Subject: [PATCH 07/12] Lint

---
 vllm/core/block/block_table.py         | 7 +++----
 vllm/model_executor/models/__init__.py | 1 +
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 5a32c8d0a4df1..21f98a63ec52c 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -347,8 +347,7 @@ def get_num_blocks_touched_by_append_slots(
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         num_token_blocks = (
-            1 + math.ceil(num_token_ids - first_chunk_size / self._block_size)
-        )
+            1 + math.ceil(num_token_ids - first_chunk_size / self._block_size))
         return num_token_blocks
 
     def _chunk_token_blocks_for_append(
@@ -360,6 +359,6 @@ def _chunk_token_blocks_for_append(
         """
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        token_blocks = [token_ids[:first_chunk_size]] + list(chunk_list(
-            token_ids[first_chunk_size:], self._block_size))
+        token_blocks = [token_ids[:first_chunk_size]] + list(
+            chunk_list(token_ids[first_chunk_size:], self._block_size))
         return token_blocks

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index dec26b90e534d..aa5a70757b31c 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -98,6 +98,7 @@ class ModelRegistry:
+
     @staticmethod
     @functools.lru_cache(maxsize=128)
     def _get_model(model_arch: str):

From 6b45138139db2ff202c6b88c8d459b7d07f76a15 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 15:06:49 -0700
Subject: [PATCH 08/12] Fix test

---
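Note for this patch, kept below the "---" fold so it stays out of the commit message: the test broke because PATCH 03 changed chunk_list from returning a list to returning a generator, and len() is undefined on generators. A minimal standalone illustration of the failure mode and of the fix the diff below applies:

```python
def chunk_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]


chunks = chunk_list(list(range(10)), 4)
try:
    len(chunks)  # TypeError: object of type 'generator' has no len()
except TypeError:
    pass
assert len(list(chunks)) == 3  # materialize first, as the test now does
```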
 tests/core/block/test_block_manager_v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py
index d0ca09c4be0d4..03244be27e0a0 100644
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -249,10 +249,10 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
 
     # Expect consumed blocks to be new blocks required to support the new slots.
     expected_consumed_blocks = len(
-        chunk_list(
+        list(chunk_list(
             list(
                 range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+            block_size)) - len(chunk_list(list(range(prompt_len)), block_size)))
 
     assert num_consumed_blocks == expected_consumed_blocks

From f27f653cbb3a571aa6256cf0b3f417783abe94da Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 15:09:10 -0700
Subject: [PATCH 09/12] Fix

---
 tests/core/block/test_block_manager_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py
index 03244be27e0a0..4afbdb96213a3 100644
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -252,7 +252,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
         list(chunk_list(
             list(
                 range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size)))
+            block_size))) - len(list(chunk_list(list(range(prompt_len)), block_size)))
 
     assert num_consumed_blocks == expected_consumed_blocks
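PATCHES 08 and 09 wrap both chunk_list calls in list() so len() applies again; PATCH 10 below merely reflows the same expression for the formatter. What the test expression computes is plain ceiling-division arithmetic. For reference, a compact equivalent as a sketch: cdiv is reimplemented here rather than imported, though vllm.utils does define one (visible as context in PATCH 03).

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division, the same contract as vllm.utils.cdiv."""
    return -(a // -b)


def expected_consumed_blocks(prompt_len: int, num_slots_to_append: int,
                             num_lookahead_slots: int, block_size: int) -> int:
    # Blocks needed once the new slots are appended, minus the blocks the
    # prompt already occupied: exactly what the test's two chunk_list/len
    # calls compute by materializing the chunks.
    total = prompt_len + num_slots_to_append + num_lookahead_slots
    return cdiv(total, block_size) - cdiv(prompt_len, block_size)


assert expected_consumed_blocks(10, 5, 2, 8) == 1  # 17 slots: 3 blocks vs. 2
```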
From 31e4c763a39c46357d86fe0c6051be0019e84f40 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 15:10:56 -0700
Subject: [PATCH 10/12] Lint

---
 tests/core/block/test_block_manager_v2.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py
index 4afbdb96213a3..d7863a9ae1ada 100644
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
 
     # Expect consumed blocks to be new blocks required to support the new slots.
     expected_consumed_blocks = len(
-        list(chunk_list(
-            list(
-                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size))) - len(list(chunk_list(list(range(prompt_len)), block_size)))
+        list(
+            chunk_list(
+                list(
+                    range(prompt_len + num_slots_to_append +
+                          num_lookahead_slots)),
+                block_size))) - len(
+                    list(chunk_list(list(range(prompt_len)), block_size)))
 
     assert num_consumed_blocks == expected_consumed_blocks

From dd897dbbf95a4bb72bf51693c740d638126ddc2d Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Wed, 17 Jul 2024 15:11:49 -0700
Subject: [PATCH 11/12] Fix

---
 tests/core/block/test_cpu_gpu_block_allocator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index 15b76d9093c63..a9e38d40444a9 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
     unique_token_ids = list(
        range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
-                               block_size)
-    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
-                               block_size)
+    gpu_token_ids = list(
+        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
+    cpu_token_ids = list(
+        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
 
     assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

From ebf4794ee14775046de0d911a6e25c387e056f91 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Thu, 18 Jul 2024 18:16:19 +0000
Subject: [PATCH 12/12] Fix

---
 vllm/core/block/block_table.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 21f98a63ec52c..06b816eb367f5 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -346,8 +346,8 @@ def get_num_blocks_touched_by_append_slots(
         num_token_ids = len(token_ids) + num_lookahead_slots
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        num_token_blocks = (
-            1 + math.ceil(num_token_ids - first_chunk_size / self._block_size))
+        num_token_blocks = (1 + math.ceil(
+            (num_token_ids - first_chunk_size) / self._block_size))
         return num_token_blocks
 
     def _chunk_token_blocks_for_append(
@@ -359,6 +359,7 @@ def _chunk_token_blocks_for_append(
         """
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        token_blocks = [token_ids[:first_chunk_size]] + list(
+        token_blocks = [token_ids[:first_chunk_size]]
+        token_blocks.extend(
             chunk_list(token_ids[first_chunk_size:], self._block_size))
         return token_blocks
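PATCH 12 is the one substantive fix in this tail of "Lint"/"Fix" commits: every reflow from PATCH 03 through PATCH 07 preserved the same precedence bug, dividing only first_chunk_size by block_size before the subtraction. An overestimate of blocks touched here would make the scheduler preempt sequences far too eagerly. The size of the error is easy to see standalone; the values below are arbitrary, picked only to make the gap visible.

```python
import math

block_size, num_token_ids, first_chunk_size = 16, 40, 4

buggy = 1 + math.ceil(num_token_ids - first_chunk_size / block_size)
fixed = 1 + math.ceil((num_token_ids - first_chunk_size) / block_size)

# buggy: 1 + ceil(40 - 0.25) = 41 blocks "touched"
# fixed: 1 + ceil(36 / 16)   =  4 blocks touched
assert (buggy, fixed) == (41, 4)
```

The second hunk's switch from list concatenation to an explicit extend is cosmetic by comparison: it builds the same token_blocks list, just in a form the formatter stops reflowing.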