From 64ed737767ba7fa5711c3accf8d3f5556d2b688a Mon Sep 17 00:00:00 2001 From: "caishangming.csm" Date: Tue, 16 Jul 2024 16:36:37 +0800 Subject: [PATCH 1/2] Fix spec_decode typos. --- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/ngram_worker.py | 4 ++-- vllm/spec_decode/proposer_worker_base.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/spec_decode/top1_proposer.py | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 09a77f9e870fb..11e99882e3f0b 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -43,7 +43,7 @@ def init_device(self) -> None: ) def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for multi_step_worker + # Need include_gpu_probs_tensor for MultiStepWorker self.model_runner.model.sampler.include_gpu_probs_tensor = True @torch.inference_mode() diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 07991df52e655..a21222fec269b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -13,7 +13,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase): """NGramWorker provides a light drafter without need for model. - Current NGramWorker only implement prompt lookup decoding, + Current NGramWorker only implements prompt lookup decoding, and in future we may also do RAG type drafter and other scenarios which don't rely on LLM model to give proposals. 
""" @@ -37,7 +37,7 @@ def init_device(self): self.device = torch.device(f"cuda:{self.local_rank}") self.load_model = lambda *args, **kwargs: None - # Current only support Top1Proposer + # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] device=self.device, diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fffa557121e17..51cefc0cbca8b 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -24,7 +24,7 @@ def sampler_output( ) -> Tuple[Optional[List[SamplerOutput]], bool]: raise NotImplementedError - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: """Implementation optional""" pass diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3c8e3dee46831..b2eade7136ed5 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -144,7 +144,7 @@ def create_worker( spec_decode_sampler: SpecDecodeBaseSampler = None if draft_token_acceptance_method == "rejection_sampler": spec_decode_sampler = RejectionSampler( - disable_bonus_tokens=False, ) + disable_bonus_tokens=False) elif draft_token_acceptance_method == "typical_acceptance_sampler": spec_decode_sampler = TypicalAcceptanceSampler( disable_bonus_tokens=False, @@ -206,7 +206,7 @@ def __init__( self.probs_dtype = self.spec_decode_sampler.probs_dtype self.token_id_dtype = self.spec_decode_sampler.token_id_dtype - # Lazy initiazliation. + # Lazy initialization. 
self.scorer: SpeculativeScorer # Hidden states from target model to pass to proposer diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 7b34b5d34208b..ade293c2c0757 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -138,7 +138,7 @@ def _split_by_proposal_len( # Currently only proposal lens of 0 or the global batch proposal len # are supported. - # If max_proposal_len is defined, then we shall no exccess this + # If max_proposal_len is defined, then we shall not exceed this # quota for nonzero_proposal new_k = 0 if (self.max_proposal_len is None @@ -219,7 +219,7 @@ def _merge_outputs( proposal_lens: List[int], nonzero_proposal_len_indices: List[int], sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """After speculations are produced, merge the speculation results with the skipped sequences. """ From 90787e61cb81fdc1a9da63ac98e6f54c03b6a774 Mon Sep 17 00:00:00 2001 From: "caishangming.csm" Date: Tue, 16 Jul 2024 17:00:10 +0800 Subject: [PATCH 2/2] fix --- vllm/spec_decode/spec_decode_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b2eade7136ed5..903264aad7a15 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -144,7 +144,7 @@ def create_worker( spec_decode_sampler: SpecDecodeBaseSampler = None if draft_token_acceptance_method == "rejection_sampler": spec_decode_sampler = RejectionSampler( - disable_bonus_tokens=False) + disable_bonus_tokens=False, ) elif draft_token_acceptance_method == "typical_acceptance_sampler": spec_decode_sampler = TypicalAcceptanceSampler( disable_bonus_tokens=False,