diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 09a77f9e870fb..11e99882e3f0b 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -43,7 +43,7 @@ def init_device(self) -> None: ) def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for multi_step_worker + # Need include_gpu_probs_tensor for MultiStepWorker self.model_runner.model.sampler.include_gpu_probs_tensor = True @torch.inference_mode() diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 07991df52e655..a21222fec269b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -13,7 +13,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase): """NGramWorker provides a light drafter without need for model. - Current NGramWorker only implement prompt lookup decoding, + Current NGramWorker only implements prompt lookup decoding, and in future we may also do RAG type drafter and other scenarios which don't rely on LLM model to give proposals. 
""" @@ -37,7 +37,7 @@ def init_device(self): self.device = torch.device(f"cuda:{self.local_rank}") self.load_model = lambda *args, **kwargs: None - # Current only support Top1Proposer + # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] device=self.device, diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fffa557121e17..51cefc0cbca8b 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -24,7 +24,7 @@ def sampler_output( ) -> Tuple[Optional[List[SamplerOutput]], bool]: raise NotImplementedError - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: """Implementation optional""" pass diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3c8e3dee46831..903264aad7a15 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -206,7 +206,7 @@ def __init__( self.probs_dtype = self.spec_decode_sampler.probs_dtype self.token_id_dtype = self.spec_decode_sampler.token_id_dtype - # Lazy initiazliation. + # Lazy initialization. self.scorer: SpeculativeScorer # Hidden states from target model to pass to proposer diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 7b34b5d34208b..ade293c2c0757 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -138,7 +138,7 @@ def _split_by_proposal_len( # Currently only proposal lens of 0 or the global batch proposal len # are supported. 
- # If max_proposal_len is defined, then we shall no exccess this + # If max_proposal_len is defined, then we shall not exceed this # quota for nonzero_proposal new_k = 0 if (self.max_proposal_len is None @@ -219,7 +219,7 @@ def _merge_outputs( proposal_lens: List[int], nonzero_proposal_len_indices: List[int], sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """After speculations are produced, merge the speculation results with the skipped sequences. """