diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index a8e6f3b6c6d41..09c5087c2dc36 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -475,15 +475,23 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
+
+        image_token_id = vocab["image"]
         num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
                 target="</s>",
                 replacement=PromptReplacementDetails(
-                    full="<image>" * num_image_tokens + "</s>",
-                    features="<image>" * num_image_tokens,
+                    full=image_tokens + [bos_token_id],
+                    features=image_tokens,
                 ),
             )
         ]
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 2df50b3c4efe6..e834c9004f140 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds sep token for chat mode
         tokenizer = self.info.get_tokenizer()
-        sep_token_id: int = \
-            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        sep_token_id = vocab[tokenizer.sep_token]  # type: ignore
 
         return prompt_tokens + [sep_token_id]
 
@@ -141,18 +142,22 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_tokens = processor.image_token * self.info.get_num_image_tokens()
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        image_start_id = vocab[processor.image_start_token]
+        image_token_id = vocab[processor.image_token]
+        image_end_id = vocab[processor.image_end_token]
+
+        num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
-                target="<image>",
+                target=[image_token_id],
                 replacement=PromptReplacementDetails(
-                    full="".join([
-                        processor.image_start_token,
-                        image_tokens,
-                        processor.image_end_token,
-                    ]),
+                    full=([image_start_id] + image_tokens + [image_end_id]),
                     features=image_tokens,
                 ),
             )
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 4d3d1c329a2c0..344832d8b33e6 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -249,8 +249,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self.info.get_hf_processor()
-        image_token_id: int = hf_processor.image_token_id
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token_id = hf_processor.image_token_id
+        assert isinstance(image_token_id, int)
 
         def get_replacement_deepseek_vl2(item_idx: int):
             images = mm_items.get_items(
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 4865eb170364c..dbf9da50cc9de 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds boa_token_id
        tokenizer = self.info.get_tokenizer()
-        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        boa_token_id = vocab["<0x04>"]
 
         return prompt_tokens + [boa_token_id]
 
@@ -202,6 +204,7 @@ def _get_prompt_replacements(
     ) -> list[PromptReplacement]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
+        assert isinstance(bos_token_id, int)
 
         tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index a355ae494afd0..296af2aac5660 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -315,13 +315,14 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
-        processor = self.info.get_hf_processor()
-        image_token = processor.image_token
-        image_break_token = processor.image_break_token
-        image_end_token = processor.image_end_token
+        image_break_id = vocab[processor.image_break_token]
+        image_token_id = hf_config.image_token_index
+        image_end_id = vocab[processor.image_end_token]
 
         vision_config = hf_config.vision_config
         assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ def get_replacement(item_idx: int):
                 image_height=image_size.height,
             )
 
-            tokens = ([image_token] * ncols + [image_break_token]) * nrows
-            tokens[-1] = image_end_token
+            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
+            tokens[-1] = image_end_id
 
-            return "".join(tokens)
+            return tokens
 
         return [
             PromptReplacement(
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index c21cb815b5283..fc5aed5c94abb 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -188,7 +188,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self.info.get_hf_processor()
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # Use getattr with default to be compatible with transformers<4.48
         audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
@@ -197,6 +199,10 @@ def _get_prompt_replacements(
         audio_bos_token = getattr(processor, "audio_bos_token", "<|audio_bos|>")
         audio_eos_token = getattr(processor, "audio_eos_token", "<|audio_eos|>")
 
+        audio_token_id = vocab[audio_token]
+        audio_bos_id = vocab[audio_bos_token]
+        audio_eos_id = vocab[audio_eos_token]
+
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
@@ -208,22 +214,18 @@ def _get_prompt_replacements(
             audio_output_lengths = audio_output_lens.tolist()
 
         def get_replacement_qwen2_audio(item_idx: int):
-            num_placeholders = audio_output_lengths[item_idx]
-            if num_placeholders == 0:
+            num_features = audio_output_lengths[item_idx]
+            if num_features == 0:
                 audios = mm_items.get_items("audio", AudioProcessorItems)
                 audio = audios.get(item_idx)
                 raise ValueError(
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")
 
-            audio_tokens = audio_token * num_placeholders
+            audio_tokens = [audio_token_id] * num_features
 
             return PromptReplacementDetails(
-                full="".join([
-                    audio_bos_token,
-                    audio_tokens,
-                    audio_eos_token,
-                ]),
+                full=[audio_bos_id] + audio_tokens + [audio_eos_id],
                 features=audio_tokens,
             )
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 34d5c8ad089a3..f1ff8badaf35a 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -953,12 +953,14 @@ def _get_prompt_replacements(
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
         placeholder = {
-            "image": hf_processor.image_token,
-            "video": hf_processor.video_token,
+            "image": vocab[hf_processor.image_token],
+            "video": vocab[hf_processor.video_token],
         }
 
         merge_length = image_processor.merge_size**2
@@ -967,13 +969,13 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
             grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, torch.Tensor)
 
-            num_tokens = grid_thw.prod().item() // merge_length
-            return placeholder[modality] * num_tokens
+            num_tokens = int(grid_thw.prod()) // merge_length
+            return [placeholder[modality]] * num_tokens
 
         return [
             PromptReplacement(
                 modality=modality,
-                target=placeholder[modality],
+                target=[placeholder[modality]],
                 replacement=partial(get_replacement_qwen2vl,
                                     modality=modality),
             ) for modality in ("image", "video")
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 9301422383696..145df09a427e4 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -205,16 +205,20 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        placeholder = hf_processor.audio_token_replacement  # type: ignore
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        replacement_id = vocab[
+            hf_processor.audio_token_replacement]  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
             audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
-            return placeholder * audio_token_len
+            return [replacement_id] * int(audio_token_len)  # type: ignore
 
         return [
             PromptReplacement(
                 modality="audio",
-                target="<|audio|>",
+                target='<|audio|>',
                 replacement=get_replacement_ultravox,
             )
         ]
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 294262484f2fb..1f1d67fabb243 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
     tokenizer_all_special_tokens_extended = (
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+    tokenizer_vocab = tokenizer.get_vocab()
     tokenizer_len = len(tokenizer)
-    max_token_id = max(tokenizer.get_vocab().values())
+    max_token_id = max(tokenizer_vocab.values())
     # Some tokenizers (e.g., QwenTokenizer) have special tokens that
     # are added and included in the implementation of the vocab_size
     # property, but not in get_vocab(); if there is an implementation
@@ -96,6 +97,9 @@ def all_special_tokens_extended(self):
         def max_token_id(self):
            return max_token_id
 
+        def get_vocab(self):
+            return tokenizer_vocab
+
         def __len__(self):
             return tokenizer_len
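
These diffs all apply one pattern: prompt placeholders that used to be built as strings (and then re-tokenized downstream) are now built directly as lists of token IDs looked up once from `tokenizer.get_vocab()`. A minimal standalone sketch of the pattern follows; the vocab dict, token strings, and IDs are invented for illustration and do not come from any real tokenizer:

```python
# Sketch only: a toy dict standing in for tokenizer.get_vocab().
# The token strings and IDs below are made up.
vocab = {"<image>": 32000, "</s>": 2}

image_token_id = vocab["<image>"]
bos_token_id = vocab["</s>"]
num_image_tokens = 4  # normally derived from the model's vision config

# Old style: a placeholder *string* that must be re-tokenized later.
full_str = "<image>" * num_image_tokens + "</s>"

# New style: the token-ID list is built directly, skipping tokenization.
image_tokens = [image_token_id] * num_image_tokens
full_ids = image_tokens + [bos_token_id]

assert full_ids == [32000, 32000, 32000, 32000, 2]
```

In `PromptReplacementDetails`, `full` is the sequence spliced into the prompt, while `features` marks the subset of those tokens whose positions are later overwritten by multimodal feature embeddings; emitting both as token IDs lets that bookkeeping happen without any string matching.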
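The tokenizer.py hunks make the per-request `get_vocab()` lookups cheap by computing the vocab once in `get_cached_tokenizer` and serving the same dict afterwards, via the dynamic-subclassing approach that function already uses for its other cached properties. A rough self-contained sketch of that caching trick; the `SlowTokenizer` stand-in is hypothetical:

```python
class SlowTokenizer:
    """Hypothetical stand-in for a HF tokenizer with a costly get_vocab()."""

    def get_vocab(self):
        # Imagine this rebuilds a large dict on every call.
        return {"<image>": 32000, "</s>": 2}


def get_cached_tokenizer(tokenizer):
    # Compute the vocab once, then serve the cached dict by swapping in
    # a dynamically created subclass that overrides get_vocab().
    tokenizer_vocab = tokenizer.get_vocab()

    class CachedTokenizer(tokenizer.__class__):

        def get_vocab(self):
            return tokenizer_vocab

    tokenizer.__class__ = CachedTokenizer
    return tokenizer


tok = get_cached_tokenizer(SlowTokenizer())
assert tok.get_vocab() is tok.get_vocab()  # same cached dict, no rebuild
```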