Avoid unnecessary tokenization
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 committed Jan 22, 2025
1 parent cbdc4ad commit 611f5aa
Showing 8 changed files with 66 additions and 39 deletions.
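
All eight files apply the same change: instead of building replacement strings that the multimodal processor has to tokenize again on every request, each model now resolves its special-token IDs once through tokenizer.get_vocab() and emits replacement token lists directly. A minimal self-contained sketch of the before/after shape (the vocab dict and IDs below are illustrative stand-ins, not real model values):

    # Illustrative stand-in for tokenizer.get_vocab(), which returns dict[str, int].
    vocab: dict[str, int] = {"<image>": 32000, "</s>": 2}
    num_image_tokens = 4

    # Before: a replacement *string*; downstream code must call the tokenizer
    # again to turn it back into IDs.
    full_str = "<image>" * num_image_tokens + "</s>"

    # After: one dict lookup per special token, then plain list arithmetic.
    image_token_id = vocab["<image>"]
    bos_token_id = vocab["</s>"]
    full_ids = [image_token_id] * num_image_tokens + [bos_token_id]

    assert full_ids == [32000, 32000, 32000, 32000, 2]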
12 changes: 10 additions & 2 deletions vllm/model_executor/models/blip2.py
@@ -475,15 +475,23 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
+
+        image_token_id = vocab["image"]
         num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens

         return [
             PromptReplacement(
                 modality="image",
                 target="</s>",
                 replacement=PromptReplacementDetails(
-                    full="<image>" * num_image_tokens + "</s>",
-                    features="<image>" * num_image_tokens,
+                    full=image_tokens + [bos_token_id],
+                    features=image_tokens,
                 ),
             )
         ]
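
A note on the blip2.py hunk above: the old string form appended "</s>" while the new form appends tokenizer.bos_token_id; the two agree because the OPT tokenizer used by BLIP-2 registers "</s>" as its BOS token. The assert isinstance(bos_token_id, int) also matters for typing, since transformers annotates bos_token_id as Optional[int]. A sketch of the narrowing idiom, using a hypothetical stand-in tokenizer:

    from typing import Optional

    class FakeTokenizer:
        """Hypothetical stand-in for a transformers tokenizer."""
        bos_token: str = "</s>"
        bos_token_id: Optional[int] = 2  # None for tokenizers without a BOS token

    tokenizer = FakeTokenizer()
    bos_token_id = tokenizer.bos_token_id
    assert isinstance(bos_token_id, int)  # fails fast if the tokenizer has no BOS
    # After the assert, static checkers treat bos_token_id as int, so it can go
    # straight into a list[int] replacement without a cast.
    full = [32000] * 4 + [bos_token_id]  # 32000: illustrative image token ID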
23 changes: 14 additions & 9 deletions vllm/model_executor/models/chameleon.py
@@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds sep token for chat mode
         tokenizer = self.info.get_tokenizer()
-        sep_token_id: int = \
-            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        sep_token_id = vocab[tokenizer.sep_token]  # type: ignore

         return prompt_tokens + [sep_token_id]

@@ -141,18 +142,22 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_tokens = processor.image_token * self.info.get_num_image_tokens()
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        image_start_id = vocab[processor.image_start_token]
+        image_token_id = vocab[processor.image_token]
+        image_end_id = vocab[processor.image_end_token]
+
+        num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens

         return [
             PromptReplacement(
                 modality="image",
-                target="<image>",
+                target=[image_token_id],
                 replacement=PromptReplacementDetails(
-                    full="".join([
-                        processor.image_start_token,
-                        image_tokens,
-                        processor.image_end_token,
-                    ]),
+                    full=([image_start_id] + image_tokens + [image_end_id]),
                     features=image_tokens,
                 ),
             )
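
Beyond tokenizing the replacement, the chameleon hunk also switches the target from the string "<image>" to the token list [image_token_id], so matching against the prompt happens on token IDs too. Conceptually the matcher is a subsequence search over the prompt's token IDs; a hypothetical sketch (vLLM's real matcher lives in its multimodal processing code):

    def find_token_match(prompt: list[int], target: list[int]) -> int:
        """Index of the first occurrence of target in prompt, or -1 if absent."""
        n, m = len(prompt), len(target)
        for i in range(n - m + 1):
            if prompt[i:i + m] == target:
                return i
        return -1

    # With an illustrative image_token_id of 8711:
    assert find_token_match([1, 42, 8711, 7], [8711]) == 2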
6 changes: 4 additions & 2 deletions vllm/model_executor/models/deepseek_vl2.py
@@ -249,8 +249,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self.info.get_hf_processor()
-        image_token_id: int = hf_processor.image_token_id
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token_id = hf_processor.image_token_id
+        assert isinstance(image_token_id, int)

         def get_replacement_deepseek_vl2(item_idx: int):
             images = mm_items.get_items(
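
The deepseek_vl2.py hunk fixes two things at once: the assert narrows image_token_id for type checkers, and get_hf_processor now receives **hf_processor_mm_kwargs, so per-request processor overrides are no longer silently dropped. A toy illustration of why forwarding the kwargs matters (the default ID is hypothetical):

    def get_hf_processor(**kwargs):
        config = {"image_token_id": 100015}  # hypothetical built-in default
        config.update(kwargs)                # per-request overrides win
        return config

    assert get_hf_processor()["image_token_id"] == 100015
    assert get_hf_processor(image_token_id=7)["image_token_id"] == 7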
5 changes: 4 additions & 1 deletion vllm/model_executor/models/fuyu.py
@@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds boa_token_id
         tokenizer = self.info.get_tokenizer()
-        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        boa_token_id = vocab["<0x04>"]

         return prompt_tokens + [boa_token_id]

@@ -202,6 +204,7 @@ def _get_prompt_replacements(
     ) -> list[PromptReplacement]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
+        assert isinstance(bos_token_id, int)

         tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id
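
The "<0x04>" entry looked up in fuyu.py is a Llama-style byte-fallback token that Fuyu's HF processor appends as its beginning-of-answer (boa) marker in chat mode; the diff only changes how its ID is fetched. A self-contained sketch of the tokens-only path, with a hypothetical vocab:

    # Llama-style vocabs expose byte-fallback tokens such as "<0x04>" as
    # ordinary entries; the ID below is hypothetical.
    vocab: dict[str, int] = {"<0x04>": 7}

    def apply_hf_processor_tokens_only(prompt_tokens: list[int]) -> list[int]:
        """Mimic the HF Fuyu processor, which appends the boa token ID."""
        boa_token_id = vocab["<0x04>"]
        return prompt_tokens + [boa_token_id]

    assert apply_hf_processor_tokens_only([1, 2, 3]) == [1, 2, 3, 7]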
17 changes: 9 additions & 8 deletions vllm/model_executor/models/llava.py
@@ -315,13 +315,14 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()

-        processor = self.info.get_hf_processor()
-        image_token = processor.image_token
-        image_break_token = processor.image_break_token
-        image_end_token = processor.image_end_token
+        image_break_id = vocab[processor.image_break_token]
+        image_token_id = hf_config.image_token_index
+        image_end_id = vocab[processor.image_end_token]

         vision_config = hf_config.vision_config
         assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ def get_replacement(item_idx: int):
                 image_height=image_size.height,
             )

-            tokens = ([image_token] * ncols + [image_break_token]) * nrows
-            tokens[-1] = image_end_token
+            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
+            tokens[-1] = image_end_id

-            return "".join(tokens)
+            return tokens

         return [
             PromptReplacement(
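
The Pixtral-style grid built in get_replacement is easiest to see with small numbers: each row is ncols image tokens followed by a break token, and the final token of the whole grid becomes the end token. A worked example with illustrative IDs:

    IMAGE, BREAK, END = 10, 11, 12  # illustrative token IDs
    ncols, nrows = 3, 2

    tokens = ([IMAGE] * ncols + [BREAK]) * nrows
    tokens[-1] = END

    assert tokens == [10, 10, 10, 11, 10, 10, 10, 12]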
20 changes: 11 additions & 9 deletions vllm/model_executor/models/qwen2_audio.py
@@ -188,7 +188,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self.info.get_hf_processor()
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()

         # Use getattr with default to be compatible with transformers<4.48
         audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
@@ -197,6 +199,10 @@ def _get_prompt_replacements(
         audio_eos_token = getattr(processor, "audio_eos_token",
                                   "<|audio_eos|>")

+        audio_token_id = vocab[audio_token]
+        audio_bos_id = vocab[audio_bos_token]
+        audio_eos_id = vocab[audio_eos_token]
+
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
@@ -208,22 +214,18 @@ def _get_prompt_replacements(
             audio_output_lengths = audio_output_lens.tolist()

         def get_replacement_qwen2_audio(item_idx: int):
-            num_placeholders = audio_output_lengths[item_idx]
-            if num_placeholders == 0:
+            num_features = audio_output_lengths[item_idx]
+            if num_features == 0:
                 audios = mm_items.get_items("audio", AudioProcessorItems)
                 audio = audios.get(item_idx)
                 raise ValueError(
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")

-            audio_tokens = audio_token * num_placeholders
+            audio_tokens = [audio_token_id] * num_features

             return PromptReplacementDetails(
-                full="".join([
-                    audio_bos_token,
-                    audio_tokens,
-                    audio_eos_token,
-                ]),
+                full=[audio_bos_id] + audio_tokens + [audio_eos_id],
                 features=audio_tokens,
             )

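
As in the other files, full is the entire token sequence spliced into the prompt, while features marks the subset that the audio encoder's output embeddings overwrite; the bos/eos delimiters remain ordinary text tokens. With illustrative IDs:

    audio_bos_id, audio_token_id, audio_eos_id = 200, 201, 202  # illustrative
    num_features = 4

    audio_tokens = [audio_token_id] * num_features
    full = [audio_bos_id] + audio_tokens + [audio_eos_id]  # spliced into the prompt
    features = audio_tokens                                # replaced by audio embeddings

    assert len(full) == num_features + 2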
12 changes: 7 additions & 5 deletions vllm/model_executor/models/qwen2_vl.py
@@ -953,12 +953,14 @@ def _get_prompt_replacements(
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()

         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
         placeholder = {
-            "image": hf_processor.image_token,
-            "video": hf_processor.video_token,
+            "image": vocab[hf_processor.image_token],
+            "video": vocab[hf_processor.video_token],
         }

         merge_length = image_processor.merge_size**2
@@ -967,13 +969,13 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
             grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, torch.Tensor)

-            num_tokens = grid_thw.prod().item() // merge_length
-            return placeholder[modality] * num_tokens
+            num_tokens = int(grid_thw.prod()) // merge_length
+            return [placeholder[modality]] * num_tokens

         return [
             PromptReplacement(
                 modality=modality,
-                target=placeholder[modality],
+                target=[placeholder[modality]],
                 replacement=partial(get_replacement_qwen2vl,
                                     modality=modality),
             ) for modality in ("image", "video")
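
The merge_length arithmetic deserves a worked example: with the usual Qwen2-VL merge_size of 2, every 2x2 window of patches collapses into one visual token, so a grid of t x h x w patches yields t*h*w / 4 placeholder tokens. The int(...) cast also replaces .item() so the floor division runs on a plain Python int. A sketch with illustrative numbers:

    import torch

    merge_size = 2
    merge_length = merge_size ** 2        # 4 patches per visual token

    grid_thw = torch.tensor([1, 28, 28])  # illustrative: 1 frame, 28x28 patches
    num_tokens = int(grid_thw.prod()) // merge_length
    assert num_tokens == 196              # 784 patches -> 196 placeholder tokens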
10 changes: 7 additions & 3 deletions vllm/model_executor/models/ultravox.py
@@ -205,16 +205,20 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        placeholder = hf_processor.audio_token_replacement  # type: ignore
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        replacement_id = vocab[
+            hf_processor.audio_token_replacement]  # type: ignore

         def get_replacement_ultravox(item_idx: int):
             audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
-            return placeholder * audio_token_len
+            return [replacement_id] * int(audio_token_len)  # type: ignore

         return [
             PromptReplacement(
                 modality="audio",
-                target="<|audio|>",
+                target='<|audio|>',
                 replacement=get_replacement_ultravox,
             )
         ]
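
The new int(audio_token_len) cast normalizes the length to a plain Python int, since values in out_mm_kwargs may arrive as 0-d tensors rather than integers (hence the # type: ignore). A toy illustration:

    import torch

    audio_token_len = torch.tensor(5)  # as it might arrive in out_mm_kwargs
    replacement_id = 128002            # illustrative token ID

    tokens = [replacement_id] * int(audio_token_len)
    assert tokens == [replacement_id] * 5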
