Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[0/N] Rename MultiModalInputs to MultiModalKwargs #10040

Merged
merged 5 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/design/multimodal/multimodal_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ Base Classes

.. autodata:: vllm.multimodal.MultiModalDataDict

.. autoclass:: vllm.multimodal.MultiModalInputs
.. autoclass:: vllm.multimodal.MultiModalKwargs
:members:
:show-inheritance:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from PIL.Image import Image

from vllm.inputs import InputContext, token_inputs
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer

from .....conftest import IMAGE_ASSETS
Expand Down Expand Up @@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert isinstance(mapped_img_data, MultiModalInputs)
assert isinstance(mapped_img_data, MultiModalKwargs)
assert "pixel_values" in mapped_img_data
assert mapped_img_data["pixel_values"].shape == expected_shape

Expand Down
22 changes: 11 additions & 11 deletions tests/multimodal/test_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import torch

from vllm.multimodal.base import MultiModalInputs, NestedTensors
from vllm.multimodal.base import MultiModalKwargs, NestedTensors


def assert_nested_tensors_equal(expected: NestedTensors,
Expand All @@ -13,40 +13,40 @@ def assert_nested_tensors_equal(expected: NestedTensors,
assert_nested_tensors_equal(expected_item, actual_item)


def assert_multimodal_inputs_equal(expected: MultiModalInputs,
actual: MultiModalInputs):
def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
actual: MultiModalKwargs):
assert set(expected.keys()) == set(actual.keys())
for key in expected:
assert_nested_tensors_equal(expected[key], actual[key])


def test_multimodal_input_batch_single_tensor():
t = torch.rand([1, 2])
result = MultiModalInputs.batch([{"image": t}])
result = MultiModalKwargs.batch([{"image": t}])
assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})


def test_multimodal_input_batch_multiple_tensors():
a = torch.rand([1, 1, 2])
b = torch.rand([1, 1, 2])
c = torch.rand([1, 1, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})


def test_multimodal_input_batch_multiple_heterogeneous_tensors():
a = torch.rand([1, 2, 2])
b = torch.rand([1, 3, 2])
c = torch.rand([1, 4, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": [a, b, c]})


def test_multimodal_input_batch_nested_tensors():
a = torch.rand([2, 3])
b = torch.rand([2, 3])
c = torch.rand([2, 3])
result = MultiModalInputs.batch([{
result = MultiModalKwargs.batch([{
"image": [a]
}, {
"image": [b]
Expand All @@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
a = torch.rand([1, 2, 3])
b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal(
result,
{"image": [torch.stack([a, b]), c.unsqueeze(0)]})
Expand All @@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3])
d = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
assert_multimodal_inputs_equal(
result,
{"image": torch.stack([torch.stack([a, b]),
Expand All @@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
b = torch.rand([1, 3, 3])
c = torch.rand([1, 4, 3])

result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})

result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}])
result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
4 changes: 2 additions & 2 deletions vllm/model_executor/models/chatglm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.base import MultiModalData
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
Expand Down Expand Up @@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
raise
pixel_values = raw_batch_data['images']

return MultiModalInputs({'pixel_values': pixel_values})
return MultiModalKwargs({'pixel_values': pixel_values})


def merge_glm_vision_embeddings(
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/fuyu.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
Expand Down Expand Up @@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
])

# image has been processed with prompt in input processor
return MultiModalInputs({"pixel_values": data})
return MultiModalKwargs({"pixel_values": data})


@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
Expand Down
10 changes: 5 additions & 5 deletions vllm/model_executor/models/h2ovl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
token_inputs)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.utils import is_list_of

Expand Down Expand Up @@ -324,12 +324,12 @@ def input_mapper(
data: object,
*,
max_dynamic_patch: Optional[int] = None,
) -> MultiModalInputs:
) -> MultiModalKwargs:

# NOTE: Preprocessing for the image data is done in the
# 'input_processor' function during actual inference.
if isinstance(data, dict):
return MultiModalInputs(data)
return MultiModalKwargs(data)

# The section below is only used with dummy data during
# memory profiling.
Expand All @@ -347,7 +347,7 @@ def input_mapper(
pixel_values = [image_pixel_values_mapper(img) for img in data]

else:
return MultiModalInputs({"image_embeds": data})
return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
Expand All @@ -359,7 +359,7 @@ def input_mapper(
return_tensors="pt",
)[0]

return MultiModalInputs({
return MultiModalKwargs({
"pixel_values": pixel_values,
"image_token_id": image_token_id
})
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.processor import cached_get_processor
Expand Down Expand Up @@ -101,7 +101,7 @@ def input_mapper_for_idefics3(
logger.error("Failed to process image (%s)", data)
raise

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


def _resize_output_size(height: int,
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
InternVisionPatchModel)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
Expand Down Expand Up @@ -346,7 +346,7 @@ def input_mapper(
# we can't stack here because images may have different num_patches
data = [image_pixel_values_mapper(img) for img in data]
else:
return MultiModalInputs({"image_embeds": data})
return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
Expand All @@ -355,7 +355,7 @@ def input_mapper(
add_special_tokens=False,
return_tensors="pt")[0]

return MultiModalInputs({
return MultiModalKwargs({
"pixel_values": data,
"image_token_id": image_token_id
})
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/minicpmv.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from vllm.model_executor.models.utils import LLMWrapper
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData
Expand Down Expand Up @@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
batch_data["slice_start_id"] = data[0]["slice_start_id"]
batch_data["slice_end_id"] = data[0]["slice_end_id"]

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
Expand Down
2 changes: 1 addition & 1 deletion vllm/model_executor/models/mllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ def sample(

def _parse_and_validate_image_input(self, **kwargs: object):
# tensor with the same shape will be batched together by
# MultiModalInputs.batch, so pixel_values here can be:
# MultiModalKwargs.batch, so pixel_values here can be:
# - List[List[torch.Tensor]]:
# with shape (num_tiles, 3, image_res, image_res)
# - List[torch.Tensor]:
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/molmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SequenceData)
Expand Down Expand Up @@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
ctx: InputContext,
data: object,
):
return MultiModalInputs(data)
return MultiModalKwargs(data)


def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
Expand Down
10 changes: 5 additions & 5 deletions vllm/model_executor/models/pixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
from vllm.sequence import IntermediateTensors, SequenceData
Expand Down Expand Up @@ -94,16 +94,16 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,


def input_mapper_for_pixtral(ctx: InputContext,
data: object) -> MultiModalInputs:
"""Maps the input data to its MultiModalInputs (if any).
data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalKwargs (if any).
Args:
ctx: Context of the loaded model.
data: data potentially containing image/image embeddings to be mapped
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModalInputs containing the stacked normalized images tensor or
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
Expand All @@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
dtype=torch.float16)
images.append(image)

return MultiModalInputs({"images": images})
return MultiModalKwargs({"images": images})


def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
Expand Down
12 changes: 6 additions & 6 deletions vllm/model_executor/models/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.utils import is_list_of
Expand Down Expand Up @@ -722,16 +722,16 @@ def input_processor_for_qwen(ctx: InputContext,
multi_modal_data=multi_modal_data)


def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
"""Maps the input data to its MultiModalInputs (if any).
def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalKwargs (if any).
Args:
ctx: Context of the loaded model.
data: data potentially containing image/image embeddings to be mapped
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModalInputs containing the stacked normalized images tensor or
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
Expand All @@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
logger.warning(
"Images were provided but this model has no visual config; "
"multimodal inputs will not be forwarded to the model.")
return MultiModalInputs()
return MultiModalKwargs()

model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
Expand Down Expand Up @@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
data = [data]
transformed_images = [transform(datum) for datum in data]
pixel_values = torch.stack(transformed_images, dim=0)
return MultiModalInputs({"pixel_values": pixel_values})
return MultiModalKwargs({"pixel_values": pixel_values})


def build_normalization_transform(image_size: int) -> transforms.Compose:
Expand Down
8 changes: 4 additions & 4 deletions vllm/model_executor/models/qwen2_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData

Expand Down Expand Up @@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
def input_mapper_for_qwen2_audio(
ctx: InputContext,
multi_modal_data: Union[np.ndarray, List[np.ndarray]],
) -> MultiModalInputs:
) -> MultiModalKwargs:
"""Input mapper for Qwen2-Audio."""
if not isinstance(multi_modal_data, list):
multi_modal_data = [multi_modal_data]

if len(multi_modal_data) == 0:
return MultiModalInputs()
return MultiModalKwargs()

processor = cached_get_processor(ctx.model_config.model)
audio_feature_extractor = processor.feature_extractor
Expand All @@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
logger.error("Failed to process audio (%s)", multi_modal_data)
raise

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
Expand Down
Loading