[Kernel] Remove scaled_fp8_quant kernel padding footgun #6842

Merged · 8 commits · Jul 30, 2024 · Changes from 5 commits
24 changes: 13 additions & 11 deletions vllm/_custom_ops.py
@@ -307,7 +307,7 @@ def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 def scaled_fp8_quant(
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
-    batch_dim_padding: Optional[int] = None,
+    num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -317,15 +317,15 @@ def scaled_fp8_quant(
     This function supports both static and dynamic quantization: If you
     provide the scale, it will use static scaling and if you omit it,
     the scale will be determined dynamically. The function also allows
-    optional padding of the output tensor for downstream kernels that
+    optional padding of the output tensors for downstream kernels that
     will benefit from padding.

     Args:
         input: The input tensor to be quantized to FP8
         scale: Optional scaling factor for the FP8 quantization
         scale_ub: Optional upper bound for scaling factor in dynamic
             per token case
-        batch_dim_padding: If specified, pad the first dimension
+        num_token_padding: If specified, pad the first dimension
             of the output to at least this value.
         use_per_token_if_dynamic: Whether to do per_tensor or per_token
             in the dynamic quantization case.
@@ -334,16 +334,16 @@ def scaled_fp8_quant(
         Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
             scaling factor.
     """
-    if batch_dim_padding:
-        shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:])
-        output = torch.empty(shape,
-                             device=input.device,
-                             dtype=torch.float8_e4m3fn)
-    else:
-        output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+    # This code assumes batch_dim and num_tokens are flattened
+    assert (input.ndim == 2)
+    shape = input.shape
+    if num_token_padding:
+        shape = (max(num_token_padding, input.shape[0]), shape[1])
+    output = torch.empty(shape, device=input.device, dtype=torch.float8_e4m3fn)

     if scale is None:
         if use_per_token_if_dynamic:
-            scale = torch.empty((input.numel() // input.shape[-1], 1),
+            scale = torch.empty((shape[0], 1),
                                 device=input.device,
                                 dtype=torch.float32)
             torch.ops._C.dynamic_per_token_scaled_fp8_quant(
@@ -352,6 +352,8 @@ def scaled_fp8_quant(
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
+        # num_token_padding not implemented for this case
+        assert (scale.numel() == 1 or num_token_padding is None)
         torch.ops._C.static_scaled_fp8_quant(output, input, scale)

     return output, scale