Commit

update condition slightly
robertgshaw2-redhat committed Jul 17, 2024
1 parent 9f9d039 commit 5f2cb45
Showing 1 changed file with 2 additions and 3 deletions.
@@ -36,9 +36,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool):
     def process_weights_after_loading(self, layer) -> None:
         # If per tensor, when we have a fused module (e.g. QKV) with per
         # tensor scales (thus N scales being passed to the kernel),
-        # requantize so we can always run per tensor with torch._scaled_mm
-        if (self.strategy == QuantizationStrategy.TENSOR
-                or not self.cutlass_fp8_supported):
+        # requantize so we can always run per tensor
+        if self.strategy == QuantizationStrategy.TENSOR:
             max_w_scale, weight = requantize_with_max_scale(
                 weight=layer.weight,
                 weight_scale=layer.weight_scale,
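
The comment in the hunk explains the motivation: a fused module (e.g. QKV) with per-tensor scales passes N scales to the kernel, so the weights are requantized to a single shared max scale. Below is a minimal, hypothetical sketch of that idea; it is not vLLM's requantize_with_max_scale, and the logical_widths bookkeeping and clamping to the FP8 range are assumptions for illustration.

    import torch

    def requantize_with_max_scale_sketch(weight: torch.Tensor,
                                         weight_scale: torch.Tensor,
                                         logical_widths: list[int]):
        # weight: fused FP8 weight of shape [sum(logical_widths), in_features]
        # weight_scale: one scale per fused shard, shape [len(logical_widths)]
        max_w_scale = weight_scale.max()
        fp8_info = torch.finfo(torch.float8_e4m3fn)

        # Dequantize each shard with its own scale into a float32 buffer.
        dequant = torch.empty(weight.shape, dtype=torch.float32,
                              device=weight.device)
        start = 0
        for idx, width in enumerate(logical_widths):
            end = start + width
            dequant[start:end] = (weight[start:end].to(torch.float32)
                                  * weight_scale[idx])
            start = end

        # Requantize the whole tensor with the shared max scale so the kernel
        # can run with a single per-tensor scale.
        requant = (dequant / max_w_scale).clamp(fp8_info.min, fp8_info.max)
        return max_w_scale, requant.to(torch.float8_e4m3fn)

A caller would then replace the per-shard scales with the single returned max_w_scale, which matches the per-tensor path the new condition guards.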
