# Patch summary (free text placed ahead of the first `diff --git` header).
#
# 1. .buildkite/test-pipeline.yaml, "Speculative decoding tests" step:
#    - the time note on the label changes from 30min to 40min;
#    - vllm/model_executor/models/eagle.py is added to source_file_dependencies;
#    - a `pytest -v -s spec_decode/e2e/test_eagle_correctness.py` command is added.
#
# 2. vllm/model_executor/models/eagle.py:
#    - DummyInputLayerNorm gains an __init__ taking optional `weight` and `bias`;
#      each is wrapped in nn.Parameter when it is not None, otherwise the
#      attribute is set to None. forward() still returns its input unchanged.
#    - The replacement of layers[0].input_layernorm now passes the existing
#      input_layernorm.weight into DummyInputLayerNorm. Only `weight` is passed
#      at this call site; `bias` keeps its default of None.
#
# NOTE(review): this preamble assumes the patch tool skips any text before the
# first diff header (as `git apply` and `patch` are documented to do) — confirm
# for the tooling that consumes this file.
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 393912881bca3..74b287c7adbfa 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -231,13 +231,15 @@ steps: - pytest -v -s test_logits_processor.py - pytest -v -s model_executor/test_guided_processors.py -- label: Speculative decoding tests # 30min +- label: Speculative decoding tests # 40min source_file_dependencies: - vllm/spec_decode - tests/spec_decode + - vllm/model_executor/models/eagle.py commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index eb7b5af19ae96..948560b4906b8 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -19,6 +19,11 @@ class DummyInputLayerNorm(nn.Module): + def __init__(self, weight=None, bias=None): + super().__init__() + self.weight = nn.Parameter(weight) if weight is not None else None + self.bias = nn.Parameter(bias) if bias is not None else None + def forward(self, x): return x @@ -69,7 +74,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Modify layer normalization and residual connections as suggested # in the EAGLE framework: https://github.com/SafeAILab/EAGLE - self.model.model.layers[0].input_layernorm = DummyInputLayerNorm() + # While weights and biases are generally not needed, + # they are retained here to support certain unit tests + # (e.g., spec_decode/e2e/test_eagle_correctness.py). + self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( + weight=self.model.model.layers[0].input_layernorm.weight) self.model.model.norm = DummyOutputNorm() self.orig_vocab_size = config.vocab_size