Automatic device mapping support (#1042)
* Initial automatic device mapping

* Reverse order

* Automatic device mapping

* Checks in other model types

* Support isq

* Support uqff

* Add for the rest of the models

* Clippy

* Update all the apis

* Update for clippy 1.84

* Update docs

* Ensure we always have the same activation type size

* Clippy
EricLBuehler authored Jan 9, 2025
1 parent 524a8d9 commit 729b81b
Showing 66 changed files with 2,572 additions and 604 deletions.
47 changes: 1 addition & 46 deletions Cargo.lock


1 change: 0 additions & 1 deletion Cargo.toml
@@ -47,7 +47,6 @@ half = "2.4.0"
rayon = "1.1.0"
url = "2.5.2"
data-url = "0.3.1"
buildstructor = "0.5.4"
float8 = "0.1.2"
regex = "1.10.6"
metal = { version = "0.27.0", features = ["mps"] }
2 changes: 1 addition & 1 deletion README.md
@@ -98,12 +98,12 @@ Mistral.rs supports several model categories:
- Grammar support with JSON Schema, Regex, Lark, and Guidance via [LLGuidance library](https://github.com/microsoft/llguidance)
- [ISQ](docs/ISQ.md) (In situ quantization): run `.safetensors` models directly from 🤗 Hugging Face by quantizing in-place
- Enhance performance with an [imatrix](docs/IMATRIX.md)!
- Automatic [device mapping](docs/DEVICE_MAPPING.md) to easily load and run models across multiple GPUs and the CPU.
**Fast**:
- Apple silicon support: ARM NEON, Accelerate, Metal
- Accelerated CPU inference with MKL, AVX support
- CUDA support with flash attention and cuDNN.
- [Device mapping](docs/DEVICE_MAPPING.md): load and run some layers on the device and the rest on the CPU.
**Quantization**:
- [Details](docs/QUANTS.md)
14 changes: 14 additions & 0 deletions docs/DEVICE_MAPPING.md
@@ -1,5 +1,19 @@
# Device mapping

In mistral.rs, device mapping is **automatically managed** to be as performant and easy as possible. Automatic device mapping is enabled
by default in the CLI/server and Python API and does not make any changes when the model fits entirely on the GPU.

Automatic device mapping works by prioritizing loading the model into GPU memory; any parts that do not fit are loaded into CPU memory.
Model components that benefit greatly from GPU acceleration, such as the vision portions of vision models, are automatically
prioritized to stay on the GPU.

If you want to manually device map the model (not recommended), please continue reading.

> [!NOTE]
> Manual device mapping is deprecated in favor of automatic device mapping due to the possibility of user error in manual configuration.

## Manual device mapping

There are 2 ways to do device mapping:
1) Specify the number of layers to put on the GPU - this uses the GPU with ordinal 0.
2) Specify the ordinals and number of layers - this allows for cross-GPU device mapping.
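
For reference, here is a minimal sketch of how these settings are expressed with the Rust types this commit introduces (`DeviceMapSetting`, `DeviceMapMetadata`, and `DeviceLayerMapMetadata`, as used in the mistralrs-bench changes below). The helper function and layer counts are illustrative only, and wiring the resulting setting into a loader is not shown:

```rust
use mistralrs_core::{DeviceLayerMapMetadata, DeviceMapMetadata, DeviceMapSetting};

// Illustrative helper: pick automatic mapping unless explicit (ordinal, layers)
// pairs were provided, mirroring the logic in mistralrs-bench/src/main.rs.
fn choose_mapper(device_layers: Option<Vec<(usize, usize)>>) -> DeviceMapSetting {
    match device_layers {
        // Default: let mistral.rs place layers across GPUs/CPU automatically.
        None => DeviceMapSetting::Auto,
        // Manual mapping: one entry per GPU ordinal with its layer count.
        Some(pairs) => DeviceMapSetting::Map(DeviceMapMetadata::from_num_device_layers(
            pairs
                .into_iter()
                .map(|(ordinal, layers)| DeviceLayerMapMetadata { ordinal, layers })
                .collect(),
        )),
    }
}
```

Option 1 above corresponds to a single entry with ordinal 0; option 2 corresponds to one entry per GPU ordinal.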
5 changes: 5 additions & 0 deletions docs/TOPOLOGY.md
@@ -2,6 +2,11 @@

<h3>Quantization and device mapping in one file.</h3>

> [!NOTE]
> Manual device mapping is deprecated in favor of automatic device mapping due to the possibility of user error in manual configuration.
> The topology system will remain and will be used only for quantization settings.
> Please see the [device mapping documentation](DEVICE_MAPPING.md) for more information.

Use a simple model topology to configure ISQ and device mapping on a *per-layer* basis with a single [YAML file](../topologies/isq_and_device.yml) (examples [here](../topologies))!

To support per-layer mix of ISQ, Mistral.rs supports loading a model topology YAML file. This YAML file is formatted as follows:
2 changes: 1 addition & 1 deletion examples/python/deepseekv2.py
@@ -20,4 +20,4 @@
)
)
print(res.choices[0].message.content)
print(res.usage)
print(res.usage)
19 changes: 9 additions & 10 deletions mistralrs-bench/src/main.rs
@@ -3,10 +3,10 @@ use clap::Parser;
use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table};
use mistralrs_core::{
get_model_dtype, initialize_logging, paged_attn_supported, parse_isq_value, Constraint,
DefaultSchedulerMethod, DeviceLayerMapMetadata, DeviceMapMetadata, DrySamplingParams, IsqType,
Loader, LoaderBuilder, MemoryGpuConfig, MistralRs, MistralRsBuilder, ModelSelected,
NormalRequest, PagedAttentionConfig, Request, RequestMessage, Response, SamplingParams,
SchedulerConfig, TokenSource, Usage,
DefaultSchedulerMethod, DeviceLayerMapMetadata, DeviceMapMetadata, DeviceMapSetting,
DrySamplingParams, IsqType, Loader, LoaderBuilder, MemoryGpuConfig, MistralRs,
MistralRsBuilder, ModelSelected, NormalRequest, PagedAttentionConfig, Request, RequestMessage,
Response, SamplingParams, SchedulerConfig, TokenSource, Usage,
};
use std::sync::Arc;
use std::{fmt::Display, num::NonZeroUsize};
@@ -389,10 +389,9 @@ fn main() -> anyhow::Result<()> {
let mapper = if let Some(device_layers) = args.num_device_layers {
if device_layers.len() == 1 && device_layers[0].parse::<usize>().is_ok() {
let layers = device_layers[0].parse::<usize>().unwrap();
DeviceMapMetadata::from_num_device_layers(vec![DeviceLayerMapMetadata {
ordinal: 0,
layers,
}])
DeviceMapSetting::Map(DeviceMapMetadata::from_num_device_layers(vec![
DeviceLayerMapMetadata { ordinal: 0, layers },
]))
} else {
let mut mapping = Vec::new();
for layer in device_layers {
@@ -416,10 +415,10 @@ layers: num,
layers: num,
});
}
DeviceMapMetadata::from_num_device_layers(mapping)
DeviceMapSetting::Map(DeviceMapMetadata::from_num_device_layers(mapping))
}
} else {
DeviceMapMetadata::dummy()
DeviceMapSetting::Auto
};

// Allocate 0.5 GB of CPU memory just as a placeholder.
6 changes: 3 additions & 3 deletions mistralrs-core/Cargo.toml
@@ -59,7 +59,6 @@ derive_more = { version = "0.99.17", default-features = false, features = [
] }
akin = "0.4.0"
variantly = "0.4.0"
buildstructor.workspace = true
tracing-subscriber.workspace = true
derive-new = "0.7.0"
itertools = "0.13.0"
@@ -82,14 +81,15 @@ float8.workspace = true
llguidance = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc", default-features = false, features = ["lark"] }
toktrie_hf_tokenizers = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc" }
objc = { version = "0.2.7", optional = true }
metal = { workspace = true, optional = true }

[features]
pyo3_macros = ["pyo3"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda", "mistralrs-quant/cuda", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/cuda", "float8/mistralrs_cudarc_fork"]
cudnn = ["candle-core/cudnn"]
metal = ["candle-core/metal", "candle-nn/metal", "dep:objc", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/metal"]
metal = ["candle-core/metal", "candle-nn/metal", "mistralrs-quant/metal", "dep:objc", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/metal", "dep:metal"]
flash-attn = ["cuda", "dep:candle-flash-attn"]
accelerate = ["candle-core/accelerate", "candle-nn/accelerate"]
accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "mistralrs-quant/accelerate"]
mkl = ["candle-core/mkl", "candle-nn/mkl"]

[build-dependencies]
2 changes: 1 addition & 1 deletion mistralrs-core/src/amoe/macros.rs
@@ -15,7 +15,7 @@ macro_rules! get_delta_from_lora_ab {
} else {
1.0
};
(proj_b.matmul(&proj_a)? * scale)?
(MatMul.matmul(&proj_b, &proj_a)? * scale)?
}};
}
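
For context, a hedged sketch of the pattern this hunk adopts: the product is routed through the shared `MatMul` wrapper (re-exported from `mistralrs_quant`, as the attention.rs hunk below shows) rather than calling `Tensor::matmul` directly, presumably so crate-wide dispatch settings such as `get_use_matmul_via_f16` apply uniformly. The free function below is illustrative, not part of the commit:

```rust
use candle_core::{Result, Tensor};
use mistralrs_quant::MatMul;

// Illustrative: compute a scaled LoRA delta through the MatMul wrapper
// instead of calling Tensor::matmul directly, mirroring this hunk's change.
fn lora_delta(proj_a: &Tensor, proj_b: &Tensor, scale: f64) -> Result<Tensor> {
    MatMul.matmul(proj_b, proj_a)? * scale
}
```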

7 changes: 2 additions & 5 deletions mistralrs-core/src/attention.rs
@@ -3,13 +3,10 @@
#[cfg(feature = "metal")]
use std::sync::atomic::AtomicUsize;

use crate::{
cublaslt::CUBLASLT_HANDLE,
layers::{get_use_matmul_via_f16, MatMul},
pipeline::text_models_inputs_processor::FlashParams,
};
use crate::{cublaslt::CUBLASLT_HANDLE, pipeline::text_models_inputs_processor::FlashParams};

use candle_core::{Device, Result, Tensor};
use mistralrs_quant::{get_use_matmul_via_f16, MatMul};

#[cfg(feature = "metal")]
/// Initial, sentinel value is usize::MAX