Hot fix for the VM allocator issue with Disco (#43)

* make sure to switch VM allocators on all devices to eager recycle after profiling * add debug log to benchmark script
octoml · Oct 31, 2023 · 5ab11f5 · 5ab11f5
1 parent 2d30b96
commit 5ab11f5
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/serve/benchmarks/benchmark_throughput.py b/serve/benchmarks/benchmark_throughput.py
@@ -15,6 +15,7 @@
 )
 from mlc_serve.engine.local import LocalProcessInferenceEngine
 from mlc_serve.model.paged_cache_model import PagedCacheModelModule
+from mlc_serve.run import setup_logging
 
 import pandas as pd
 
@@ -163,7 +164,9 @@ def main(args: argparse.Namespace):
         default=None,
         help="Append the current result to the given path if provided.",
     )
+    parser.add_argument("--debug-logging", action="store_true")
     args = parser.parse_args()
+    setup_logging(args)
 
     args.model, args.quantization = args.local_id.rsplit("-", 1)
     utils.argparse_postproc_common(args)

diff --git a/serve/mlc_serve/model/paged_cache_model.py b/serve/mlc_serve/model/paged_cache_model.py
@@ -303,7 +303,7 @@ def sample(logits, sampling_params, vocab_size):
 
 
 def load_disco_module(artifact_path, lib_path, num_shards):
-    sess = di.ThreadedSession(num_workers=num_shards)
+    sess = di.ProcessSession(num_workers=num_shards)
     devices = range(num_shards)
     sess.init_ccl("nccl", *devices)
     module = sess.load_vm_module(lib_path)
@@ -440,6 +440,7 @@ def __init__(
         self.dev = dev
         self.vocab_size = vocab_size
         self.sliding_window = sliding_window
+        self.num_shards = num_shards
 
         if sliding_window:
             self.block_sliding_window = sliding_window // CacheManager.block_size
@@ -458,6 +459,11 @@ def get_used_memory(self):
                 tvm.device("cuda", 0)
             ).debug_get_from_remote(0)
 
+            # TODO: temp hack to switch the VM allocator to eager recycling mode on all devices
+            for i in range(1, self.num_shards):
+                get_used_memory_func(
+                    tvm.device("cuda", i)
+                ).debug_get_from_remote(i)
         else:
             params = self.params