diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 006bae70a..2e44d6dc5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -276,8 +276,6 @@ jobs: run: | robocopy build_deps\_install\bin .\build\Release zlib.dll robocopy build\bin\Release .\build\Release llama.dll - robocopy ext_libs .\build\Release libcrypto-3-x64.dll - robocopy ext_libs .\build\Release libssl-3-x64.dll 7z a nitro.zip .\build\Release\* - uses: actions/upload-release-asset@v1.0.1 @@ -325,7 +323,7 @@ jobs: cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON cmake --build . --config Release -j 4 - name: Pack artifacts @@ -336,8 +334,6 @@ jobs: echo %PATH% robocopy build_deps\_install\bin .\build\Release zlib.dll robocopy build\bin\Release .\build\Release llama.dll - robocopy ext_libs .\build\Release libcrypto-3-x64.dll - robocopy ext_libs .\build\Release libssl-3-x64.dll 7z a nitro.zip .\build\Release\* - uses: actions/upload-release-asset@v1.0.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 9730e1eac..89e3a88fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,9 @@ else() set(CMAKE_CXX_STANDARD 14) endif() +# llama cpp server need llava example to work, this is for llama cpp server +set(LLAMA_BUILD_EXAMPLES ON) + set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(OPENSSL_USE_STATIC_LIBS TRUE) @@ -48,7 +51,7 @@ add_executable(${PROJECT_NAME} main.cc) # # and comment out the following lines find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama clip ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 24e10d9be..f7b738724 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -10,6 +10,22 @@ #include using namespace inferences; +using json = nlohmann::json; + +// To store state of each inference request +struct State { + bool isStopped = false; + int task_id; + llamaCPP *instance; + + State(int tid, llamaCPP *inst) : task_id(tid), instance(inst) {} +}; + +std::shared_ptr createState(int task_id, llamaCPP *instance) { + return std::make_shared(task_id, instance); +} + +// -------------------------------------------- std::string create_return_json(const std::string &id, const std::string &model, const std::string &content, @@ -41,71 +57,40 @@ std::string create_return_json(const std::string &id, const std::string &model, } void llamaCPP::warmupModel() { - auto lock = llama.lock(); - llama.rewind(); - llama_reset_timings(llama.ctx); - - llama.prompt = "hello"; - llama.params.n_predict = 1; - llama.loadPrompt(); - llama.beginCompletion(); - size_t stop_pos = std::string::npos; - - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = - token_with_probs.tok == -1 - ? 
"" - : llama_token_to_piece(llama.ctx, token_with_probs.tok); - - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } - - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); - } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } - auto probs = llama.generated_token_probs; - if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(llama.ctx, llama.stopping_word, false); - probs = std::vector( - llama.generated_token_probs.begin(), - llama.generated_token_probs.end() - stop_word_toks.size()); - } - - LOG_INFO << "Warm-up generated text:" << llama.generated_text; - LOG_INFO << "Warm-up finish"; - return; +// json pseudo; +// +// pseudo["prompt"] = "Hello"; +// pseudo["n_predict"] = 10; +// const int task_id = llama.request_completion(pseudo, false); +// std::string completion_text; +// task_result result = llama.next_result(task_id); +// if (!result.error && result.stop) { +// LOG_INFO << result.result_json.dump(-1, ' ', false, +// json::error_handler_t::replace); +// } +// return; } void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { - if (!model_loaded) { - Json::Value jsonResp; - jsonResp["message"] = "Model is not loaded yet"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } const auto &jsonBody = req->getJsonObject(); std::string formatted_output = "Below is a conversation between an AI system named ASSISTANT and USER\n"; + + json data; + json stopWords; + // To set default value + data["stream"] = true; + data["n_predict"] = 30; + if (jsonBody) { - llama.params.n_predict = (*jsonBody)["max_tokens"].asInt(); - llama.params.sampling_params.top_p = (*jsonBody)["top_p"].asFloat(); - llama.params.sampling_params.temp = (*jsonBody)["temperature"].asFloat(); - llama.params.sampling_params.frequency_penalty = - (*jsonBody)["frequency_penalty"].asFloat(); - llama.params.sampling_params.presence_penalty = - (*jsonBody)["presence_penalty"].asFloat(); + data["n_predict"] = (*jsonBody)["max_tokens"].asInt(); + data["top_p"] = (*jsonBody)["top_p"].asFloat(); + data["temperature"] = (*jsonBody)["temperature"].asFloat(); + data["frequency_penalty"] = (*jsonBody)["frequency_penalty"].asFloat(); + data["presence_penalty"] = (*jsonBody)["presence_penalty"].asFloat(); const Json::Value &messages = (*jsonBody)["messages"]; for (const auto &message : messages) { @@ -114,186 +99,93 @@ void llamaCPP::chatCompletion( formatted_output += role + ": " + content + "\n"; } formatted_output += "assistant:"; + + data["prompt"] = formatted_output; + for (const auto &stop_word : (*jsonBody)["stop"]) { + stopWords.push_back(stop_word.asString()); + } + // specify default stop words + stopWords.push_back("user:"); + stopWords.push_back("### USER:"); + data["stop"] = stopWords; } - this->llama.rewind(); + const int task_id = llama.request_completion(data, false); + LOG_INFO << "Resolved request for task_id:" << task_id; - llama_reset_timings(llama.ctx); + auto state = createState(task_id, this); - this->llama.prompt = formatted_output; - this->llama.params.antiprompt.clear(); - for (const auto &stop_word : (*jsonBody)["stop"]) { - llama.params.antiprompt.push_back(stop_word.asString()); - } - 
this->llama.params.antiprompt.push_back("user:"); - this->llama.params.antiprompt.push_back("### USER:"); - this->llama.loadPrompt(); - this->llama.beginCompletion(); - - const auto chunked_content_provider = - [this](char *pBuffer, std::size_t nBuffSize) -> std::size_t { - auto lock = this->llama.lock(); + auto chunked_content_provider = + [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t { if (!pBuffer) { LOG_INFO << "Connection closed or buffer is null. Reset context"; - lock.release(); - - llama_print_timings(llama.ctx); - this->llama.mutex.unlock(); - this->sent_count = 0; - this->sent_token_probs_index = 0; - // LOG_INFO << "Test end two time lol"; + state->instance->llama.request_cancel(state->task_id); return 0; } - // LOG_INFO << this->llama.has_next_token; - while (this->llama.has_next_token) { - try { - // LOG_INFO << this->llama.has_next_token; - const completion_token_output token_with_probs = - this->llama.doCompletion(); - if (token_with_probs.tok == -1 || this->llama.multibyte_pending > 0) { - return 0; - } - const std::string token_text = - llama_token_to_piece(llama.ctx, token_with_probs.tok); - - size_t pos = std::min(sent_count, this->llama.generated_text.size()); - - const std::string str_test = this->llama.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = this->llama.findStoppingStrings( - str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - is_stop_full = true; - this->llama.generated_text.erase(llama.generated_text.begin() + pos + - stop_pos, - this->llama.generated_text.end()); - pos = std::min(sent_count, this->llama.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = this->llama.findStoppingStrings( - str_test, token_text.size(), STOP_PARTIAL); - } - - if (stop_pos == std::string::npos || - // Send rest of the text if we are at the end of the generation - (!this->llama.has_next_token && !is_stop_full && stop_pos > 0)) { - const std::string to_send = - this->llama.generated_text.substr(pos, std::string::npos); - - sent_count += to_send.size(); - - std::vector probs_output = {}; - - if (this->llama.params.sampling_params.n_probs > 0) { - const std::vector to_send_toks = - llama_tokenize(llama.ctx, to_send, false); - size_t probs_pos = - std::min(sent_token_probs_index, - this->llama.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(sent_token_probs_index + to_send_toks.size(), - this->llama.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - this->llama.generated_token_probs.begin() + probs_pos, - this->llama.generated_token_probs.begin() + probs_stop_pos); - } - sent_token_probs_index = probs_stop_pos; - } - if (!to_send.empty() && - llama.has_next_token) { // NITRO : the patch here is important to - // make midway cutting possible - // const json data = format_partial_response(this->llama, to_send, - // probs_output); - // LOG_INFO << llama.has_next_token; - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - to_send) + - "\n\n"; - - LOG_VERBOSE("data stream", {{"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - return nRead; - } - } - - // std::this_thread::sleep_for(std::chrono::seconds(2)); - // LOG_INFO << this->llama.has_next_token; - if (!this->llama.has_next_token) { - // Generation is done, send extra information. 
- // const json data = format_final_response( - // this->llama, "", - // std::vector( - // this->llama.generated_token_probs.begin(), - // this->llama.generated_token_probs.begin() + - // sent_token_probs_index)); - // - - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - "", "stop") + - "\n\n" + "data: [DONE]" + "\n\n"; - - LOG_VERBOSE("data stream", {{"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - return nRead; - } - } catch (...) { - LOG_ERROR << "error inside while loop"; - } + if (state->isStopped) { + return 0; } - lock.release(); - llama_print_timings(llama.ctx); - this->llama.mutex.unlock(); - this->sent_count = 0; - this->sent_token_probs_index = 0; - // LOG_INFO << "Test end two time lol"; + task_result result = state->instance->llama.next_result(state->task_id); + if (!result.error) { + const std::string to_send = result.result_json["content"]; + const std::string str = + "data: " + + create_return_json(nitro_utils::generate_random_string(20), "_", + to_send) + + "\n\n"; + + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + + if (result.stop) { + const std::string str = + "data: " + + create_return_json(nitro_utils::generate_random_string(20), "_", "", + "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; + + LOG_VERBOSE("data stream", {{"to_send", str}}); + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + LOG_INFO << "reached result stop"; + state->isStopped = true; + state->instance->llama.request_cancel(state->task_id); + return nRead; + } + return nRead; + } else { + return 0; + } return 0; }; - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, "chat_completions.txt"); callback(resp); + + return; } void llamaCPP::embedding( const HttpRequestPtr &req, std::function &&callback) { - if (!model_loaded) { - Json::Value jsonResp; - jsonResp["message"] = "Model is not loaded yet"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } - - auto lock = llama.lock(); - const auto &jsonBody = req->getJsonObject(); - llama.rewind(); - llama_reset_timings(llama.ctx); + json prompt; if (jsonBody->isMember("content") != 0) { - llama.prompt = (*jsonBody)["content"].asString(); + prompt = (*jsonBody)["content"].asString(); } else { - llama.prompt = ""; + prompt = ""; } - llama.params.n_predict = 0; - llama.loadPrompt(); - llama.beginCompletion(); - llama.doCompletion(); - - const json data = format_embedding_response(llama); - auto resp = drogon::HttpResponse::newHttpResponse(); - resp->setBody(data.dump()); + const int task_id = + llama.request_completion({{"prompt", prompt}, {"n_predict", 0}}, false); + task_result result = llama.next_result(task_id); + std::string embeddingResp = result.result_json.dump(); + auto resp = nitro_utils::nitroHttpResponse(); + resp->setBody(embeddingResp); resp->setContentTypeString("application/json"); callback(resp); + return; } void llamaCPP::loadModel( @@ -303,11 +195,26 @@ void llamaCPP::loadModel( const auto &jsonBody = req->getJsonObject(); gpt_params params; + + params.cont_batching = false; + // By default will setting based on number of handlers + int drogon_thread = drogon::app().getThreadNum(); + LOG_INFO << "Drogon thread is:" << drogon_thread; if (jsonBody) { params.model = (*jsonBody)["llama_model_path"].asString(); 
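// Illustrative request body for this load-model handler, assembled from the
// fields read in this block; the values shown are examples only, not defaults:
//   {
//     "llama_model_path": "/path/to/model.gguf",
//     "ngl": 32,
//     "ctx_len": 2048,
//     "embedding": true,
//     "n_parallel": 4,        // optional, falls back to the drogon thread count
//     "cont_batching": false
//   }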
params.n_gpu_layers = (*jsonBody)["ngl"].asInt(); params.n_ctx = (*jsonBody)["ctx_len"].asInt(); params.embedding = (*jsonBody)["embedding"].asBool(); + // Check if n_parallel exists in jsonBody, if not, set to drogon_thread + if ((*jsonBody).isMember("n_parallel")) { + params.n_parallel = (*jsonBody)["n_parallel"].asInt(); + } else { + params.n_parallel = drogon_thread; + } + + params.cont_batching = (*jsonBody)["cont_batching"].asBool(); + // params.n_threads = (*jsonBody)["n_threads"].asInt(); + // params.n_threads_batch = params.n_threads; } #ifdef GGML_USE_CUBLAS LOG_INFO << "Setting up GGML CUBLAS PARAMS"; @@ -329,7 +236,7 @@ void llamaCPP::loadModel( }); // load the model - if (!llama.loadModel(params)) { + if (!llama.load_model(params)) { LOG_ERROR << "Error loading the model will exit the program"; Json::Value jsonResp; jsonResp["message"] = "Model loaded failed"; @@ -337,10 +244,22 @@ void llamaCPP::loadModel( resp->setStatusCode(drogon::k500InternalServerError); callback(resp); } + llama.initialize(); + Json::Value jsonResp; jsonResp["message"] = "Model loaded successfully"; model_loaded = true; auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - warmupModel(); + //warmupModel(); + + LOG_INFO << "Started background task here!"; + backgroundThread = std::thread(&llamaCPP::backgroundTask, this); callback(resp); } + +void llamaCPP::backgroundTask() { + while (model_loaded) { + model_loaded = llama.update_slots(); + } + return; +} diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 2b188867a..01fcab840 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1,17 +1,18 @@ -#include "log.h" -#include #if defined(_WIN32) #define NOMINMAX #endif #pragma once +#include "log.h" #include "utils/nitro_utils.h" #include #include #include -#include +#include +#include #include +#include // External @@ -20,6 +21,10 @@ #include "grammar-parser.h" #include "llama.h" +#include "../../llama.cpp/examples/llava/clip.h" + +#include "stb_image.h" + #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 @@ -29,7 +34,10 @@ // auto generated files (update with ./deps.sh) +#include #include +#include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -45,6 +53,155 @@ struct server_params { int32_t write_timeout = 600; }; +static bool server_verbose = false; + +#if SERVER_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do { \ + if (server_verbose) { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR_LLAMA(MSG, ...) \ + server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING_LLAMA(MSG, ...) \ + server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO_LLAMA(MSG, ...) 
\ + server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +// +// base64 utils (TODO: move to common in the future) +// + +static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static inline bool is_base64(uint8_t c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +static std::vector base64_decode(std::string const &encoded_string) { + int i = 0; + int j = 0; + int in_ = 0; + + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + std::vector ret; + + while (in_len-- && (encoded_string[in_] != '=') && + is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; + in_++; + if (i == 4) { + for (i = 0; i < 4; i++) { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = + ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = + ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) { + ret.push_back(char_array_3[i]); + } + i = 0; + } + } + + if (i) { + for (j = i; j < 4; j++) { + char_array_4[j] = 0; + } + + for (j = 0; j < 4; j++) { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = + ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = + ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} + +// +// parallel +// + +enum task_type { COMPLETION_TASK, CANCEL_TASK }; + +struct task_server { + int id; + int target_id; + task_type type; + json data; + bool infill_mode = false; +}; + +struct task_result { + int id; + bool stop; + bool error; + json result_json; +}; + +// TODO: can become bool if we can't find use of more states +enum slot_state { + IDLE, + PROCESSING, +}; + +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE, +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = + false; // remember the prompt to avoid reprocessing all prompt + + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict + + std::vector antiprompt; + + json input_prefix; + json input_suffix; +}; + +struct slot_image { + int32_t id; + + bool request_encode_image = false; + float *image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 img_data; + + std::string prefix_prompt; // before of this image +}; + // completion token output with probabilities struct completion_token_output { struct token_prob { @@ -54,6 +211,7 @@ struct completion_token_output { std::vector probs; llama_token tok; + std::string text_to_send; }; static size_t common_part(const std::vector &a, @@ -90,6 +248,7 @@ static size_t find_partial_stop_string(const std::string &stop, return std::string::npos; } +// TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) { std::string ret; @@ -155,111 +314,275 @@ probs_vector_to_json(const llama_context *ctx, return out; } -static bool server_verbose = false; +template +static T json_value(const json &body, const std::string &key, + const T &default_value) { + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? 
body.value(key, default_value) + : default_value; +} -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif +struct llama_client_slot { + int id; + int task_id = -1; -#define LOG_ERROR_LLAMA(MSG, ...) \ - server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING_LLAMA(MSG, ...) \ - server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO_LLAMA(MSG, ...) \ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + struct slot_params params; -struct llama_server_context { - bool stream = false; - bool has_next_token = false; - std::string generated_text; - std::vector generated_token_probs; + slot_state state = IDLE; + slot_command command = NONE; - size_t num_prompt_tokens = 0; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; - json prompt; - std::vector embd; - std::vector last_n_tokens; + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; - llama_model *model = nullptr; - llama_context *ctx = nullptr; - gpt_params params; - llama_sampling_context ctx_sampling; - int n_ctx; + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + int32_t multibyte_pending = 0; - grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + bool infill = false; + bool has_next_token = true; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; + std::string stopping_word; - int32_t multibyte_pending = 0; - std::mutex mutex; + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; - std::unique_lock lock() { - return std::unique_lock(mutex); - } + // multimodal + std::vector images; - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; - void rewind() { - params.antiprompt.clear(); - params.grammar.clear(); + double t_prompt_processing; // ms + double t_token_generation; // ms + + void reset() { num_prompt_tokens = 0; - num_tokens_predicted = 0; generated_text = ""; - generated_text.reserve(n_ctx); - generated_token_probs.clear(); truncated = false; stopped_eos = false; stopped_word = false; stopped_limit = false; stopping_word = ""; multibyte_pending = 0; - n_remain = 0; n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + + generated_token_probs.clear(); + + for (slot_image &img : images) { + free(img.image_embedding); + delete[] img.img_data.data; + img.prefix_prompt = ""; + } + + images.clear(); + // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? 
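    // Note: reset() only clears per-request generation state and frees attached
    // image buffers. cache_tokens and ctx_sampling are handled elsewhere: the
    // prompt cache is matched against cache_tokens in update_slots(), and
    // ctx_sampling is freed and re-initialized in launch_slot_with_data().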
+ } + + bool has_budget(gpt_params &global_params) { + n_remaining = -1; + if (params.n_predict != -1) { + n_remaining = params.n_predict - n_decoded; + } else if (global_params.n_predict != -1) { + n_remaining = global_params.n_predict - n_decoded; + } + return n_remaining > 0 || n_remaining == -1; // no budget || limitless + } + + bool available() const { return state == IDLE && command == NONE; } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == IDLE || state == PROCESSING) { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json{ + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", + t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", + 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() { + LOG_TEE("\n"); + LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " + "token, %8.2f tokens per second)\n", + __func__, t_prompt_processing, num_prompt_tokens_processed, + t_prompt_processing / num_prompt_tokens_processed, + 1e3 / t_prompt_processing * num_prompt_tokens_processed); + LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per " + "token, %8.2f tokens per second)\n", + __func__, t_token_generation, n_decoded, + t_token_generation / n_decoded, + 1e3 / t_token_generation * n_decoded); + LOG_TEE("%s: total time = %10.2f ms\n", __func__, + t_prompt_processing + t_token_generation); + } +}; + +struct llama_server_context { + llama_model *model = nullptr; + llama_context *ctx = nullptr; + + clip_ctx *clp_ctx = nullptr; + + gpt_params params; + + llama_batch batch; + + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; + + int32_t id_gen; + int32_t n_ctx; // total context for all clients / slots + + // system prompt + bool system_need_update = false; + + std::string system_prompt; + std::vector system_tokens; + + std::string name_user; // this should be the antiprompt + std::string name_assistant; + + // slots / clients + std::vector slots; + + std::vector queue_tasks; + std::vector queue_results; + std::mutex mutex_tasks; + std::mutex mutex_results; - if (grammar != nullptr) { - llama_grammar_free(grammar); - grammar = nullptr; - ctx_sampling = llama_sampling_context_init(params, NULL); + ~llama_server_context() { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + if (model) { + llama_free_model(model); + model = nullptr; } } - bool loadModel(const gpt_params ¶ms_) { + bool load_model(const gpt_params ¶ms_) { params = params_; + if (!params.mmproj.empty()) { + multimodal = true; + LOG_TEE("Multi Modal Mode Enabled"); + clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); + if (clp_ctx == nullptr) { + LOG_ERROR_LLAMA("unable to load clip model", + {{"model", params.mmproj}}); + return false; + } + + if (params.n_ctx < + 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + std::tie(model, 
ctx) = llama_init_from_gpt_params(params); if (model == nullptr) { - LOG_ERROR_LLAMA("unable to load model", {{"model", params_.model}}); + LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}}); return false; } + + if (multimodal) { + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not " + "equal to that of LLaMA (%d). Make sure that you use the " + "correct mmproj file.\n", + __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + n_ctx = llama_n_ctx(ctx); - last_n_tokens.resize(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; } + void initialize() { + id_gen = 0; + + // create slots + all_slots_are_idle = true; + + const int32_t n_ctx_slot = n_ctx / params.n_parallel; + + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) { + llama_client_slot slot; + + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.reset(); + + LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + slots.push_back(slot); + } + + batch = llama_batch_init(n_ctx, 0, params.n_parallel); + + // empty system prompt + system_prompt = ""; + system_tokens.clear(); + } + std::vector tokenize(const json &json_prompt, bool add_bos) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, @@ -291,401 +614,1063 @@ struct llama_server_context { prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } - return prompt_tokens; - } + return prompt_tokens; + } + + llama_client_slot *get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + + for (llama_client_slot &slot : slots) { + if (slot.id == id && slot.available()) { + return &slot; + } + + if (slot.available() && slot.t_last_used < t_last) { + last_used = &slot; + t_last = slot.t_last_used; + } + } + + return last_used; + } + + bool launch_slot_with_data(llama_client_slot *&slot, json data) { + slot_params default_params; + llama_sampling_params default_sparams; + + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = + json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = + json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.penalty_last_n = + json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = + json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = + json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = + json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = + json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = + json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = + json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = + json_value(data, "penalize_nl", default_sparams.penalize_nl); + 
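    // json_value() (defined earlier in this header) returns the caller's default
    // when the key is absent or explicitly null, so any sampling field omitted
    // from the request body keeps its llama.cpp default here.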
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = + json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = + json_value(data, "n_probs", default_sparams.n_probs); + + // infill + if (data.count("input_prefix") != 0) { + slot->params.input_prefix = data["input_prefix"]; + } else { + slot->params.input_prefix = ""; + } + + if (data.count("input_suffix") != 0) { + slot->params.input_suffix = data["input_suffix"]; + } else { + slot->params.input_suffix = ""; + } + + if (data.count("prompt") != 0) { + slot->prompt = data["prompt"]; + } else { + slot->prompt = ""; + } + + slot->sparams.logit_bias.clear(); + + if (json_value(data, "ignore_eos", false)) { + slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } + + const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(model); + for (const auto &el : *logit_bias) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number()) { + slot->sparams.logit_bias[tok] = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + slot->sparams.logit_bias[tok] = -INFINITY; + } + } + } + } + } + + slot->params.antiprompt.clear(); + + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto &word : *stop) { + if (!word.empty()) { + slot->params.antiprompt.push_back(word); + } + } + } + + if (multimodal) { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) { + for (const auto &img : *images_data) { + std::string data_b64 = img["data"].get(); + slot_image img_sl; + img_sl.id = + img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + int width, height, channels; + std::vector image_buffer = base64_decode(data_b64); + data_b64.clear(); + auto data = + stbi_load_from_memory(image_buffer.data(), image_buffer.size(), + &width, &height, &channels, 3); + if (!data) { + LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, + img_sl.id); + return false; + } + LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", + slot->id, img_sl.id, width, height); + img_sl.img_data.nx = width; + img_sl.img_data.ny = height; + img_sl.img_data.size = width * height * 3; + img_sl.img_data.data = new uint8_t[width * height * 3](); + memcpy(img_sl.img_data.data, data, width * height * 3); + stbi_image_free(data); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); + } + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> + // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, + // {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find("]", pos); + if (end_pos != std::string::npos) { + std::string image_id = prompt.substr(pos, end_pos - pos); + try { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) { + if (img.id == img_id) { + found = true; + img.prefix_prompt = + prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument &e) { + LOG_TEE("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = + false; // multimodal doesn't support cache prompt + } + } + } + + if (slot->ctx_sampling != nullptr) { + llama_sampling_free(slot->ctx_sampling); + } + slot->ctx_sampling = llama_sampling_init(slot->sparams); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + + return true; + } + + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; + } + + void update_system_prompt() { + system_tokens = ::llama_tokenize(ctx, system_prompt, true); + + llama_batch_clear(batch); + + kv_cache_clear(); + + for (int i = 0; i < (int)system_tokens.size(); ++i) { + llama_batch_add(batch, system_tokens[i], i, {0}, false); + } + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } + + LOG_TEE("system prompt updated\n"); + system_need_update = false; + } + + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) { + slot.release(); + } + + system_need_update = true; + } + + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + 
name_assistant = sys_props.value("assistant_name", ""); + + if (slots.size() > 0) { + notify_system_prompt_changed(); + } + } + + static size_t find_stopping_strings(const std::string &text, + const size_t last_token_size, + const stop_type type, + llama_client_slot &slot) { + size_t stop_pos = std::string::npos; + + for (const std::string &word : slot.params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; + } + stop_pos = pos; + } + } + + return stop_pos; + } + + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during + // sampling + const std::string token_str = llama_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; + + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; + + if (slot.multibyte_pending > 0) { + slot.multibyte_pending -= token_str.size(); + } else if (token_str.size() == 1) { + const char c = token_str[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + slot.multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + slot.multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + slot.multibyte_pending = 3; + } else { + slot.multibyte_pending = 0; + } + } + + if (slot.multibyte_pending == 0) { + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = + find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) { + is_stop_full = true; + slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } else { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), + STOP_PARTIAL, slot); + } - bool loadGrammar() { - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - LOG_ERROR_LLAMA("grammar parse error", {{"grammar", params.grammar}}); - return false; + // check if there is any token to predict + if (stop_pos == std::string::npos || + (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { + // no send the stop word in the response + result.text_to_send = + slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache } - grammar_parser::print_grammar(stderr, parsed_grammar); - - { - auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.sampling_params.logit_bias.end() && - it->second == -INFINITY) { - LOG_WARNING_LLAMA( - "EOS token is disabled, which will cause most grammars to fail", - {}); - } + slot.add_token_string(result); + if (slot.params.stream) { + send_partial_response(slot, result); } - - 
std::vector grammar_rules( - parsed_grammar.c_rules()); - grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); } - ctx_sampling = llama_sampling_context_init(params, grammar); - return true; - } - void loadInfill() { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(" ") == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; + if (slot.multibyte_pending > 0 && !slot.has_next_token) { + slot.has_next_token = true; } - auto prefix_tokens = tokenize(params.input_prefix, false); - auto suffix_tokens = tokenize(params.input_suffix, false); - const int space_token = 29871; - if (suff_rm_leading_spc && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(ctx)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(ctx)); - auto prompt_tokens = prefix_tokens; - - num_prompt_tokens = prompt_tokens.size(); - - if (params.n_keep < 0) { - params.n_keep = (int)num_prompt_tokens; + // check the limits + if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { + slot.stopped_limit = true; + slot.has_next_token = false; } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) { - printf("Input prompt is too big, truncating. Can only take %d tokens but " - "got %zu\n", - params.n_ctx, num_prompt_tokens); - // todo we probably want to cut from both sides - const int n_left = (params.n_ctx - params.n_keep) / 2; - std::vector new_tokens( - prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = - (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + params.n_keep + - erased_blocks * n_left, - prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), - last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", - { - {"n_ctx", params.n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - truncated = true; - prompt_tokens = new_tokens; - } else { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), - last_n_tokens.end() - ps); + if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); } - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - embd = prompt_tokens; + LOG_VERBOSE( + "next token", + { + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); + + return slot.has_next_token; // continue + } - if (n_past 
== num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - printf("we have to evaluate at least 1 token to generate logits\n"); - n_past--; + bool process_images(llama_client_slot &slot) const { + for (slot_image &img : slot.images) { + if (!img.request_encode_image) { + continue; + } + clip_image_f32 img_res; + if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, + /*pad2square =*/true)) { + LOG_TEE("Error processing the given image"); + clip_free(clp_ctx); + return false; + } + img.image_tokens = clip_n_patches(clp_ctx); + img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); + if (!img.image_embedding) { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + clip_free(clp_ctx); + return false; + } + LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); + if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, + img.image_embedding)) { + LOG_TEE("Unable to encode image\n"); + return false; + } + img.request_encode_image = false; } - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + return slot.images.size() > 0; + } - LOG_VERBOSE("prompt ingested", - { - {"n_past", n_past}, - {"cached", - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); + void send_error(int id, std::string error) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = id; + res.error = true; + res.result_json = {{"content", error}}; + queue_results.push_back(res); + } - has_next_token = true; + json get_model_props() { return get_formated_generation(slots[0]); } + + json get_formated_generation(llama_client_slot &slot) { + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && + eos_bias->second < 0.0f && + std::isinf(eos_bias->second); + return json{ + {"n_ctx", slot.n_ctx}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temp", slot.sparams.temp}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"grammar", slot.sparams.grammar}, + }; } - void loadPrompt() { - auto prompt_tokens = tokenize(prompt, true); // always add BOS - num_prompt_tokens = prompt_tokens.size(); + void send_partial_response(llama_client_slot &slot, + completion_token_output tkn) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = false; + + res.result_json = json{{"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal}}; + + if (slot.sparams.n_probs > 0) { + std::vector probs_output = {}; + const std::vector to_send_toks = + llama_tokenize(ctx, tkn.text_to_send, false); + size_t 
probs_pos = std::min(slot.sent_token_probs_index, + slot.generated_token_probs.size()); + size_t probs_stop_pos = + std::min(slot.sent_token_probs_index + to_send_toks.size(), + slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) { + probs_output = std::vector( + slot.generated_token_probs.begin() + probs_pos, + slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + res.result_json["completion_probabilities"] = + probs_vector_to_json(ctx, probs_output); + } + + queue_results.push_back(res); + } - if (params.n_keep < 0) { - params.n_keep = (int)num_prompt_tokens; + void send_final_response(llama_client_slot &slot) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + res.result_json = + json{{"content", !slot.params.stream ? slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()}}; + + if (slot.sparams.n_probs > 0) { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) { + const std::vector stop_word_toks = + llama_tokenize(ctx, slot.stopping_word, false); + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end() - stop_word_toks.size()); + } else { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.begin() + slot.sent_token_probs_index); + } + res.result_json["completion_probabilities"] = + probs_vector_to_json(ctx, probs); } - params.n_keep = std::min(n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)n_ctx) { - const int n_left = (n_ctx - params.n_keep) / 2; - std::vector new_tokens( - prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = - (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + params.n_keep + - erased_blocks * n_left, - prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), - last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", - { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - truncated = true; - prompt_tokens = new_tokens; + queue_results.push_back(res); + } + + void send_embedding(llama_client_slot &slot) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + const int n_embd = llama_n_embd(model); + if (!params.embedding) { + LOG_WARNING_LLAMA("embedding disabled", + { + {"params.embedding", params.embedding}, + }); + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; } else { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), - last_n_tokens.end() - ps); + const float *data = llama_get_embeddings(ctx); + std::vector embedding(data, 
data + n_embd); + res.result_json = json{ + {"embedding", embedding}, + }; } + queue_results.push_back(res); + } + + int request_completion(json data, bool infill) { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.data = data; + task.infill_mode = infill; + task.type = COMPLETION_TASK; + queue_tasks.push_back(task); + return task.id; + } - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); + task_result next_result(int task_id) { + while (true) { + std::this_thread::sleep_for(std::chrono::microseconds(5)); + std::lock_guard lock(mutex_results); - embd = prompt_tokens; - if (n_past == num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - n_past--; + if (queue_results.empty()) { + continue; + } + + for (int i = 0; i < (int)queue_results.size(); i++) { + if (queue_results[i].id == task_id) { + task_result res = queue_results[i]; + queue_results.erase(queue_results.begin() + i); + return res; + } + } } - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + // never reached + // return task_result{-1, false, false, {}}; + } + + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) { + int image_idx = 0; + + while (image_idx < (int)slot.images.size()) { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { + const int32_t n_tokens = + std::min(n_batch, (int32_t)(batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, + 0, + 0, // unused + }; + if (llama_decode(ctx, batch_view)) { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + } - LOG_VERBOSE("prompt ingested", - { - {"n_past", n_past}, - {"cached", - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + + const int n_embd = llama_n_embd(model); + llama_batch batch_img = { + n_eval, nullptr, (img.image_embedding + i * n_embd), + nullptr, nullptr, nullptr, + nullptr, slot.n_past, 1, + 0, + }; + if (llama_decode(ctx, batch_img)) { + LOG_TEE("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; + + llama_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = + (image_idx >= (int)slot.images.size()) + ? 
slot.params.input_suffix + : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = + tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int)append_tokens.size(); ++i) { + llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); + slot.n_past += 1; + } + } - has_next_token = true; + return true; } - void beginCompletion() { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_set_rng_seed(ctx, params.seed); + void request_cancel(int task_id) { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.type = CANCEL_TASK; + task.target_id = task_id; + queue_tasks.push_back(task); } - completion_token_output nextToken() { - completion_token_output result; - result.tok = -1; + void process_tasks() { + std::lock_guard lock(mutex_tasks); + while (!queue_tasks.empty()) { + // LOG_INFO << "test tasks"; + task_server task = queue_tasks.front(); + queue_tasks.erase(queue_tasks.begin()); + switch (task.type) { + case COMPLETION_TASK: { + llama_client_slot *slot = + get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) { + LOG_TEE("slot unavailable\n"); + // send error result + send_error(task.id, "slot unavaliable"); + return; + } + + if (task.data.contains("system_prompt")) { + process_system_prompt_data(task.data["system_prompt"]); + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->task_id = task.id; - if (embd.size() >= (size_t)n_ctx) { - // Shift context + if (!launch_slot_with_data(slot, task.data)) { + // send error result + send_error(task.id, "internal_error"); + break; + } + } break; + case CANCEL_TASK: { // release slot linked with the task id + for (auto &slot : slots) { + if (slot.task_id == task.target_id) { + slot.release(); + slot.print_timings(); + break; + } + } + } break; + } + } + } - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left / 2; + bool update_slots() { + // attend tasks + process_tasks(); + + // update the system prompt wait until all slots are idle state + if (system_need_update && all_slots_are_idle) { + LOG_TEE("updating system prompt\n"); + update_system_prompt(); + } - llama_kv_cache_seq_rm(ctx, 0, params.n_keep + 1, - params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, - -n_discard); + llama_batch_clear(batch); - for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) { - embd[i - n_discard] = embd[i]; + if (all_slots_are_idle) { + if (system_prompt.empty() && clean_kv_cache) { + LOG_TEE("all slots are idle and system prompt is empty, clear the KV " + "cache\n"); + kv_cache_clear(); } - embd.resize(embd.size() - n_discard); + // avoid 100% usage of cpu all time + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + + for (llama_client_slot &slot : slots) { + if (slot.is_processing() && + slot.cache_tokens.size() >= (size_t)slot.n_ctx) { + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard " + "= %d\n", + slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, + slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, slot.id, + slot.params.n_keep + 1 + n_discard, + slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; + i < 
slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - n_past -= n_discard; + slot.n_past -= n_discard; - truncated = true; - LOG_VERBOSE("input truncated", { + slot.truncated = true; + + LOG_VERBOSE("context shift", { {"n_ctx", n_ctx}, {"n_keep", params.n_keep}, {"n_left", n_left}, }); + } } - bool tg = true; - while (n_past < embd.size()) { - int n_eval = (int)embd.size() - n_past; - tg = n_eval == 1; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; + // decode any currently ongoing sequences + for (auto &slot : slots) { + // release the slot + if (slot.command == RELEASE) { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, + (int)slot.cache_tokens.size()); + + continue; } - if (llama_decode(ctx, - llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) { - LOG_ERROR_LLAMA("failed to eval", - { - {"n_eval", n_eval}, - {"n_past", n_past}, - {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, - embd.cend())}, - }); - has_next_token = false; - return result; + if (slot.state == IDLE) { + continue; } - n_past += n_eval; - } - if (params.n_predict == 0) { - has_next_token = false; - result.tok = llama_token_eos(ctx); - return result; + slot.i_batch = batch.n_tokens; + + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, + {slot.id}, true); + + slot.n_decoded += 1; + slot.n_past += 1; } - { - // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_n_vocab(model)); + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) { + for (auto &slot : slots) { + const bool has_prompt = slot.prompt.is_array() || + (slot.prompt.is_string() && + !slot.prompt.get().empty()) || + !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && + params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && + suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), + llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), + llama_token_bos(model)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), + llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), + suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } else { + prompt_tokens = tokenize( + slot.prompt, + system_prompt.empty()); // add BOS if 
there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); - result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, - candidates); + if (!slot.params.cache_prompt) { + llama_sampling_reset(slot.ctx_sampling); - llama_token_data_array candidates_p = {candidates.data(), - candidates.size(), false}; + slot.n_past = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } else { + if (slot.params.n_keep < 0) { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = + (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / + n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), + prompt_tokens.begin() + + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), + prompt_tokens.begin() + slot.params.n_keep + + erased_blocks * n_block_size, + prompt_tokens.end()); + + LOG_VERBOSE( + "input truncated", + { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), + new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; - const int32_t n_probs = params.sampling_params.n_probs; - if (params.sampling_params.temp <= 0 && n_probs > 0) { - // For llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &candidates_p); - } + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } - for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); - ++i) { - result.probs.push_back( - {candidates_p.data[i].id, candidates_p.data[i].p}); - } + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) { + llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(result.tok); - if (tg) { - num_tokens_predicted++; - } - } + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.num_prompt_tokens_processed = + slot.num_prompt_tokens - slot.n_past; - // add it to the context - embd.push_back(result.tok); - // decrement remaining sampling budget - --n_remain; + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", + slot.id, slot.n_past, slot.num_prompt_tokens_processed); + } - if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_piece(ctx, embd.back()); - has_next_token = false; - stopped_eos = true; - LOG_VERBOSE("eos token found", {}); - return result; - } + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, + (int)system_tokens.size() + slot.n_past); - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } + llama_kv_cache_seq_rm(ctx, slot.id, + system_tokens.size() + slot.n_past, -1); - size_t findStoppingStrings(const std::string &text, - const size_t last_token_size, - const stop_type type) { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - stopping_word = word; - stopped_word = true; - has_next_token = false; + slot.cache_tokens = prompt_tokens; + + if (slot.n_past == slot.num_prompt_tokens) { + // we have to evaluate at least 1 token to generate logits. + LOG_TEE("slot %d : we have to evaluate at least 1 token to " + "generate logits\n", + slot.id); + slot.n_past--; + } + + LOG_VERBOSE( + "prompt ingested", + { + {"n_past", slot.n_past}, + {"cached", + tokens_to_str(ctx, slot.cache_tokens.cbegin(), + slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", + tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, + slot.cache_tokens.cend())}, + }); + + const bool has_images = process_images(slot); + + // process the prefix of first image + std::vector prefix_tokens = + has_images ? tokenize(slot.images[0].prefix_prompt, true) + : prompt_tokens; + for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { + llama_batch_add(batch, prefix_tokens[slot.n_past], + system_tokens.size() + slot.n_past, {slot.id}, + false); + } + + if (has_images && !ingest_images(slot, n_batch)) { + LOG_TEE("failed processing images\n"); + return false; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; } - stop_pos = pos; } } - return stop_pos; - } - completion_token_output doCompletion() { - auto token_with_probs = nextToken(); + if (batch.n_tokens == 0) { + all_slots_are_idle = true; + return true; + } - const std::string token_text = - token_with_probs.tok == -1 - ? 
"" - : llama_token_to_piece(ctx, token_with_probs.tok); - generated_text += token_text; + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, + 0, + 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + // if you get here, it means the KV cache is full - try increasing it + // via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", + __func__, n_batch, ret); + return false; + } - if (params.sampling_params.n_probs > 0) { - generated_token_probs.push_back(token_with_probs); - } + LOG_TEE("%s : failed to find free space in the KV cache, retrying with " + "smaller n_batch = %d\n", + __func__, n_batch / 2); - if (multibyte_pending > 0) { - multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { - const char c = token_text[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; - } else { - multibyte_pending = 0; + // retry with half the batch size to try to find a free slot in the KV + // cache + n_batch /= 2; + i -= n_batch; + continue; } - } - if (multibyte_pending > 0 && !has_next_token) { - has_next_token = true; - n_remain++; - } + for (auto &slot : slots) { + if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { + continue; + } - if (!has_next_token && n_remain == 0) { - stopped_limit = true; - } + // prompt evaluated for embedding + if (params.embedding) { + send_embedding(slot); + slot.release(); + slot.i_batch = -1; + return true; + } - LOG_VERBOSE("next token", - { - {"token", token_with_probs.tok}, - {"token_text", tokens_to_output_formatted_string( - ctx, token_with_probs.tok)}, - {"has_next_token", has_next_token}, - {"n_remain", n_remain}, - {"num_tokens_predicted", num_tokens_predicted}, - {"stopped_eos", stopped_eos}, - {"stopped_word", stopped_word}, - {"stopped_limit", stopped_limit}, - {"stopping_word", stopping_word}, - }); - - return token_with_probs; - } + completion_token_output result; + const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, + NULL, slot.i_batch - i); - std::vector getEmbedding() { - static const int n_embd = llama_n_embd(model); - if (!params.embedding) { - LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); - return std::vector(n_embd, 0.0f); + llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + + if (slot.n_decoded == 1) { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = + (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + } + + llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), + slot.ctx_sampling->cur.size(), false}; + result.tok = id; + + const int32_t n_probs = slot.sparams.n_probs; + if (slot.sparams.temp <= 0 && n_probs > 0) { + // for llama_sample_token_greedy we need to sort candidates + llama_sample_softmax(ctx, &cur_p); + } + + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + if 
(!process_token(result, slot)) { + slot.release(); + slot.print_timings(); + send_final_response(slot); + } + + slot.i_batch = -1; + } } - const float *data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); - return embedding; + return true; } }; @@ -763,11 +1748,22 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf( " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); + printf(" -np N, --parallel N number of slots for process requests " + "(default: %d)\n", + params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic " + "batching) (default: disabled)\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" Set a file to load a system prompt (initial " + "prompt of all slots), this is useful for chat applications.\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for " + "LLaVA.\n"); printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms) { + gpt_params ¶ms, + llama_server_context &llama) { gpt_params default_params; server_params default_sparams; std::string arg; @@ -892,19 +1888,17 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } #else - LOG_WARNING_LLAMA( - "llama.cpp was compiled without cuBLAS. It is not possible " - "to set a tensor split.\n", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " + "possible to set a tensor split.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS params.mul_mat_q = false; #else - LOG_WARNING_LLAMA( - "warning: llama.cpp was compiled without cuBLAS. Disabling " - "mul_mat_q kernels has no effect.\n", - {}); + LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. " + "Disabling mul_mat_q kernels has no effect.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { @@ -914,10 +1908,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, #ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); #else - LOG_WARNING_LLAMA( - "llama.cpp was compiled without cuBLAS. It is not possible " - "to set a main GPU.", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. 
It is not " + "possible to set a main GPU.", + {}); #endif } else if (arg == "--lora") { if (++i >= argc) { @@ -959,6 +1952,42 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, params.numa = true; } else if (arg == "--embedding") { params.embedding = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string systm_content; + std::copy(std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(systm_content)); + llama.process_system_prompt_data(json::parse(systm_content)); + } else if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mmproj = argv[i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -973,100 +2002,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } -static json format_generation_settings(llama_server_context &llama) { - const auto &sparams = llama.params.sampling_params; - const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); - const bool ignore_eos = eos_bias != sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - - return json{ - {"n_ctx", llama.n_ctx}, - {"model", llama.params.model_alias}, - {"seed", llama.params.seed}, - {"temp", sparams.temp}, - {"top_k", sparams.top_k}, - {"top_p", sparams.top_p}, - {"tfs_z", sparams.tfs_z}, - {"typical_p", sparams.typical_p}, - {"repeat_last_n", sparams.repeat_last_n}, - {"repeat_penalty", sparams.repeat_penalty}, - {"presence_penalty", sparams.presence_penalty}, - {"frequency_penalty", sparams.frequency_penalty}, - {"mirostat", sparams.mirostat}, - {"mirostat_tau", sparams.mirostat_tau}, - {"mirostat_eta", sparams.mirostat_eta}, - {"penalize_nl", sparams.penalize_nl}, - {"stop", llama.params.antiprompt}, - {"n_predict", llama.params.n_predict}, - {"n_keep", llama.params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", llama.stream}, - {"logit_bias", sparams.logit_bias}, - {"n_probs", sparams.n_probs}, - {"grammar", llama.params.grammar}, - }; -} - -static json format_embedding_response(llama_server_context &llama) { - return json{ - {"embedding", llama.getEmbedding()}, - }; -} - -static json format_timings(llama_server_context &llama) { - const auto timings = llama_get_timings(llama.ctx); - - return json{ - {"prompt_n", timings.n_p_eval}, - {"prompt_ms", timings.t_p_eval_ms}, - {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, - {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, - - {"predicted_n", timings.n_eval}, - {"predicted_ms", timings.t_eval_ms}, - {"predicted_per_token_ms", timings.t_eval_ms / timings.n_eval}, - {"predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval}, - }; -} - -static json -format_final_response(llama_server_context &llama, const std::string &content, - const std::vector 
&probs) { - - json res = json{ - {"content", content}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"tokens_evaluated", llama.num_prompt_tokens}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.prompt}, - {"truncated", llama.truncated}, - {"stopped_eos", llama.stopped_eos}, - {"stopped_word", llama.stopped_word}, - {"stopped_limit", llama.stopped_limit}, - {"stopping_word", llama.stopping_word}, - {"tokens_cached", llama.n_past}, - {"timings", format_timings(llama)}, - }; - - if (llama.params.sampling_params.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - static json -format_partial_response(llama_server_context &llama, const std::string &content, +format_partial_response(llama_server_context &llama, llama_client_slot *slot, + const std::string &content, const std::vector &probs) { - json res = json{ - {"content", content}, - {"stop", false}, - }; + json res = json{{"content", content}, + {"stop", false}, + {"slot_id", slot->id}, + {"multimodal", llama.multimodal}}; - if (llama.params.sampling_params.n_probs > 0) { + if (slot->sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1081,155 +2026,6 @@ static json format_detokenized_response(std::string content) { return json{{"content", content}}; } -template -static T json_value(const json &body, const std::string &key, - const T &default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; -} - -static void parse_options_completion(const json &body, - llama_server_context &llama) { - gpt_params default_params; - const auto &default_sparams = default_params.sampling_params; - auto &sparams = llama.params.sampling_params; - - llama.stream = json_value(body, "stream", false); - llama.params.n_predict = - json_value(body, "n_predict", default_params.n_predict); - sparams.top_k = json_value(body, "top_k", default_sparams.top_k); - sparams.top_p = json_value(body, "top_p", default_sparams.top_p); - sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); - sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); - sparams.repeat_last_n = - json_value(body, "repeat_last_n", default_sparams.repeat_last_n); - sparams.temp = json_value(body, "temperature", default_sparams.temp); - sparams.repeat_penalty = - json_value(body, "repeat_penalty", default_sparams.repeat_penalty); - sparams.presence_penalty = - json_value(body, "presence_penalty", default_sparams.presence_penalty); - sparams.frequency_penalty = - json_value(body, "frequency_penalty", default_sparams.frequency_penalty); - sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); - sparams.mirostat_tau = - json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - sparams.mirostat_eta = - json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - sparams.penalize_nl = - json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); - llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.grammar = json_value(body, "grammar", default_params.grammar); - sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); - - if (body.count("prompt") != 0) { - llama.prompt = body["prompt"]; - } else { - llama.prompt = 
""; - } - - sparams.logit_bias.clear(); - if (json_value(body, "ignore_eos", false)) { - sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; - } - - const auto &logit_bias = body.find("logit_bias"); - if (logit_bias != body.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(llama.model); - for (const auto &el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - llama.params.antiprompt.clear(); - const auto &stop = body.find("stop"); - if (stop != body.end() && stop->is_array()) { - for (const auto &word : *stop) { - if (!word.empty()) { - llama.params.antiprompt.push_back(word); - } - } - } - - llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar); - - LOG_VERBOSE("completion parameters parsed", - format_generation_settings(llama)); -} - -static void parse_options_infill(const json &body, - llama_server_context &llama) { - if (body.count("input_prefix") != 0) { - llama.params.input_prefix = body["input_prefix"]; - } else { - llama.params.input_prefix = ""; - } - if (body.count("input_suffix") != 0) { - llama.params.input_suffix = body["input_suffix"]; - } else { - llama.params.input_suffix = ""; - } - parse_options_completion(body, llama); -} - -static bool is_at_eob(llama_server_context &server_context, - const llama_token *tokens, const size_t n_tokens) { - return n_tokens && - tokens[n_tokens - 1] == llama_token_eos(server_context.ctx); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam -// tokens if any. -// * When all beams converge to a common prefix, they are made available in -// beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to -// by callback_data. -static void beam_search_callback(void *callback_data, - llama_beams_state beams_state) { - auto &llama = *static_cast(callback_data); - // Mark beams as EOS as needed. 
- for (size_t i = 0; i < beams_state.n_beams; ++i) { - llama_beam_view &beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && - is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - llama.generated_token_probs.resize(llama.generated_token_probs.size() + n); - assert(0u < beams_state.n_beams); - const llama_token *tokens = beams_state.beam_views[0].tokens; - const auto map = [](llama_token tok) { - return completion_token_output{{}, tok}; - }; - std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, - map); - printf("%zu", n); - } - fflush(stdout); -#if 0 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams:\n"; - for (size_t i=0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["<generated_token_probs; auto translator = token_translator{llama.ctx}; auto add_strlen = [=](size_t sum, const completion_token_output &cto) { return sum + translator(cto).size(); }; const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (llama.generated_text.capacity() < llama.generated_text.size() + len) { - llama.generated_text.reserve(llama.generated_text.size() + len); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) { + slot->generated_text.reserve(slot->generated_text.size() + len); } for (const completion_token_output &cto : gtps) { - llama.generated_text += translator(cto); + slot->generated_text += translator(cto); } } @@ -1264,9 +2061,11 @@ class llamaCPP : public drogon::HttpController { public: llamaCPP() { // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for target - // system () + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () } + METHOD_LIST_BEGIN // list path definitions here; METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post); @@ -1282,10 +2081,13 @@ class llamaCPP : public drogon::HttpController { std::function &&callback); void warmupModel(); + void backgroundTask(); + private: llama_server_context llama; bool model_loaded = false; size_t sent_count = 0; size_t sent_token_probs_index = 0; + std::thread backgroundThread; }; }; // namespace inferences diff --git a/ext_libs/libcrypto-3-x64.dll b/ext_libs/libcrypto-3-x64.dll deleted file mode 100644 index b9223ec18..000000000 Binary files a/ext_libs/libcrypto-3-x64.dll and /dev/null differ diff --git a/ext_libs/libssl-3-x64.dll b/ext_libs/libssl-3-x64.dll deleted file mode 100644 index d48518e4f..000000000 Binary files a/ext_libs/libssl-3-x64.dll and /dev/null differ diff --git a/llama.cpp b/llama.cpp index 281ef73c2..207b51900 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 281ef73c258cc1eebec8a64264240432d5878c4b +Subproject commit 207b51900e15cc7f89763a3bb1c565fe11cbb45d diff --git a/main.cc b/main.cc index 64f556cb1..c8ab01856 100644 --- a/main.cc +++ b/main.cc @@ -1,5 +1,6 @@ #include "utils/nitro_utils.h" #include // for PATH_MAX +#include #include #if defined(__APPLE__) && defined(__MACH__) @@ -15,24 +16,32 @@ #endif int main(int argc, char *argv[]) { - + int thread_num = std::thread::hardware_concurrency(); std::string host = "127.0.0.1"; int port = 3928; - // Check for host argument + // Number of nitro threads if (argc > 1) { - host = argv[1]; + thread_num = std::atoi(argv[1]); } - // Check for port argument + // Check for host argument if (argc > 2) { - 
port = std::atoi(argv[2]); // Convert string argument to int + host = argv[2]; + } + + // Check for port argument + if (argc > 3) { + port = std::atoi(argv[3]); // Convert string argument to int } nitro_utils::nitro_logo(); LOG_INFO << "Server started, listening at: " << host << ":" << port; LOG_INFO << "Please load your model"; drogon::app().addListener(host, port); + drogon::app().setThreadNum(thread_num); + LOG_INFO << "Number of threads is: " << drogon::app().getThreadNum(); + drogon::app().run(); return 0;
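// NOTE (editorial, not part of the patch): the hunk above changes the
// positional-argument order of main.cc -- argv[1] is now the Drogon thread
// count, argv[2] the listen host, argv[3] the listen port. The minimal
// standalone sketch below only restates that parsing logic in isolation;
// the defaults (hardware_concurrency(), 127.0.0.1, 3928) mirror the diff,
// while the printout and the example invocation are illustrative only.
#include <cstdlib>
#include <iostream>
#include <string>
#include <thread>

int main(int argc, char *argv[]) {
  int thread_num = std::thread::hardware_concurrency();
  std::string host = "127.0.0.1";
  int port = 3928;

  if (argc > 1) thread_num = std::atoi(argv[1]); // number of server threads
  if (argc > 2) host = argv[2];                  // listen address
  if (argc > 3) port = std::atoi(argv[3]);       // listen port

  std::cout << "threads=" << thread_num << " host=" << host
            << " port=" << port << "\n";
  return 0;
}
// Example (assuming the built binary is named nitro):
//   ./nitro 4 0.0.0.0 3928   ->   threads=4 host=0.0.0.0 port=3928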