diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 006bae70a..2e44d6dc5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -276,8 +276,6 @@ jobs: run: | robocopy build_deps\_install\bin .\build\Release zlib.dll robocopy build\bin\Release .\build\Release llama.dll - robocopy ext_libs .\build\Release libcrypto-3-x64.dll - robocopy ext_libs .\build\Release libssl-3-x64.dll 7z a nitro.zip .\build\Release\* - uses: actions/upload-release-asset@v1.0.1 @@ -325,7 +323,7 @@ jobs: cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON cmake --build . --config Release -j 4 - name: Pack artifacts @@ -336,8 +334,6 @@ jobs: echo %PATH% robocopy build_deps\_install\bin .\build\Release zlib.dll robocopy build\bin\Release .\build\Release llama.dll - robocopy ext_libs .\build\Release libcrypto-3-x64.dll - robocopy ext_libs .\build\Release libssl-3-x64.dll 7z a nitro.zip .\build\Release\* - uses: actions/upload-release-asset@v1.0.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 9730e1eac..89e3a88fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,9 @@ else() set(CMAKE_CXX_STANDARD 14) endif() +# llama cpp server need llava example to work, this is for llama cpp server +set(LLAMA_BUILD_EXAMPLES ON) + set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(OPENSSL_USE_STATIC_LIBS TRUE) @@ -48,7 +51,7 @@ add_executable(${PROJECT_NAME} main.cc) # # and comment out the following lines find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama clip ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 24e10d9be..f7b738724 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -10,6 +10,22 @@ #include using namespace inferences; +using json = nlohmann::json; + +// To store state of each inference request +struct State { + bool isStopped = false; + int task_id; + llamaCPP *instance; + + State(int tid, llamaCPP *inst) : task_id(tid), instance(inst) {} +}; + +std::shared_ptr createState(int task_id, llamaCPP *instance) { + return std::make_shared(task_id, instance); +} + +// -------------------------------------------- std::string create_return_json(const std::string &id, const std::string &model, const std::string &content, @@ -41,71 +57,40 @@ std::string create_return_json(const std::string &id, const std::string &model, } void llamaCPP::warmupModel() { - auto lock = llama.lock(); - llama.rewind(); - llama_reset_timings(llama.ctx); - - llama.prompt = "hello"; - llama.params.n_predict = 1; - llama.loadPrompt(); - llama.beginCompletion(); - size_t stop_pos = std::string::npos; - - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = - token_with_probs.tok == -1 - ? 
"" - : llama_token_to_piece(llama.ctx, token_with_probs.tok); - - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } - - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); - } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } - auto probs = llama.generated_token_probs; - if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(llama.ctx, llama.stopping_word, false); - probs = std::vector( - llama.generated_token_probs.begin(), - llama.generated_token_probs.end() - stop_word_toks.size()); - } - - LOG_INFO << "Warm-up generated text:" << llama.generated_text; - LOG_INFO << "Warm-up finish"; - return; +// json pseudo; +// +// pseudo["prompt"] = "Hello"; +// pseudo["n_predict"] = 10; +// const int task_id = llama.request_completion(pseudo, false); +// std::string completion_text; +// task_result result = llama.next_result(task_id); +// if (!result.error && result.stop) { +// LOG_INFO << result.result_json.dump(-1, ' ', false, +// json::error_handler_t::replace); +// } +// return; } void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { - if (!model_loaded) { - Json::Value jsonResp; - jsonResp["message"] = "Model is not loaded yet"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } const auto &jsonBody = req->getJsonObject(); std::string formatted_output = "Below is a conversation between an AI system named ASSISTANT and USER\n"; + + json data; + json stopWords; + // To set default value + data["stream"] = true; + data["n_predict"] = 30; + if (jsonBody) { - llama.params.n_predict = (*jsonBody)["max_tokens"].asInt(); - llama.params.sampling_params.top_p = (*jsonBody)["top_p"].asFloat(); - llama.params.sampling_params.temp = (*jsonBody)["temperature"].asFloat(); - llama.params.sampling_params.frequency_penalty = - (*jsonBody)["frequency_penalty"].asFloat(); - llama.params.sampling_params.presence_penalty = - (*jsonBody)["presence_penalty"].asFloat(); + data["n_predict"] = (*jsonBody)["max_tokens"].asInt(); + data["top_p"] = (*jsonBody)["top_p"].asFloat(); + data["temperature"] = (*jsonBody)["temperature"].asFloat(); + data["frequency_penalty"] = (*jsonBody)["frequency_penalty"].asFloat(); + data["presence_penalty"] = (*jsonBody)["presence_penalty"].asFloat(); const Json::Value &messages = (*jsonBody)["messages"]; for (const auto &message : messages) { @@ -114,186 +99,93 @@ void llamaCPP::chatCompletion( formatted_output += role + ": " + content + "\n"; } formatted_output += "assistant:"; + + data["prompt"] = formatted_output; + for (const auto &stop_word : (*jsonBody)["stop"]) { + stopWords.push_back(stop_word.asString()); + } + // specify default stop words + stopWords.push_back("user:"); + stopWords.push_back("### USER:"); + data["stop"] = stopWords; } - this->llama.rewind(); + const int task_id = llama.request_completion(data, false); + LOG_INFO << "Resolved request for task_id:" << task_id; - llama_reset_timings(llama.ctx); + auto state = createState(task_id, this); - this->llama.prompt = formatted_output; - this->llama.params.antiprompt.clear(); - for (const auto &stop_word : (*jsonBody)["stop"]) { - llama.params.antiprompt.push_back(stop_word.asString()); - } - 
this->llama.params.antiprompt.push_back("user:"); - this->llama.params.antiprompt.push_back("### USER:"); - this->llama.loadPrompt(); - this->llama.beginCompletion(); - - const auto chunked_content_provider = - [this](char *pBuffer, std::size_t nBuffSize) -> std::size_t { - auto lock = this->llama.lock(); + auto chunked_content_provider = + [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t { if (!pBuffer) { LOG_INFO << "Connection closed or buffer is null. Reset context"; - lock.release(); - - llama_print_timings(llama.ctx); - this->llama.mutex.unlock(); - this->sent_count = 0; - this->sent_token_probs_index = 0; - // LOG_INFO << "Test end two time lol"; + state->instance->llama.request_cancel(state->task_id); return 0; } - // LOG_INFO << this->llama.has_next_token; - while (this->llama.has_next_token) { - try { - // LOG_INFO << this->llama.has_next_token; - const completion_token_output token_with_probs = - this->llama.doCompletion(); - if (token_with_probs.tok == -1 || this->llama.multibyte_pending > 0) { - return 0; - } - const std::string token_text = - llama_token_to_piece(llama.ctx, token_with_probs.tok); - - size_t pos = std::min(sent_count, this->llama.generated_text.size()); - - const std::string str_test = this->llama.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = this->llama.findStoppingStrings( - str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - is_stop_full = true; - this->llama.generated_text.erase(llama.generated_text.begin() + pos + - stop_pos, - this->llama.generated_text.end()); - pos = std::min(sent_count, this->llama.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = this->llama.findStoppingStrings( - str_test, token_text.size(), STOP_PARTIAL); - } - - if (stop_pos == std::string::npos || - // Send rest of the text if we are at the end of the generation - (!this->llama.has_next_token && !is_stop_full && stop_pos > 0)) { - const std::string to_send = - this->llama.generated_text.substr(pos, std::string::npos); - - sent_count += to_send.size(); - - std::vector probs_output = {}; - - if (this->llama.params.sampling_params.n_probs > 0) { - const std::vector to_send_toks = - llama_tokenize(llama.ctx, to_send, false); - size_t probs_pos = - std::min(sent_token_probs_index, - this->llama.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(sent_token_probs_index + to_send_toks.size(), - this->llama.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - this->llama.generated_token_probs.begin() + probs_pos, - this->llama.generated_token_probs.begin() + probs_stop_pos); - } - sent_token_probs_index = probs_stop_pos; - } - if (!to_send.empty() && - llama.has_next_token) { // NITRO : the patch here is important to - // make midway cutting possible - // const json data = format_partial_response(this->llama, to_send, - // probs_output); - // LOG_INFO << llama.has_next_token; - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - to_send) + - "\n\n"; - - LOG_VERBOSE("data stream", {{"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - return nRead; - } - } - - // std::this_thread::sleep_for(std::chrono::seconds(2)); - // LOG_INFO << this->llama.has_next_token; - if (!this->llama.has_next_token) { - // Generation is done, send extra information. 
- // const json data = format_final_response( - // this->llama, "", - // std::vector( - // this->llama.generated_token_probs.begin(), - // this->llama.generated_token_probs.begin() + - // sent_token_probs_index)); - // - - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - "", "stop") + - "\n\n" + "data: [DONE]" + "\n\n"; - - LOG_VERBOSE("data stream", {{"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - return nRead; - } - } catch (...) { - LOG_ERROR << "error inside while loop"; - } + if (state->isStopped) { + return 0; } - lock.release(); - llama_print_timings(llama.ctx); - this->llama.mutex.unlock(); - this->sent_count = 0; - this->sent_token_probs_index = 0; - // LOG_INFO << "Test end two time lol"; + task_result result = state->instance->llama.next_result(state->task_id); + if (!result.error) { + const std::string to_send = result.result_json["content"]; + const std::string str = + "data: " + + create_return_json(nitro_utils::generate_random_string(20), "_", + to_send) + + "\n\n"; + + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + + if (result.stop) { + const std::string str = + "data: " + + create_return_json(nitro_utils::generate_random_string(20), "_", "", + "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; + + LOG_VERBOSE("data stream", {{"to_send", str}}); + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + LOG_INFO << "reached result stop"; + state->isStopped = true; + state->instance->llama.request_cancel(state->task_id); + return nRead; + } + return nRead; + } else { + return 0; + } return 0; }; - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, "chat_completions.txt"); callback(resp); + + return; } void llamaCPP::embedding( const HttpRequestPtr &req, std::function &&callback) { - if (!model_loaded) { - Json::Value jsonResp; - jsonResp["message"] = "Model is not loaded yet"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } - - auto lock = llama.lock(); - const auto &jsonBody = req->getJsonObject(); - llama.rewind(); - llama_reset_timings(llama.ctx); + json prompt; if (jsonBody->isMember("content") != 0) { - llama.prompt = (*jsonBody)["content"].asString(); + prompt = (*jsonBody)["content"].asString(); } else { - llama.prompt = ""; + prompt = ""; } - llama.params.n_predict = 0; - llama.loadPrompt(); - llama.beginCompletion(); - llama.doCompletion(); - - const json data = format_embedding_response(llama); - auto resp = drogon::HttpResponse::newHttpResponse(); - resp->setBody(data.dump()); + const int task_id = + llama.request_completion({{"prompt", prompt}, {"n_predict", 0}}, false); + task_result result = llama.next_result(task_id); + std::string embeddingResp = result.result_json.dump(); + auto resp = nitro_utils::nitroHttpResponse(); + resp->setBody(embeddingResp); resp->setContentTypeString("application/json"); callback(resp); + return; } void llamaCPP::loadModel( @@ -303,11 +195,26 @@ void llamaCPP::loadModel( const auto &jsonBody = req->getJsonObject(); gpt_params params; + + params.cont_batching = false; + // By default will setting based on number of handlers + int drogon_thread = drogon::app().getThreadNum(); + LOG_INFO << "Drogon thread is:" << drogon_thread; if (jsonBody) { params.model = (*jsonBody)["llama_model_path"].asString(); 
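// Illustrative request body for this load-model handler, assembled from the
// fields read in this block; the values shown are examples only, not defaults:
//   {
//     "llama_model_path": "/path/to/model.gguf",
//     "ngl": 32,
//     "ctx_len": 2048,
//     "embedding": true,
//     "n_parallel": 4,        // optional, falls back to the drogon thread count
//     "cont_batching": false
//   }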
params.n_gpu_layers = (*jsonBody)["ngl"].asInt(); params.n_ctx = (*jsonBody)["ctx_len"].asInt(); params.embedding = (*jsonBody)["embedding"].asBool(); + // Check if n_parallel exists in jsonBody, if not, set to drogon_thread + if ((*jsonBody).isMember("n_parallel")) { + params.n_parallel = (*jsonBody)["n_parallel"].asInt(); + } else { + params.n_parallel = drogon_thread; + } + + params.cont_batching = (*jsonBody)["cont_batching"].asBool(); + // params.n_threads = (*jsonBody)["n_threads"].asInt(); + // params.n_threads_batch = params.n_threads; } #ifdef GGML_USE_CUBLAS LOG_INFO << "Setting up GGML CUBLAS PARAMS"; @@ -329,7 +236,7 @@ void llamaCPP::loadModel( }); // load the model - if (!llama.loadModel(params)) { + if (!llama.load_model(params)) { LOG_ERROR << "Error loading the model will exit the program"; Json::Value jsonResp; jsonResp["message"] = "Model loaded failed"; @@ -337,10 +244,22 @@ void llamaCPP::loadModel( resp->setStatusCode(drogon::k500InternalServerError); callback(resp); } + llama.initialize(); + Json::Value jsonResp; jsonResp["message"] = "Model loaded successfully"; model_loaded = true; auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - warmupModel(); + //warmupModel(); + + LOG_INFO << "Started background task here!"; + backgroundThread = std::thread(&llamaCPP::backgroundTask, this); callback(resp); } + +void llamaCPP::backgroundTask() { + while (model_loaded) { + model_loaded = llama.update_slots(); + } + return; +} diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 2b188867a..01fcab840 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1,17 +1,18 @@ -#include "log.h" -#include #if defined(_WIN32) #define NOMINMAX #endif #pragma once +#include "log.h" #include "utils/nitro_utils.h" #include #include #include -#include +#include +#include #include +#include // External @@ -20,6 +21,10 @@ #include "grammar-parser.h" #include "llama.h" +#include "../../llama.cpp/examples/llava/clip.h" + +#include "stb_image.h" + #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 @@ -29,7 +34,10 @@ // auto generated files (update with ./deps.sh) +#include #include +#include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -45,6 +53,155 @@ struct server_params { int32_t write_timeout = 600; }; +static bool server_verbose = false; + +#if SERVER_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do { \ + if (server_verbose) { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR_LLAMA(MSG, ...) \ + server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING_LLAMA(MSG, ...) \ + server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO_LLAMA(MSG, ...) 
\ + server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +// +// base64 utils (TODO: move to common in the future) +// + +static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static inline bool is_base64(uint8_t c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +static std::vector base64_decode(std::string const &encoded_string) { + int i = 0; + int j = 0; + int in_ = 0; + + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + std::vector ret; + + while (in_len-- && (encoded_string[in_] != '=') && + is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; + in_++; + if (i == 4) { + for (i = 0; i < 4; i++) { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = + ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = + ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) { + ret.push_back(char_array_3[i]); + } + i = 0; + } + } + + if (i) { + for (j = i; j < 4; j++) { + char_array_4[j] = 0; + } + + for (j = 0; j < 4; j++) { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = + ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = + ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} + +// +// parallel +// + +enum task_type { COMPLETION_TASK, CANCEL_TASK }; + +struct task_server { + int id; + int target_id; + task_type type; + json data; + bool infill_mode = false; +}; + +struct task_result { + int id; + bool stop; + bool error; + json result_json; +}; + +// TODO: can become bool if we can't find use of more states +enum slot_state { + IDLE, + PROCESSING, +}; + +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE, +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = + false; // remember the prompt to avoid reprocessing all prompt + + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict + + std::vector antiprompt; + + json input_prefix; + json input_suffix; +}; + +struct slot_image { + int32_t id; + + bool request_encode_image = false; + float *image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 img_data; + + std::string prefix_prompt; // before of this image +}; + // completion token output with probabilities struct completion_token_output { struct token_prob { @@ -54,6 +211,7 @@ struct completion_token_output { std::vector probs; llama_token tok; + std::string text_to_send; }; static size_t common_part(const std::vector &a, @@ -90,6 +248,7 @@ static size_t find_partial_stop_string(const std::string &stop, return std::string::npos; } +// TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) { std::string ret; @@ -155,111 +314,275 @@ probs_vector_to_json(const llama_context *ctx, return out; } -static bool server_verbose = false; +template +static T json_value(const json &body, const std::string &key, + const T &default_value) { + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? 
body.value(key, default_value) + : default_value; +} -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif +struct llama_client_slot { + int id; + int task_id = -1; -#define LOG_ERROR_LLAMA(MSG, ...) \ - server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING_LLAMA(MSG, ...) \ - server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO_LLAMA(MSG, ...) \ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + struct slot_params params; -struct llama_server_context { - bool stream = false; - bool has_next_token = false; - std::string generated_text; - std::vector generated_token_probs; + slot_state state = IDLE; + slot_command command = NONE; - size_t num_prompt_tokens = 0; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; - json prompt; - std::vector embd; - std::vector last_n_tokens; + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; - llama_model *model = nullptr; - llama_context *ctx = nullptr; - gpt_params params; - llama_sampling_context ctx_sampling; - int n_ctx; + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + int32_t multibyte_pending = 0; - grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + bool infill = false; + bool has_next_token = true; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; + std::string stopping_word; - int32_t multibyte_pending = 0; - std::mutex mutex; + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; - std::unique_lock lock() { - return std::unique_lock(mutex); - } + // multimodal + std::vector images; - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; - void rewind() { - params.antiprompt.clear(); - params.grammar.clear(); + double t_prompt_processing; // ms + double t_token_generation; // ms + + void reset() { num_prompt_tokens = 0; - num_tokens_predicted = 0; generated_text = ""; - generated_text.reserve(n_ctx); - generated_token_probs.clear(); truncated = false; stopped_eos = false; stopped_word = false; stopped_limit = false; stopping_word = ""; multibyte_pending = 0; - n_remain = 0; n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + + generated_token_probs.clear(); + + for (slot_image &img : images) { + free(img.image_embedding); + delete[] img.img_data.data; + img.prefix_prompt = ""; + } + + images.clear(); + // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? 
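    // Note: reset() only clears per-request generation state and frees attached
    // image buffers. cache_tokens and ctx_sampling are handled elsewhere: the
    // prompt cache is matched against cache_tokens in update_slots(), and
    // ctx_sampling is freed and re-initialized in launch_slot_with_data().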
+ } + + bool has_budget(gpt_params &global_params) { + n_remaining = -1; + if (params.n_predict != -1) { + n_remaining = params.n_predict - n_decoded; + } else if (global_params.n_predict != -1) { + n_remaining = global_params.n_predict - n_decoded; + } + return n_remaining > 0 || n_remaining == -1; // no budget || limitless + } + + bool available() const { return state == IDLE && command == NONE; } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == IDLE || state == PROCESSING) { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json{ + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", + t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", + 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() { + LOG_TEE("\n"); + LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " + "token, %8.2f tokens per second)\n", + __func__, t_prompt_processing, num_prompt_tokens_processed, + t_prompt_processing / num_prompt_tokens_processed, + 1e3 / t_prompt_processing * num_prompt_tokens_processed); + LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per " + "token, %8.2f tokens per second)\n", + __func__, t_token_generation, n_decoded, + t_token_generation / n_decoded, + 1e3 / t_token_generation * n_decoded); + LOG_TEE("%s: total time = %10.2f ms\n", __func__, + t_prompt_processing + t_token_generation); + } +}; + +struct llama_server_context { + llama_model *model = nullptr; + llama_context *ctx = nullptr; + + clip_ctx *clp_ctx = nullptr; + + gpt_params params; + + llama_batch batch; + + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; + + int32_t id_gen; + int32_t n_ctx; // total context for all clients / slots + + // system prompt + bool system_need_update = false; + + std::string system_prompt; + std::vector system_tokens; + + std::string name_user; // this should be the antiprompt + std::string name_assistant; + + // slots / clients + std::vector slots; + + std::vector queue_tasks; + std::vector queue_results; + std::mutex mutex_tasks; + std::mutex mutex_results; - if (grammar != nullptr) { - llama_grammar_free(grammar); - grammar = nullptr; - ctx_sampling = llama_sampling_context_init(params, NULL); + ~llama_server_context() { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + if (model) { + llama_free_model(model); + model = nullptr; } } - bool loadModel(const gpt_params ¶ms_) { + bool load_model(const gpt_params ¶ms_) { params = params_; + if (!params.mmproj.empty()) { + multimodal = true; + LOG_TEE("Multi Modal Mode Enabled"); + clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); + if (clp_ctx == nullptr) { + LOG_ERROR_LLAMA("unable to load clip model", + {{"model", params.mmproj}}); + return false; + } + + if (params.n_ctx < + 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + std::tie(model, 
ctx) = llama_init_from_gpt_params(params); if (model == nullptr) { - LOG_ERROR_LLAMA("unable to load model", {{"model", params_.model}}); + LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}}); return false; } + + if (multimodal) { + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not " + "equal to that of LLaMA (%d). Make sure that you use the " + "correct mmproj file.\n", + __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + n_ctx = llama_n_ctx(ctx); - last_n_tokens.resize(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; } + void initialize() { + id_gen = 0; + + // create slots + all_slots_are_idle = true; + + const int32_t n_ctx_slot = n_ctx / params.n_parallel; + + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) { + llama_client_slot slot; + + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.reset(); + + LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + slots.push_back(slot); + } + + batch = llama_batch_init(n_ctx, 0, params.n_parallel); + + // empty system prompt + system_prompt = ""; + system_tokens.clear(); + } + std::vector tokenize(const json &json_prompt, bool add_bos) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, @@ -291,401 +614,1063 @@ struct llama_server_context { prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } - return prompt_tokens; - } + return prompt_tokens; + } + + llama_client_slot *get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + + for (llama_client_slot &slot : slots) { + if (slot.id == id && slot.available()) { + return &slot; + } + + if (slot.available() && slot.t_last_used < t_last) { + last_used = &slot; + t_last = slot.t_last_used; + } + } + + return last_used; + } + + bool launch_slot_with_data(llama_client_slot *&slot, json data) { + slot_params default_params; + llama_sampling_params default_sparams; + + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = + json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = + json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.penalty_last_n = + json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = + json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = + json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = + json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = + json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = + json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = + json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = + json_value(data, "penalize_nl", default_sparams.penalize_nl); + 
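    // json_value() (defined earlier in this header) returns the caller's default
    // when the key is absent or explicitly null, so any sampling field omitted
    // from the request body keeps its llama.cpp default here.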
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = + json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = + json_value(data, "n_probs", default_sparams.n_probs); + + // infill + if (data.count("input_prefix") != 0) { + slot->params.input_prefix = data["input_prefix"]; + } else { + slot->params.input_prefix = ""; + } + + if (data.count("input_suffix") != 0) { + slot->params.input_suffix = data["input_suffix"]; + } else { + slot->params.input_suffix = ""; + } + + if (data.count("prompt") != 0) { + slot->prompt = data["prompt"]; + } else { + slot->prompt = ""; + } + + slot->sparams.logit_bias.clear(); + + if (json_value(data, "ignore_eos", false)) { + slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } + + const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(model); + for (const auto &el : *logit_bias) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number()) { + slot->sparams.logit_bias[tok] = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + slot->sparams.logit_bias[tok] = -INFINITY; + } + } + } + } + } + + slot->params.antiprompt.clear(); + + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto &word : *stop) { + if (!word.empty()) { + slot->params.antiprompt.push_back(word); + } + } + } + + if (multimodal) { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) { + for (const auto &img : *images_data) { + std::string data_b64 = img["data"].get(); + slot_image img_sl; + img_sl.id = + img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + int width, height, channels; + std::vector image_buffer = base64_decode(data_b64); + data_b64.clear(); + auto data = + stbi_load_from_memory(image_buffer.data(), image_buffer.size(), + &width, &height, &channels, 3); + if (!data) { + LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, + img_sl.id); + return false; + } + LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", + slot->id, img_sl.id, width, height); + img_sl.img_data.nx = width; + img_sl.img_data.ny = height; + img_sl.img_data.size = width * height * 3; + img_sl.img_data.data = new uint8_t[width * height * 3](); + memcpy(img_sl.img_data.data, data, width * height * 3); + stbi_image_free(data); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); + } + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> + // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, + // {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find("]", pos); + if (end_pos != std::string::npos) { + std::string image_id = prompt.substr(pos, end_pos - pos); + try { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) { + if (img.id == img_id) { + found = true; + img.prefix_prompt = + prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument &e) { + LOG_TEE("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = + false; // multimodal doesn't support cache prompt + } + } + } + + if (slot->ctx_sampling != nullptr) { + llama_sampling_free(slot->ctx_sampling); + } + slot->ctx_sampling = llama_sampling_init(slot->sparams); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + + return true; + } + + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; + } + + void update_system_prompt() { + system_tokens = ::llama_tokenize(ctx, system_prompt, true); + + llama_batch_clear(batch); + + kv_cache_clear(); + + for (int i = 0; i < (int)system_tokens.size(); ++i) { + llama_batch_add(batch, system_tokens[i], i, {0}, false); + } + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } + + LOG_TEE("system prompt updated\n"); + system_need_update = false; + } + + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) { + slot.release(); + } + + system_need_update = true; + } + + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + 
name_assistant = sys_props.value("assistant_name", ""); + + if (slots.size() > 0) { + notify_system_prompt_changed(); + } + } + + static size_t find_stopping_strings(const std::string &text, + const size_t last_token_size, + const stop_type type, + llama_client_slot &slot) { + size_t stop_pos = std::string::npos; + + for (const std::string &word : slot.params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; + } + stop_pos = pos; + } + } + + return stop_pos; + } + + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during + // sampling + const std::string token_str = llama_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; + + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; + + if (slot.multibyte_pending > 0) { + slot.multibyte_pending -= token_str.size(); + } else if (token_str.size() == 1) { + const char c = token_str[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + slot.multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + slot.multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + slot.multibyte_pending = 3; + } else { + slot.multibyte_pending = 0; + } + } + + if (slot.multibyte_pending == 0) { + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = + find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) { + is_stop_full = true; + slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } else { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), + STOP_PARTIAL, slot); + } - bool loadGrammar() { - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - LOG_ERROR_LLAMA("grammar parse error", {{"grammar", params.grammar}}); - return false; + // check if there is any token to predict + if (stop_pos == std::string::npos || + (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { + // no send the stop word in the response + result.text_to_send = + slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache } - grammar_parser::print_grammar(stderr, parsed_grammar); - - { - auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.sampling_params.logit_bias.end() && - it->second == -INFINITY) { - LOG_WARNING_LLAMA( - "EOS token is disabled, which will cause most grammars to fail", - {}); - } + slot.add_token_string(result); + if (slot.params.stream) { + send_partial_response(slot, result); } - - 
std::vector grammar_rules( - parsed_grammar.c_rules()); - grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); } - ctx_sampling = llama_sampling_context_init(params, grammar); - return true; - } - void loadInfill() { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(" ") == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; + if (slot.multibyte_pending > 0 && !slot.has_next_token) { + slot.has_next_token = true; } - auto prefix_tokens = tokenize(params.input_prefix, false); - auto suffix_tokens = tokenize(params.input_suffix, false); - const int space_token = 29871; - if (suff_rm_leading_spc && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(ctx)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(ctx)); - auto prompt_tokens = prefix_tokens; - - num_prompt_tokens = prompt_tokens.size(); - - if (params.n_keep < 0) { - params.n_keep = (int)num_prompt_tokens; + // check the limits + if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { + slot.stopped_limit = true; + slot.has_next_token = false; } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) { - printf("Input prompt is too big, truncating. Can only take %d tokens but " - "got %zu\n", - params.n_ctx, num_prompt_tokens); - // todo we probably want to cut from both sides - const int n_left = (params.n_ctx - params.n_keep) / 2; - std::vector new_tokens( - prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = - (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + params.n_keep + - erased_blocks * n_left, - prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), - last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", - { - {"n_ctx", params.n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - truncated = true; - prompt_tokens = new_tokens; - } else { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), - last_n_tokens.end() - ps); + if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); } - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - embd = prompt_tokens; + LOG_VERBOSE( + "next token", + { + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); + + return slot.has_next_token; // continue + } - if (n_past 
== num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - printf("we have to evaluate at least 1 token to generate logits\n"); - n_past--; + bool process_images(llama_client_slot &slot) const { + for (slot_image &img : slot.images) { + if (!img.request_encode_image) { + continue; + } + clip_image_f32 img_res; + if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, + /*pad2square =*/true)) { + LOG_TEE("Error processing the given image"); + clip_free(clp_ctx); + return false; + } + img.image_tokens = clip_n_patches(clp_ctx); + img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); + if (!img.image_embedding) { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + clip_free(clp_ctx); + return false; + } + LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); + if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, + img.image_embedding)) { + LOG_TEE("Unable to encode image\n"); + return false; + } + img.request_encode_image = false; } - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + return slot.images.size() > 0; + } - LOG_VERBOSE("prompt ingested", - { - {"n_past", n_past}, - {"cached", - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); + void send_error(int id, std::string error) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = id; + res.error = true; + res.result_json = {{"content", error}}; + queue_results.push_back(res); + } - has_next_token = true; + json get_model_props() { return get_formated_generation(slots[0]); } + + json get_formated_generation(llama_client_slot &slot) { + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && + eos_bias->second < 0.0f && + std::isinf(eos_bias->second); + return json{ + {"n_ctx", slot.n_ctx}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temp", slot.sparams.temp}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"grammar", slot.sparams.grammar}, + }; } - void loadPrompt() { - auto prompt_tokens = tokenize(prompt, true); // always add BOS - num_prompt_tokens = prompt_tokens.size(); + void send_partial_response(llama_client_slot &slot, + completion_token_output tkn) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = false; + + res.result_json = json{{"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal}}; + + if (slot.sparams.n_probs > 0) { + std::vector probs_output = {}; + const std::vector to_send_toks = + llama_tokenize(ctx, tkn.text_to_send, false); + size_t 
probs_pos = std::min(slot.sent_token_probs_index, + slot.generated_token_probs.size()); + size_t probs_stop_pos = + std::min(slot.sent_token_probs_index + to_send_toks.size(), + slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) { + probs_output = std::vector( + slot.generated_token_probs.begin() + probs_pos, + slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + res.result_json["completion_probabilities"] = + probs_vector_to_json(ctx, probs_output); + } + + queue_results.push_back(res); + } - if (params.n_keep < 0) { - params.n_keep = (int)num_prompt_tokens; + void send_final_response(llama_client_slot &slot) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + res.result_json = + json{{"content", !slot.params.stream ? slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()}}; + + if (slot.sparams.n_probs > 0) { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) { + const std::vector stop_word_toks = + llama_tokenize(ctx, slot.stopping_word, false); + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end() - stop_word_toks.size()); + } else { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.begin() + slot.sent_token_probs_index); + } + res.result_json["completion_probabilities"] = + probs_vector_to_json(ctx, probs); } - params.n_keep = std::min(n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)n_ctx) { - const int n_left = (n_ctx - params.n_keep) / 2; - std::vector new_tokens( - prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = - (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + params.n_keep + - erased_blocks * n_left, - prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), - last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", - { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - truncated = true; - prompt_tokens = new_tokens; + queue_results.push_back(res); + } + + void send_embedding(llama_client_slot &slot) { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + const int n_embd = llama_n_embd(model); + if (!params.embedding) { + LOG_WARNING_LLAMA("embedding disabled", + { + {"params.embedding", params.embedding}, + }); + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; } else { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), - last_n_tokens.end() - ps); + const float *data = llama_get_embeddings(ctx); + std::vector embedding(data, 
data + n_embd); + res.result_json = json{ + {"embedding", embedding}, + }; } + queue_results.push_back(res); + } + + int request_completion(json data, bool infill) { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.data = data; + task.infill_mode = infill; + task.type = COMPLETION_TASK; + queue_tasks.push_back(task); + return task.id; + } - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); + task_result next_result(int task_id) { + while (true) { + std::this_thread::sleep_for(std::chrono::microseconds(5)); + std::lock_guard lock(mutex_results); - embd = prompt_tokens; - if (n_past == num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - n_past--; + if (queue_results.empty()) { + continue; + } + + for (int i = 0; i < (int)queue_results.size(); i++) { + if (queue_results[i].id == task_id) { + task_result res = queue_results[i]; + queue_results.erase(queue_results.begin() + i); + return res; + } + } } - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + // never reached + // return task_result{-1, false, false, {}}; + } + + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) { + int image_idx = 0; + + while (image_idx < (int)slot.images.size()) { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { + const int32_t n_tokens = + std::min(n_batch, (int32_t)(batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, + 0, + 0, // unused + }; + if (llama_decode(ctx, batch_view)) { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + } - LOG_VERBOSE("prompt ingested", - { - {"n_past", n_past}, - {"cached", - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + + const int n_embd = llama_n_embd(model); + llama_batch batch_img = { + n_eval, nullptr, (img.image_embedding + i * n_embd), + nullptr, nullptr, nullptr, + nullptr, slot.n_past, 1, + 0, + }; + if (llama_decode(ctx, batch_img)) { + LOG_TEE("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; + + llama_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = + (image_idx >= (int)slot.images.size()) + ? 
slot.params.input_suffix + : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = + tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int)append_tokens.size(); ++i) { + llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); + slot.n_past += 1; + } + } - has_next_token = true; + return true; } - void beginCompletion() { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_set_rng_seed(ctx, params.seed); + void request_cancel(int task_id) { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.type = CANCEL_TASK; + task.target_id = task_id; + queue_tasks.push_back(task); } - completion_token_output nextToken() { - completion_token_output result; - result.tok = -1; + void process_tasks() { + std::lock_guard lock(mutex_tasks); + while (!queue_tasks.empty()) { + // LOG_INFO << "test tasks"; + task_server task = queue_tasks.front(); + queue_tasks.erase(queue_tasks.begin()); + switch (task.type) { + case COMPLETION_TASK: { + llama_client_slot *slot = + get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) { + LOG_TEE("slot unavailable\n"); + // send error result + send_error(task.id, "slot unavaliable"); + return; + } + + if (task.data.contains("system_prompt")) { + process_system_prompt_data(task.data["system_prompt"]); + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->task_id = task.id; - if (embd.size() >= (size_t)n_ctx) { - // Shift context + if (!launch_slot_with_data(slot, task.data)) { + // send error result + send_error(task.id, "internal_error"); + break; + } + } break; + case CANCEL_TASK: { // release slot linked with the task id + for (auto &slot : slots) { + if (slot.task_id == task.target_id) { + slot.release(); + slot.print_timings(); + break; + } + } + } break; + } + } + } - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left / 2; + bool update_slots() { + // attend tasks + process_tasks(); + + // update the system prompt wait until all slots are idle state + if (system_need_update && all_slots_are_idle) { + LOG_TEE("updating system prompt\n"); + update_system_prompt(); + } - llama_kv_cache_seq_rm(ctx, 0, params.n_keep + 1, - params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, - -n_discard); + llama_batch_clear(batch); - for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) { - embd[i - n_discard] = embd[i]; + if (all_slots_are_idle) { + if (system_prompt.empty() && clean_kv_cache) { + LOG_TEE("all slots are idle and system prompt is empty, clear the KV " + "cache\n"); + kv_cache_clear(); } - embd.resize(embd.size() - n_discard); + // avoid 100% usage of cpu all time + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + + for (llama_client_slot &slot : slots) { + if (slot.is_processing() && + slot.cache_tokens.size() >= (size_t)slot.n_ctx) { + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard " + "= %d\n", + slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, + slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, slot.id, + slot.params.n_keep + 1 + n_discard, + slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; + i < 
slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - n_past -= n_discard; + slot.n_past -= n_discard; - truncated = true; - LOG_VERBOSE("input truncated", { + slot.truncated = true; + + LOG_VERBOSE("context shift", { {"n_ctx", n_ctx}, {"n_keep", params.n_keep}, {"n_left", n_left}, }); + } } - bool tg = true; - while (n_past < embd.size()) { - int n_eval = (int)embd.size() - n_past; - tg = n_eval == 1; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; + // decode any currently ongoing sequences + for (auto &slot : slots) { + // release the slot + if (slot.command == RELEASE) { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, + (int)slot.cache_tokens.size()); + + continue; } - if (llama_decode(ctx, - llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) { - LOG_ERROR_LLAMA("failed to eval", - { - {"n_eval", n_eval}, - {"n_past", n_past}, - {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, - embd.cend())}, - }); - has_next_token = false; - return result; + if (slot.state == IDLE) { + continue; } - n_past += n_eval; - } - if (params.n_predict == 0) { - has_next_token = false; - result.tok = llama_token_eos(ctx); - return result; + slot.i_batch = batch.n_tokens; + + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, + {slot.id}, true); + + slot.n_decoded += 1; + slot.n_past += 1; } - { - // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_n_vocab(model)); + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) { + for (auto &slot : slots) { + const bool has_prompt = slot.prompt.is_array() || + (slot.prompt.is_string() && + !slot.prompt.get().empty()) || + !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && + params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && + suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), + llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), + llama_token_bos(model)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), + llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), + suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } else { + prompt_tokens = tokenize( + slot.prompt, + system_prompt.empty()); // add BOS if 
there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); - result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, - candidates); + if (!slot.params.cache_prompt) { + llama_sampling_reset(slot.ctx_sampling); - llama_token_data_array candidates_p = {candidates.data(), - candidates.size(), false}; + slot.n_past = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } else { + if (slot.params.n_keep < 0) { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = + (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / + n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), + prompt_tokens.begin() + + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), + prompt_tokens.begin() + slot.params.n_keep + + erased_blocks * n_block_size, + prompt_tokens.end()); + + LOG_VERBOSE( + "input truncated", + { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), + new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; - const int32_t n_probs = params.sampling_params.n_probs; - if (params.sampling_params.temp <= 0 && n_probs > 0) { - // For llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &candidates_p); - } + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } - for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); - ++i) { - result.probs.push_back( - {candidates_p.data[i].id, candidates_p.data[i].p}); - } + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) { + llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(result.tok); - if (tg) { - num_tokens_predicted++; - } - } + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.num_prompt_tokens_processed = + slot.num_prompt_tokens - slot.n_past; - // add it to the context - embd.push_back(result.tok); - // decrement remaining sampling budget - --n_remain; + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", + slot.id, slot.n_past, slot.num_prompt_tokens_processed); + } - if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_piece(ctx, embd.back()); - has_next_token = false; - stopped_eos = true; - LOG_VERBOSE("eos token found", {}); - return result; - } + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, + (int)system_tokens.size() + slot.n_past); - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } + llama_kv_cache_seq_rm(ctx, slot.id, + system_tokens.size() + slot.n_past, -1); - size_t findStoppingStrings(const std::string &text, - const size_t last_token_size, - const stop_type type) { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - stopping_word = word; - stopped_word = true; - has_next_token = false; + slot.cache_tokens = prompt_tokens; + + if (slot.n_past == slot.num_prompt_tokens) { + // we have to evaluate at least 1 token to generate logits. + LOG_TEE("slot %d : we have to evaluate at least 1 token to " + "generate logits\n", + slot.id); + slot.n_past--; + } + + LOG_VERBOSE( + "prompt ingested", + { + {"n_past", slot.n_past}, + {"cached", + tokens_to_str(ctx, slot.cache_tokens.cbegin(), + slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", + tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, + slot.cache_tokens.cend())}, + }); + + const bool has_images = process_images(slot); + + // process the prefix of first image + std::vector prefix_tokens = + has_images ? tokenize(slot.images[0].prefix_prompt, true) + : prompt_tokens; + for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { + llama_batch_add(batch, prefix_tokens[slot.n_past], + system_tokens.size() + slot.n_past, {slot.id}, + false); + } + + if (has_images && !ingest_images(slot, n_batch)) { + LOG_TEE("failed processing images\n"); + return false; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; } - stop_pos = pos; } } - return stop_pos; - } - completion_token_output doCompletion() { - auto token_with_probs = nextToken(); + if (batch.n_tokens == 0) { + all_slots_are_idle = true; + return true; + } - const std::string token_text = - token_with_probs.tok == -1 - ? 
"" - : llama_token_to_piece(ctx, token_with_probs.tok); - generated_text += token_text; + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, + 0, + 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + // if you get here, it means the KV cache is full - try increasing it + // via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", + __func__, n_batch, ret); + return false; + } - if (params.sampling_params.n_probs > 0) { - generated_token_probs.push_back(token_with_probs); - } + LOG_TEE("%s : failed to find free space in the KV cache, retrying with " + "smaller n_batch = %d\n", + __func__, n_batch / 2); - if (multibyte_pending > 0) { - multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { - const char c = token_text[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; - } else { - multibyte_pending = 0; + // retry with half the batch size to try to find a free slot in the KV + // cache + n_batch /= 2; + i -= n_batch; + continue; } - } - if (multibyte_pending > 0 && !has_next_token) { - has_next_token = true; - n_remain++; - } + for (auto &slot : slots) { + if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { + continue; + } - if (!has_next_token && n_remain == 0) { - stopped_limit = true; - } + // prompt evaluated for embedding + if (params.embedding) { + send_embedding(slot); + slot.release(); + slot.i_batch = -1; + return true; + } - LOG_VERBOSE("next token", - { - {"token", token_with_probs.tok}, - {"token_text", tokens_to_output_formatted_string( - ctx, token_with_probs.tok)}, - {"has_next_token", has_next_token}, - {"n_remain", n_remain}, - {"num_tokens_predicted", num_tokens_predicted}, - {"stopped_eos", stopped_eos}, - {"stopped_word", stopped_word}, - {"stopped_limit", stopped_limit}, - {"stopping_word", stopping_word}, - }); - - return token_with_probs; - } + completion_token_output result; + const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, + NULL, slot.i_batch - i); - std::vector getEmbedding() { - static const int n_embd = llama_n_embd(model); - if (!params.embedding) { - LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); - return std::vector(n_embd, 0.0f); + llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + + if (slot.n_decoded == 1) { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = + (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + } + + llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), + slot.ctx_sampling->cur.size(), false}; + result.tok = id; + + const int32_t n_probs = slot.sparams.n_probs; + if (slot.sparams.temp <= 0 && n_probs > 0) { + // for llama_sample_token_greedy we need to sort candidates + llama_sample_softmax(ctx, &cur_p); + } + + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + if 
(!process_token(result, slot)) { + slot.release(); + slot.print_timings(); + send_final_response(slot); + } + + slot.i_batch = -1; + } } - const float *data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); - return embedding; + return true; } }; @@ -763,11 +1748,22 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf( " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); + printf(" -np N, --parallel N number of slots for process requests " + "(default: %d)\n", + params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic " + "batching) (default: disabled)\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" Set a file to load a system prompt (initial " + "prompt of all slots), this is useful for chat applications.\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for " + "LLaVA.\n"); printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms) { + gpt_params ¶ms, + llama_server_context &llama) { gpt_params default_params; server_params default_sparams; std::string arg; @@ -892,19 +1888,17 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } #else - LOG_WARNING_LLAMA( - "llama.cpp was compiled without cuBLAS. It is not possible " - "to set a tensor split.\n", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " + "possible to set a tensor split.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS params.mul_mat_q = false; #else - LOG_WARNING_LLAMA( - "warning: llama.cpp was compiled without cuBLAS. Disabling " - "mul_mat_q kernels has no effect.\n", - {}); + LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. " + "Disabling mul_mat_q kernels has no effect.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { @@ -914,10 +1908,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, #ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); #else - LOG_WARNING_LLAMA( - "llama.cpp was compiled without cuBLAS. It is not possible " - "to set a main GPU.", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. 
It is not " + "possible to set a main GPU.", + {}); #endif } else if (arg == "--lora") { if (++i >= argc) { @@ -959,6 +1952,42 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, params.numa = true; } else if (arg == "--embedding") { params.embedding = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string systm_content; + std::copy(std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(systm_content)); + llama.process_system_prompt_data(json::parse(systm_content)); + } else if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mmproj = argv[i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -973,100 +2002,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } -static json format_generation_settings(llama_server_context &llama) { - const auto &sparams = llama.params.sampling_params; - const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); - const bool ignore_eos = eos_bias != sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - - return json{ - {"n_ctx", llama.n_ctx}, - {"model", llama.params.model_alias}, - {"seed", llama.params.seed}, - {"temp", sparams.temp}, - {"top_k", sparams.top_k}, - {"top_p", sparams.top_p}, - {"tfs_z", sparams.tfs_z}, - {"typical_p", sparams.typical_p}, - {"repeat_last_n", sparams.repeat_last_n}, - {"repeat_penalty", sparams.repeat_penalty}, - {"presence_penalty", sparams.presence_penalty}, - {"frequency_penalty", sparams.frequency_penalty}, - {"mirostat", sparams.mirostat}, - {"mirostat_tau", sparams.mirostat_tau}, - {"mirostat_eta", sparams.mirostat_eta}, - {"penalize_nl", sparams.penalize_nl}, - {"stop", llama.params.antiprompt}, - {"n_predict", llama.params.n_predict}, - {"n_keep", llama.params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", llama.stream}, - {"logit_bias", sparams.logit_bias}, - {"n_probs", sparams.n_probs}, - {"grammar", llama.params.grammar}, - }; -} - -static json format_embedding_response(llama_server_context &llama) { - return json{ - {"embedding", llama.getEmbedding()}, - }; -} - -static json format_timings(llama_server_context &llama) { - const auto timings = llama_get_timings(llama.ctx); - - return json{ - {"prompt_n", timings.n_p_eval}, - {"prompt_ms", timings.t_p_eval_ms}, - {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, - {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, - - {"predicted_n", timings.n_eval}, - {"predicted_ms", timings.t_eval_ms}, - {"predicted_per_token_ms", timings.t_eval_ms / timings.n_eval}, - {"predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval}, - }; -} - -static json -format_final_response(llama_server_context &llama, const std::string &content, - const std::vector 
&probs) { - - json res = json{ - {"content", content}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"tokens_evaluated", llama.num_prompt_tokens}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.prompt}, - {"truncated", llama.truncated}, - {"stopped_eos", llama.stopped_eos}, - {"stopped_word", llama.stopped_word}, - {"stopped_limit", llama.stopped_limit}, - {"stopping_word", llama.stopping_word}, - {"tokens_cached", llama.n_past}, - {"timings", format_timings(llama)}, - }; - - if (llama.params.sampling_params.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - static json -format_partial_response(llama_server_context &llama, const std::string &content, +format_partial_response(llama_server_context &llama, llama_client_slot *slot, + const std::string &content, const std::vector &probs) { - json res = json{ - {"content", content}, - {"stop", false}, - }; + json res = json{{"content", content}, + {"stop", false}, + {"slot_id", slot->id}, + {"multimodal", llama.multimodal}}; - if (llama.params.sampling_params.n_probs > 0) { + if (slot->sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1081,155 +2026,6 @@ static json format_detokenized_response(std::string content) { return json{{"content", content}}; } -template -static T json_value(const json &body, const std::string &key, - const T &default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; -} - -static void parse_options_completion(const json &body, - llama_server_context &llama) { - gpt_params default_params; - const auto &default_sparams = default_params.sampling_params; - auto &sparams = llama.params.sampling_params; - - llama.stream = json_value(body, "stream", false); - llama.params.n_predict = - json_value(body, "n_predict", default_params.n_predict); - sparams.top_k = json_value(body, "top_k", default_sparams.top_k); - sparams.top_p = json_value(body, "top_p", default_sparams.top_p); - sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); - sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); - sparams.repeat_last_n = - json_value(body, "repeat_last_n", default_sparams.repeat_last_n); - sparams.temp = json_value(body, "temperature", default_sparams.temp); - sparams.repeat_penalty = - json_value(body, "repeat_penalty", default_sparams.repeat_penalty); - sparams.presence_penalty = - json_value(body, "presence_penalty", default_sparams.presence_penalty); - sparams.frequency_penalty = - json_value(body, "frequency_penalty", default_sparams.frequency_penalty); - sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); - sparams.mirostat_tau = - json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - sparams.mirostat_eta = - json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - sparams.penalize_nl = - json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); - llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.grammar = json_value(body, "grammar", default_params.grammar); - sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); - - if (body.count("prompt") != 0) { - llama.prompt = body["prompt"]; - } else { - llama.prompt = 
""; - } - - sparams.logit_bias.clear(); - if (json_value(body, "ignore_eos", false)) { - sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; - } - - const auto &logit_bias = body.find("logit_bias"); - if (logit_bias != body.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(llama.model); - for (const auto &el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - llama.params.antiprompt.clear(); - const auto &stop = body.find("stop"); - if (stop != body.end() && stop->is_array()) { - for (const auto &word : *stop) { - if (!word.empty()) { - llama.params.antiprompt.push_back(word); - } - } - } - - llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar); - - LOG_VERBOSE("completion parameters parsed", - format_generation_settings(llama)); -} - -static void parse_options_infill(const json &body, - llama_server_context &llama) { - if (body.count("input_prefix") != 0) { - llama.params.input_prefix = body["input_prefix"]; - } else { - llama.params.input_prefix = ""; - } - if (body.count("input_suffix") != 0) { - llama.params.input_suffix = body["input_suffix"]; - } else { - llama.params.input_suffix = ""; - } - parse_options_completion(body, llama); -} - -static bool is_at_eob(llama_server_context &server_context, - const llama_token *tokens, const size_t n_tokens) { - return n_tokens && - tokens[n_tokens - 1] == llama_token_eos(server_context.ctx); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam -// tokens if any. -// * When all beams converge to a common prefix, they are made available in -// beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to -// by callback_data. -static void beam_search_callback(void *callback_data, - llama_beams_state beams_state) { - auto &llama = *static_cast(callback_data); - // Mark beams as EOS as needed. 
- for (size_t i = 0; i < beams_state.n_beams; ++i) { - llama_beam_view &beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && - is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - llama.generated_token_probs.resize(llama.generated_token_probs.size() + n); - assert(0u < beams_state.n_beams); - const llama_token *tokens = beams_state.beam_views[0].tokens; - const auto map = [](llama_token tok) { - return completion_token_output{{}, tok}; - }; - std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, - map); - printf("%zu", n); - } - fflush(stdout); -#if 0 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams:\n"; - for (size_t i=0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["<generated_token_probs; auto translator = token_translator{llama.ctx}; auto add_strlen = [=](size_t sum, const completion_token_output &cto) { return sum + translator(cto).size(); }; const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (llama.generated_text.capacity() < llama.generated_text.size() + len) { - llama.generated_text.reserve(llama.generated_text.size() + len); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) { + slot->generated_text.reserve(slot->generated_text.size() + len); } for (const completion_token_output &cto : gtps) { - llama.generated_text += translator(cto); + slot->generated_text += translator(cto); } } @@ -1264,9 +2061,11 @@ class llamaCPP : public drogon::HttpController { public: llamaCPP() { // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for target - // system () + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () } + METHOD_LIST_BEGIN // list path definitions here; METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post); @@ -1282,10 +2081,13 @@ class llamaCPP : public drogon::HttpController { std::function &&callback); void warmupModel(); + void backgroundTask(); + private: llama_server_context llama; bool model_loaded = false; size_t sent_count = 0; size_t sent_token_probs_index = 0; + std::thread backgroundThread; }; }; // namespace inferences diff --git a/ext_libs/libcrypto-3-x64.dll b/ext_libs/libcrypto-3-x64.dll deleted file mode 100644 index b9223ec18..000000000 Binary files a/ext_libs/libcrypto-3-x64.dll and /dev/null differ diff --git a/ext_libs/libssl-3-x64.dll b/ext_libs/libssl-3-x64.dll deleted file mode 100644 index d48518e4f..000000000 Binary files a/ext_libs/libssl-3-x64.dll and /dev/null differ diff --git a/llama.cpp b/llama.cpp index 281ef73c2..207b51900 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 281ef73c258cc1eebec8a64264240432d5878c4b +Subproject commit 207b51900e15cc7f89763a3bb1c565fe11cbb45d diff --git a/main.cc b/main.cc index 64f556cb1..c8ab01856 100644 --- a/main.cc +++ b/main.cc @@ -1,5 +1,6 @@ #include "utils/nitro_utils.h" #include // for PATH_MAX +#include #include #if defined(__APPLE__) && defined(__MACH__) @@ -15,24 +16,32 @@ #endif int main(int argc, char *argv[]) { - + int thread_num = std::thread::hardware_concurrency(); std::string host = "127.0.0.1"; int port = 3928; - // Check for host argument + // Number of nitro threads if (argc > 1) { - host = argv[1]; + thread_num = std::atoi(argv[1]); } - // Check for port argument + // Check for host argument if (argc > 2) { - 
port = std::atoi(argv[2]); // Convert string argument to int + host = argv[2]; + } + + // Check for port argument + if (argc > 3) { + port = std::atoi(argv[3]); // Convert string argument to int } nitro_utils::nitro_logo(); LOG_INFO << "Server started, listening at: " << host << ":" << port; LOG_INFO << "Please load your model"; drogon::app().addListener(host, port); + drogon::app().setThreadNum(thread_num); + LOG_INFO << "Number of threads is: " << drogon::app().getThreadNum(); + drogon::app().run(); return 0;
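// NOTE (editorial, not part of the patch): the hunk above changes the
// positional-argument order of main.cc -- argv[1] is now the Drogon thread
// count, argv[2] the listen host, argv[3] the listen port. The minimal
// standalone sketch below only restates that parsing logic in isolation;
// the defaults (hardware_concurrency(), 127.0.0.1, 3928) mirror the diff,
// while the printout and the example invocation are illustrative only.
#include <cstdlib>
#include <iostream>
#include <string>
#include <thread>

int main(int argc, char *argv[]) {
  int thread_num = std::thread::hardware_concurrency();
  std::string host = "127.0.0.1";
  int port = 3928;

  if (argc > 1) thread_num = std::atoi(argv[1]); // number of server threads
  if (argc > 2) host = argv[2];                  // listen address
  if (argc > 3) port = std::atoi(argv[3]);       // listen port

  std::cout << "threads=" << thread_num << " host=" << host
            << " port=" << port << "\n";
  return 0;
}
// Example (assuming the built binary is named nitro):
//   ./nitro 4 0.0.0.0 3928   ->   threads=4 host=0.0.0.0 port=3928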