sync : llama.cpp #1070

Merged · 23 commits · Jan 14, 2025
Commits
b19d9cc
fix: Vulkan shader gen binary path (llama/11037)
giladgd Jan 4, 2025
dc5f0bc
Support for models with non-512-aligned tensors over RPC. (llama/11047)
matt23654 Jan 4, 2025
81fde73
Vulkan: Add device-specific blacklist for coopmat for the AMD proprie…
0cc4m Jan 4, 2025
5514692
CUDA: add BF16 support (llama/11093)
JohannesGaessler Jan 6, 2025
20a72f1
SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 (ll…
qnixsynapse Jan 7, 2025
ef9e2eb
rpc : code cleanup (llama/11107)
rgerganov Jan 7, 2025
4630db8
ggml-backend : only offload from host buffers (llama/11120)
slaren Jan 7, 2025
7150212
ggml-backend : only offload from host buffers (fix) (llama/11124)
slaren Jan 7, 2025
bacc721
GGUF: C++ refactor, backend support, misc fixes (llama/11030)
JohannesGaessler Jan 7, 2025
6a87a43
fix: Vulkan shader gen binary path when Cross-compiling (llama/11096)
ag2s20150909 Jan 8, 2025
454cfa3
Disable GL_KHR_cooperative_matrix Vulkan extension if not available. …
mbaudier Jan 8, 2025
7de0ddc
llamafile : ppc64le MMA INT8 implementation (llama/10912)
amritahs-ibm Jan 8, 2025
d50565e
fix: add missing msg in static_assert (llama/11143)
hydai Jan 8, 2025
620b188
SYCL: Refactor ggml_sycl_compute_forward (llama/11121)
qnixsynapse Jan 10, 2025
17b0a11
llama: add support for QRWKV6 model architecture (llama/11001)
MollySophia Jan 10, 2025
67b8cc9
Vulkan: Fix float16 use on devices without float16 support + fix subg…
0cc4m Jan 10, 2025
e819be6
ggml : do not define GGML_USE_CUDA when building with GGML_BACKEND_DL…
rgerganov Jan 13, 2025
aad9107
cuda : CUDA Graph Compute Function Refactor (precursor for performanc…
aendk Jan 13, 2025
23fbf2f
sync : llama.cpp
ggerganov Jan 14, 2025
f6cbb38
scripts : sync opencl
ggerganov Jan 14, 2025
ced5e67
ggml : add opencl backend (skip) (llama/10693)
lhez Jan 14, 2025
18e8b10
scripts : sync gguf
ggerganov Jan 14, 2025
daca9a1
GGUF: C++ refactor, backend support, misc fixes (skip) (llama/11030)
JohannesGaessler Jan 14, 2025
CMakeLists.txt (3 changes: 2 additions & 1 deletion)
@@ -243,7 +243,8 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
-    include/ggml-vulkan.h)
+    include/ggml-vulkan.h
+    include/gguf.h)

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
examples/magika/main.cpp (1 change: 1 addition & 0 deletions)
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "gguf.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
examples/mnist/mnist-common.h (1 change: 1 addition & 0 deletions)
@@ -8,6 +8,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "ggml-cpu.h"
 #include "ggml-opt.h"
examples/yolo/yolov3-tiny.cpp (1 change: 1 addition & 0 deletions)
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "gguf.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
include/ggml-cpp.h (1 change: 1 addition & 0 deletions)
@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 #include <memory>

 // Smart pointers for ggml types
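
With this sync, the GGUF API moves out of ggml.h into a standalone gguf.h, so code that previously got it transitively must now include it explicitly, as the four diffs above do. A minimal metadata-dump sketch against the relocated API; the function names appear in the ggml.h diff that follows, "model.gguf" is a placeholder path, and the refactored header may widen some integer return types:

// Minimal sketch (assumption: built against this sync, with gguf.h on the
// include path; "model.gguf" is a placeholder). Dumps GGUF metadata using
// the functions whose declarations appear in the ggml.h diff below.
#include "ggml.h"
#include "gguf.h"   // no longer reachable through ggml.h alone

#include <stdio.h>

int main(void) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,  // metadata only; do not allocate tensor data
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) {
        fprintf(stderr, "failed to load model.gguf\n");
        return 1;
    }

    printf("version: %d, alignment: %zu\n", gguf_get_version(ctx), gguf_get_alignment(ctx));

    for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
        printf("kv[%d] %s: %s\n", i, gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }

    for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("tensor[%d] %s @ %zu\n", i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    return 0;
}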
include/ggml.h (150 changes: 10 additions & 140 deletions)
@@ -241,12 +241,6 @@
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24

-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -403,12 +397,6 @@ extern "C" {
         GGML_PREC_F32,
     };

-    enum ggml_backend_type {
-        GGML_BACKEND_TYPE_CPU = 0,
-        GGML_BACKEND_TYPE_GPU = 10,
-        GGML_BACKEND_TYPE_GPU_SPLIT = 20,
-    };
-
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -513,6 +501,7 @@ extern "C" {
         GGML_OP_GET_REL_POS,
         GGML_OP_ADD_REL_POS,
         GGML_OP_RWKV_WKV6,
+        GGML_OP_GATED_LINEAR_ATTN,

         GGML_OP_UNARY,

@@ -587,8 +576,6 @@ extern "C" {
     struct ggml_tensor {
         enum ggml_type type;

-        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
-
         struct ggml_backend_buffer * buffer;

         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -1873,6 +1860,15 @@ extern "C" {
             struct ggml_tensor * td,
             struct ggml_tensor * state);

+    GGML_API struct ggml_tensor * ggml_gated_linear_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor * k,
+            struct ggml_tensor * v,
+            struct ggml_tensor * q,
+            struct ggml_tensor * g,
+            struct ggml_tensor * state,
+            float scale);
+
     // custom operators

     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
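
The hunk above adds the public entry point for the new gated linear attention op (used by the QRWKV6 architecture from llama/11001), alongside GGML_OP_GATED_LINEAR_ATTN in the op enum. A minimal wiring sketch; the shapes and scale below are illustrative assumptions, not a documented contract:

// Hypothetical graph wiring for the new op. Shapes are illustrative
// assumptions (head_dim x n_head x n_tokens inputs, one head_dim x head_dim
// state block per head), not a documented contract.
#include "ggml.h"

#include <math.h>

static struct ggml_tensor * build_gla(struct ggml_context * ctx,
                                      int64_t head_dim, int64_t n_head, int64_t n_tokens) {
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);
    struct ggml_tensor * g = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);

    // recurrent state carried between calls
    struct ggml_tensor * state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, head_dim, n_head);

    // 1/sqrt(head_dim) is the usual attention scaling; treat it as an assumption here
    return ggml_gated_linear_attn(ctx, k, v, q, g, state, 1.0f / sqrtf((float) head_dim));
}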
@@ -2111,132 +2107,6 @@ extern "C" {
             int64_t n_per_row,
             const float * imatrix);

-    //
-    // gguf
-    //
-
-    enum gguf_type {
-        GGUF_TYPE_UINT8   = 0,
-        GGUF_TYPE_INT8    = 1,
-        GGUF_TYPE_UINT16  = 2,
-        GGUF_TYPE_INT16   = 3,
-        GGUF_TYPE_UINT32  = 4,
-        GGUF_TYPE_INT32   = 5,
-        GGUF_TYPE_FLOAT32 = 6,
-        GGUF_TYPE_BOOL    = 7,
-        GGUF_TYPE_STRING  = 8,
-        GGUF_TYPE_ARRAY   = 9,
-        GGUF_TYPE_UINT64  = 10,
-        GGUF_TYPE_INT64   = 11,
-        GGUF_TYPE_FLOAT64 = 12,
-        GGUF_TYPE_COUNT,       // marks the end of the enum
-    };
-
-    struct gguf_context;
-
-    struct gguf_init_params {
-        bool no_alloc;
-
-        // if not NULL, create a ggml_context and allocate the tensor data in it
-        struct ggml_context ** ctx;
-    };
-
-    GGML_API struct gguf_context * gguf_init_empty(void);
-    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-    GGML_API void gguf_free(struct gguf_context * ctx);
-
-    GGML_API const char * gguf_type_name(enum gguf_type type);
-
-    GGML_API int    gguf_get_version    (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-    GGML_API void * gguf_get_data       (const struct gguf_context * ctx);
-
-    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
-    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-    // will abort if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
-
-    // removes key if it exists
-    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
-
-    // overrides existing values or adds a new one
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
-    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-    // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-    // manage tensor info
-    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-    // writing gguf files can be done in 2 ways:
-    //
-    // - write the entire gguf_context to a binary file in a single pass:
-    //
-    //   gguf_write_to_file(ctx, fname);
-    //
-    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-    //
-    //   FILE * f = fopen(fname, "wb");
-    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-    //   fwrite(f, ...);
-    //   void * data = gguf_meta_get_meta_data(ctx);
-    //   fseek(f, 0, SEEK_SET);
-    //   fwrite(f, data, gguf_get_meta_size(ctx));
-    //   free(data);
-    //   fclose(f);
-    //
-
-    // write the entire context to a binary file
-    GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
 #ifdef __cplusplus
 // restrict not standard in C++
 # if defined(__GNUC__)
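
The removed comment block above describes the two-pass write path, but its inline sketch passes fwrite's arguments in the wrong order and names gguf_meta_get_meta_data, which does not match the declared gguf_get_meta_data(ctx, data). A corrected sketch of the same flow, assuming the relocated gguf.h keeps these signatures; write_tensor_data is a hypothetical placeholder for the caller's own tensor I/O:

// Corrected two-pass write (a sketch, assuming the relocated gguf.h keeps
// the signatures shown above): reserve room for the metadata, stream the
// tensor data, then back-fill header + kv pairs + tensor info.
#include "gguf.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

bool gguf_write_two_pass(const struct gguf_context * ctx, const char * fname) {
    FILE * f = fopen(fname, "wb");
    if (!f) {
        return false;
    }

    const size_t meta_size = gguf_get_meta_size(ctx);

    // pass 1: leave a hole for the metadata, then write the tensor data
    fseek(f, (long) meta_size, SEEK_SET);
    // write_tensor_data(f, ...);  // hypothetical caller-provided tensor I/O

    // pass 2: back-fill the metadata at the start of the file
    void * meta = malloc(meta_size);
    if (!meta) {
        fclose(f);
        return false;
    }
    gguf_get_meta_data(ctx, meta);
    fseek(f, 0, SEEK_SET);
    fwrite(meta, 1, meta_size, f);

    free(meta);
    fclose(f);
    return true;
}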