Use GGUF to store model weights #69

Merged · 10 commits · Mar 17, 2024

Changes from all commits
10 changes: 10 additions & 0 deletions .github/workflows/CI.yml
@@ -35,6 +35,16 @@ jobs:
         variant: sccache
         key: ${{ github.job }}-${{ matrix.os }}

+    - name: Install GGUF
+      shell: bash -e -x -l {0}
+      run: |
+        git clone https://github.com/ggerganov/llama.cpp
+        cd llama.cpp
+        git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+        cd gguf-py
+        pip install .
+        cd ../..
+
     - name: Build and run
       shell: bash -l {0}
       run: |
4 changes: 2 additions & 2 deletions ci/build.sh
@@ -15,8 +15,8 @@ cmake -DFASTGPT_BLAS=OpenBLAS .
 make
 time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

-rm model.dat
-curl -o model.dat -L https://huggingface.co/datasets/certik/fastGPT/resolve/main/model_fastgpt_124M_v1.dat
+rm model.gguf
+curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf
 time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

 rm gpt2
66 changes: 47 additions & 19 deletions create_model.py
@@ -39,6 +39,7 @@
 from shutil import copyfile

 import numpy as np
+import gguf
 import requests
 import tensorflow as tf
 from tqdm import tqdm
@@ -156,30 +157,57 @@ def convert(params, n_head, n_ctx, idx, decoder_txt,
     assert np.size(wte, 1) == n_embd

     model_type = 0xfa51697 # fastGPT
-    model_version = 1
-
-    # Save the model
-    f = open("model.dat", "w")
-    np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
-        len(idx),len(decoder_txt.encode("utf-8")),
-        len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
-    wte.tofile(f); wpe.tofile(f)
-    mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
-    mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
-    attn_w.tofile(f); attn_b.tofile(f)
-    attn_proj_w.tofile(f); attn_proj_b.tofile(f)
-    ln1_b.tofile(f); ln1_g.tofile(f)
-    ln2_b.tofile(f); ln2_g.tofile(f)
-    lnf_b.tofile(f); lnf_g.tofile(f)
-    idx.tofile(f)
-    f.write(decoder_txt)
-    vocab_idx.tofile(f)
-    f.write(vocab_txt)
-    byte_decoder.tofile(f)
+    model_version = 2
+    header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
+        len(idx),len(decoder_txt.encode("utf-8")),
+        len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32)
+
+    # Save the model to GGUF
+    def save_gguf(data_offset_name, data_offset_value):
+        g = gguf.GGUFWriter("model.gguf", None)
+        g.add_int32(data_offset_name, data_offset_value)
+        g.add_tensor("header", header)
+        g.add_tensor("wte", wte); g.add_tensor("wpe", wpe)
+        g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b)
+        g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b)
+        g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b)
+        g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b",
+            attn_proj_b)
+        g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g)
+        g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g)
+        g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g)
+        g.add_tensor("idx", idx)
+        g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"),
+            dtype=np.int8))
+        g.add_tensor("vocab_idx", vocab_idx)
+        g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"),
+            dtype=np.int8))
+        g.add_tensor("byte_decoder", byte_decoder)
+        g.write_header_to_file()
+        g.write_kv_data_to_file()
+        g.write_tensors_to_file()
+        g.close()
+
+    data_offset_name = "general.data_offset"
+    save_gguf(data_offset_name, 0)
+
+    g = gguf.GGUFReader("model.gguf")
+    data_offset = g.tensors[0].data_offset
+    # * .offset: the offset of the kv entry
+    # * 8: the u64 length of the key string
+    # * 4: the u32 type of the value
+    assert g.fields[data_offset_name].offset == 24
+    offset_offset = g.fields[data_offset_name].offset + 8 + \
+        len(data_offset_name) + 4
+    print("offset offset:", offset_offset)
+    print("data offset:", data_offset)
+
+    save_gguf(data_offset_name, data_offset)

     t2 = clock()
     print("Save time: ", t2-t1)


 def load_decoder(filename):
     D = json.load(open(filename))
     D2 = {v: k for k, v in D.items()}
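A note on the two-pass save in create_model.py: gguf-py only knows where the aligned tensor-data section begins once all metadata has been serialized, so save_gguf() runs twice. The first pass writes model.gguf with general.data_offset set to 0; the real offset of the first tensor is then read back with GGUFReader, and the second pass writes the file again with the correct value. The sketch below is a minimal check of the offset arithmetic; it assumes the model.gguf just produced above and the gguf-py package from the pinned llama.cpp commit, and mirrors only the calls the PR itself uses.

# Minimal sketch verifying the offsets used above; run after create_model.py
# so that model.gguf exists.
import gguf

key = "general.data_offset"
r = gguf.GGUFReader("model.gguf")

# The GGUF header is u8[4] magic + u32 version + u64 n_tensors + u64 n_kv
# = 24 bytes, so the first kv entry (our key) starts at byte 24.
assert r.fields[key].offset == 24

# Its i32 value sits after the u64 key length (8 bytes), the key itself
# (len("general.data_offset") == 19 bytes), and the u32 value type (4 bytes):
offset_offset = 24 + 8 + len(key) + 4
assert offset_offset == 55  # the same constant driver.f90 builds field by field

# After the second save_gguf() pass, the value stored there equals the start
# of the tensor-data section:
print("data section starts at byte", r.tensors[0].data_offset)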
57 changes: 51 additions & 6 deletions driver.f90
@@ -31,17 +31,55 @@ subroutine load_input(filename, input_txt, n_tokens_to_generate)
 close(u)
 end subroutine

+! Aligns the file position in `u` to a 32 byte boundary after `A` was read,
+! where `A` is an i32 array (4 bytes per element)
+subroutine align_i4(u, A)
+integer, intent(in) :: u
+integer, intent(in) :: A(..)
+integer :: n, alignment
+alignment = 32
+n = size(A)*4
+if (modulo(n, alignment) /= 0) then
+    call fseek(u, alignment-modulo(n,alignment), 1)
+end if
+end subroutine
+
+! The same for a character array (1 byte per element)
+subroutine align_str(u, A)
+integer, intent(in) :: u
+character, intent(in) :: A(:)
+integer :: n, alignment
+alignment = 32
+n = size(A)
+if (modulo(n, alignment) /= 0) then
+    call fseek(u, alignment-modulo(n,alignment), 1)
+end if
+end subroutine
+
 subroutine load_model(filename, m)
 character(*), intent(in) :: filename
 type(model_t), intent(out) :: m
 ! We use the following fastGPT model type number
 ! fastGPT (digits look similar to the letters they represent)
 ! 0xfa51697 = 262477463
+
+! We read the offset to the data section at this position; it is the first
+! variable in the metadata, named "general.data_offset", of type i32.
+integer, parameter :: offset_offset = &
+    ! header
+    4 + &  ! u8[4] magic
+    4 + &  ! u32 version
+    8 + &  ! u64 n_arrays
+    8 + &  ! u64 n_kv
+    ! kv
+    8 + &  ! u64 n_str
+    19 + & ! len("general.data_offset")
+    4      ! u32 type of value
 integer, parameter :: current_model_mark = 262477463
-integer, parameter :: current_model_version = 1
+integer, parameter :: current_model_version = 2
 integer :: model_mark
 integer :: u
+integer :: data_offset
 open(newunit=u, file=filename, form="unformatted", access="stream", status="old")
+call fseek(u, offset_offset, 0)
+read(u) data_offset
+call fseek(u, data_offset, 0)
 read(u) model_mark
 if (model_mark /= current_model_mark) then
     print *, "Found:", model_mark
@@ -56,6 +94,7 @@ subroutine load_model(filename, m)
 end if
 read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &
     m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder
+call fseek(u, 16, 1) ! Pad the 12 element i32 array to a 32 byte boundary
 allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &
     m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &
     m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &
@@ -75,9 +114,15 @@ subroutine load_model(filename, m)
     m%ln1_b, m%ln1_g, &
     m%ln2_b, m%ln2_g, &
     m%lnf_b, m%lnf_g, &
-    m%decoder_idx, m%decoder_txt, &
-    m%vocab_idx, m%vocab_txt, &
-    m%byte_encoder
+    m%decoder_idx
+call align_i4(u, m%decoder_idx)
+read(u) m%decoder_txt
+call align_str(u, m%decoder_txt)
+read(u) m%vocab_idx
+call align_i4(u, m%vocab_idx)
+read(u) m%vocab_txt
+call align_str(u, m%vocab_txt)
+read(u) m%byte_encoder
 close(u)
 end subroutine

@@ -92,7 +137,7 @@ subroutine gpt2_driver(input, output, m)
 ! Load the model
 print "(a)", "Loading the model..."
 call cpu_time(t1)
-call load_model("model.dat", m)
+call load_model("model.gguf", m)
 call cpu_time(t2)
 print "(a,f8.3,a,i2)", " done. Time:", t2-t1, "s, Model file version:", m%model_file_version
 print *
@@ -235,7 +280,7 @@ subroutine chat(inputs)
 type(model_t) :: m
 character(:), allocatable :: prompt, input, output
 integer :: i, n_prompts
-call load_model("model.dat", m)
+call load_model("model.gguf", m)
 prompt = "Your name is fastGPT and you are an AI bot. The user will ask you &
     &questions and you answer in a nice, truthful, short way." // LF // "&
     &User: What is the capital of Czechia?" // LF // "&
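The Fortran loader above never parses GGUF metadata generically; it relies on exactly two facts about the layout: the i32 value of general.data_offset sits at byte 55, and each array's payload is padded to a 32 byte boundary (which align_i4/align_str skip over, and which no padding follows when the size is already a multiple of 32). Below is a minimal Python sketch of the same raw read path, assuming a model.gguf written by create_model.py; it uses only struct, not gguf-py.

# Sketch of driver.f90's load path over raw bytes (no gguf-py needed).
import struct

def pad32(n, alignment=32):
    # GGUF pads each tensor's payload to a 32-byte boundary; nothing is
    # skipped when n is already a multiple of 32 (cf. align_i4/align_str).
    return (alignment - n % alignment) % alignment

with open("model.gguf", "rb") as f:
    # 4 (magic) + 4 (version) + 8 (n_tensors) + 8 (n_kv) = 24 header bytes,
    # then 8 (key length) + 19 (key "general.data_offset") + 4 (value type):
    f.seek(24 + 8 + 19 + 4)
    data_offset, = struct.unpack("<i", f.read(4))

    f.seek(data_offset)
    header = struct.unpack("<12i", f.read(48))  # the 12-element i32 header
    f.seek(pad32(48), 1)                        # skip 16 bytes, like fseek(u, 16, 1)
    print("model_version:", header[1])          # expect 2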
2 changes: 1 addition & 1 deletion tests/test_more_inputs.f90
@@ -9,7 +9,7 @@ program test_more_inputs
     25370, 254, 368, 83, 6557, 81, 11]
 integer, allocatable :: input(:), output(:)

-call load_model("model.dat", m)
+call load_model("model.gguf", m)

 call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
 print *