Use GGUF to store model weights #69

Merged · 10 commits · Mar 17, 2024
Changes from 6 commits
10 changes: 10 additions & 0 deletions .github/workflows/CI.yml
@@ -35,6 +35,16 @@ jobs:
variant: sccache
key: ${{ github.job }}-${{ matrix.os }}

- name: Install GGUF
shell: bash -e -x -l {0}
run: |
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
cd gguf-py
pip install .
cd ../..

- name: Build and run
shell: bash -l {0}
run: |
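The step above installs the gguf-py package from a pinned llama.cpp commit so CI stays reproducible. A quick sanity check that the install worked and exposes the classes this PR relies on (a hypothetical snippet, not part of the workflow):

import gguf  # installed from llama.cpp/gguf-py at the pinned commit

# GGUFWriter is what create_model.py uses below; GGUFReader can be
# used to inspect the resulting model.gguf.
print(gguf.GGUFWriter)
print(gguf.GGUFReader)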
4 changes: 2 additions & 2 deletions ci/build.sh
@@ -15,9 +15,9 @@ cmake -DFASTGPT_BLAS=OpenBLAS .
make
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm model.dat
rm model.gguf
curl -o model.dat -L https://huggingface.co/datasets/certik/fastGPT/resolve/main/model_fastgpt_124M_v1.dat
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
#time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm gpt2
python pt.py
46 changes: 28 additions & 18 deletions create_model.py
@@ -39,6 +39,7 @@
from shutil import copyfile

import numpy as np
import gguf
import requests
import tensorflow as tf
from tqdm import tqdm
@@ -157,29 +158,38 @@ def convert(params, n_head, n_ctx, idx, decoder_txt,

model_type = 0xfa51697 # fastGPT
model_version = 1

# Save the model
f = open("model.dat", "w")
np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
len(idx),len(decoder_txt.encode("utf-8")),
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
wte.tofile(f); wpe.tofile(f)
mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
attn_w.tofile(f); attn_b.tofile(f)
attn_proj_w.tofile(f); attn_proj_b.tofile(f)
ln1_b.tofile(f); ln1_g.tofile(f)
ln2_b.tofile(f); ln2_g.tofile(f)
lnf_b.tofile(f); lnf_g.tofile(f)
idx.tofile(f)
f.write(decoder_txt)
vocab_idx.tofile(f)
f.write(vocab_txt)
byte_decoder.tofile(f)
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32)

# Save the model to GGUF
g = gguf.GGUFWriter("model.gguf", None)
g.add_tensor("header", header)
g.add_tensor("wte", wte); g.add_tensor("wpe", wpe)
g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b)
g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b)
g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b)
g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b",
attn_proj_b)
g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g)
g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g)
g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g)
g.add_tensor("idx", idx)
g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"),
dtype=np.int8))
g.add_tensor("vocab_idx", vocab_idx)
g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"),
dtype=np.int8))
g.add_tensor("byte_decoder", byte_decoder)
g.write_header_to_file()
g.write_kv_data_to_file()
g.write_tensors_to_file()
g.close()

t2 = clock()
print("Save time: ", t2-t1)


def load_decoder(filename):
D = json.load(open(filename))
D2 = {v: k for k, v in D.items()}
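For context on the writer calls above: gguf-py's GGUFWriter only buffers tensors in add_tensor; nothing is serialized until write_header_to_file, write_kv_data_to_file, and write_tensors_to_file run, in that order, which is what produces the fixed layout the Fortran reader expects. A minimal read-back sketch to verify what create_model.py wrote (an illustration assuming gguf-py's GGUFReader API at the pinned commit; not part of this PR):

import numpy as np
from gguf import GGUFReader

reader = GGUFReader("model.gguf")
for t in reader.tensors:
    # t.data is a numpy view into the memory-mapped file
    print(t.name, t.data.dtype, t.data.shape)

# The 12-element i32 "header" tensor carries the fastGPT metadata:
# model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head, ...
header = next(t for t in reader.tensors if t.name == "header")
print(np.asarray(header.data, dtype=np.int32))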
42 changes: 37 additions & 5 deletions driver.f90
@@ -31,6 +31,27 @@ subroutine load_input(filename, input_txt, n_tokens_to_generate)
close(u)
end subroutine

! Aligns the file position in unit `u` to a 32-byte boundary after `A` has been read
subroutine align_i4(u, A)
integer, intent(in) :: u
integer, intent(in) :: A(..)
integer :: n, alignment
alignment = 32
n = size(A)*4
call fseek(u, alignment-modulo(n,alignment), 1)
end subroutine

subroutine align_str(u, A)
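! Like align_i4, but for character data; seeks only when padding is present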
integer, intent(in) :: u
character, intent(in) :: A(:)
integer :: n, alignment
alignment = 32
n = size(A)
if (modulo(n, alignment) /= 0) then
call fseek(u, alignment-modulo(n,alignment), 1)
end if
end subroutine

subroutine load_model(filename, m)
character(*), intent(in) :: filename
type(model_t), intent(out) :: m
Expand All @@ -41,7 +62,11 @@ subroutine load_model(filename, m)
integer, parameter :: current_model_version = 1
integer :: model_mark
integer :: u
integer :: data_offset
open(newunit=u, file=filename, form="unformatted", access="stream", status="old")
! TODO: We need an easy way to extract this data offset from the gguf file (a possible approach is sketched after this diff)
data_offset = 1056
call fseek(u, data_offset, 0)
read(u) model_mark
if (model_mark /= current_model_mark) then
print *, "Found:", model_mark
@@ -56,6 +81,7 @@
end if
read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &
m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder
call fseek(u, 16, 1) ! Skip the padding after the 12-element i32 header (48 bytes -> 16 bytes to the next 32-byte boundary)
allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &
m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &
m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &
@@ -75,9 +101,15 @@
m%ln1_b, m%ln1_g, &
m%ln2_b, m%ln2_g, &
m%lnf_b, m%lnf_g, &
m%decoder_idx, m%decoder_txt, &
m%vocab_idx, m%vocab_txt, &
m%byte_encoder
m%decoder_idx
call align_i4(u, m%decoder_idx)
read(u) m%decoder_txt
call align_str(u, m%decoder_txt)
read(u) m%vocab_idx
call align_i4(u, m%vocab_idx)
read(u) m%vocab_txt
call align_str(u, m%vocab_txt)
read(u) m%byte_encoder
close(u)
end subroutine

Expand All @@ -92,7 +124,7 @@ subroutine gpt2_driver(input, output, m)
! Load the model
print "(a)", "Loading the model..."
call cpu_time(t1)
call load_model("model.dat", m)
call load_model("model.gguf", m)
call cpu_time(t2)
print "(a,f8.3,a,i2)", " done. Time:", t2-t1, "s, Model file version:", m%model_file_version
print *
@@ -235,7 +267,7 @@ subroutine chat(inputs)
type(model_t) :: m
character(:), allocatable :: prompt, input, output
integer :: i, n_prompts
call load_model("model.dat", m)
call load_model("model.gguf", m)
prompt = "Your name is fastGPT and you are an AI bot. The user will ask you &
&questions and you answer in a nice, truthful, short way." // LF // "&
&User: What is the capital of Czechia?" // LF // "&
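Two notes on the bookkeeping this diff adds to load_model. First, GGUF aligns every tensor's data to a 32-byte boundary, while the Fortran reader walks the data section as one raw stream, so align_i4 and align_str exist to skip the padding GGUF inserted after each blob. Second, data_offset = 1056 is where the data section starts in this particular file (the GGUF header, key/value data, and tensor-info table, rounded up to the alignment); it is hard-coded because the reader never parses the GGUF metadata itself. A hypothetical helper showing how both numbers could be derived with gguf-py instead (not part of this PR; assumes ReaderTensor exposes an absolute data_offset):

from gguf import GGUFReader

ALIGNMENT = 32  # GGUF default; matches `alignment = 32` in driver.f90

def padding_after(n_bytes: int) -> int:
    # Padding GGUF inserts after a blob of n_bytes, i.e. the distance
    # align_i4/align_str skip with fseek.
    return -n_bytes % ALIGNMENT

reader = GGUFReader("model.gguf")

# Absolute file offset of the first tensor's data; for this model it
# should print 1056, the value hard-coded in load_model above.
first = min(reader.tensors, key=lambda t: t.data_offset)
print(first.data_offset)

print(padding_after(12 * 4))  # header tensor: 48 bytes -> skip 16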
2 changes: 1 addition & 1 deletion tests/test_more_inputs.f90
@@ -9,7 +9,7 @@ program test_more_inputs
25370, 254, 368, 83, 6557, 81, 11]
integer, allocatable :: input(:), output(:)

call load_model("model.dat", m)
call load_model("model.gguf", m)

call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
print *