Use GGUF to store model weights #69

Merged · 10 commits · Mar 17, 2024

Changes from all commits
10 changes: 10 additions & 0 deletions .github/workflows/CI.yml
@@ -35,6 +35,16 @@ jobs:
         variant: sccache
         key: ${{ github.job }}-${{ matrix.os }}

+    - name: Install GGUF
+      shell: bash -e -x -l {0}
+      run: |
+        git clone https://github.com/ggerganov/llama.cpp
+        cd llama.cpp
+        git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+        cd gguf-py
+        pip install .
+        cd ../..
+
     - name: Build and run
       shell: bash -l {0}
       run: |
4 changes: 2 additions & 2 deletions ci/build.sh
@@ -15,8 +15,8 @@ cmake -DFASTGPT_BLAS=OpenBLAS .
 make
 time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

-rm model.dat
-curl -o model.dat -L https://huggingface.co/datasets/certik/fastGPT/resolve/main/model_fastgpt_124M_v1.dat
+rm model.gguf
+curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf
 time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

 rm gpt2
66 changes: 47 additions & 19 deletions create_model.py
@@ -39,6 +39,7 @@
 from shutil import copyfile

 import numpy as np
+import gguf
 import requests
 import tensorflow as tf
 from tqdm import tqdm
@@ -156,30 +157,57 @@ def convert(params, n_head, n_ctx, idx, decoder_txt,
     assert np.size(wte, 1) == n_embd

     model_type = 0xfa51697 # fastGPT
-    model_version = 1
-
-    # Save the model
-    f = open("model.dat", "w")
-    np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
-        len(idx),len(decoder_txt.encode("utf-8")),
-        len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
-    wte.tofile(f); wpe.tofile(f)
-    mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
-    mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
-    attn_w.tofile(f); attn_b.tofile(f)
-    attn_proj_w.tofile(f); attn_proj_b.tofile(f)
-    ln1_b.tofile(f); ln1_g.tofile(f)
-    ln2_b.tofile(f); ln2_g.tofile(f)
-    lnf_b.tofile(f); lnf_g.tofile(f)
-    idx.tofile(f)
-    f.write(decoder_txt)
-    vocab_idx.tofile(f)
-    f.write(vocab_txt)
-    byte_decoder.tofile(f)
+    model_version = 2
+    header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
+        len(idx),len(decoder_txt.encode("utf-8")),
+        len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32)
+
+    # Save the model to GGUF
+    def save_gguf(data_offset_name, data_offset_value):
+        g = gguf.GGUFWriter("model.gguf", None)
+        g.add_int32(data_offset_name, data_offset_value)
+        g.add_tensor("header", header)
+        g.add_tensor("wte", wte); g.add_tensor("wpe", wpe)
+        g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b)
+        g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b)
+        g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b)
+        g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b",
+            attn_proj_b)
+        g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g)
+        g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g)
+        g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g)
+        g.add_tensor("idx", idx)
+        g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"),
+            dtype=np.int8))
+        g.add_tensor("vocab_idx", vocab_idx)
+        g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"),
+            dtype=np.int8))
+        g.add_tensor("byte_decoder", byte_decoder)
+        g.write_header_to_file()
+        g.write_kv_data_to_file()
+        g.write_tensors_to_file()
+        g.close()
+
+    data_offset_name = "general.data_offset"
+    save_gguf(data_offset_name, 0)
+
+    g = gguf.GGUFReader("model.gguf")
+    data_offset = g.tensors[0].data_offset
+    # * .offset: the offset of the kv entry
+    # * 8: the u64 length of the key string
+    # * 4: the u32 type of the value
+    assert g.fields[data_offset_name].offset == 24
+    offset_offset = g.fields[data_offset_name].offset + 8 + \
+        len(data_offset_name) + 4
+    print("offset offset:", offset_offset)
+    print("data offset:", data_offset)
+
+    save_gguf(data_offset_name, data_offset)

     t2 = clock()
     print("Save time: ", t2-t1)


 def load_decoder(filename):
     D = json.load(open(filename))
     D2 = {v: k for k, v in D.items()}
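A note on the two-pass save in create_model.py: gguf-py only knows where the aligned tensor-data section begins once all metadata has been serialized, so save_gguf() runs twice. The first pass writes model.gguf with general.data_offset set to 0; the real offset of the first tensor is then read back with GGUFReader, and the second pass writes the file again with the correct value. The sketch below is a minimal check of the offset arithmetic; it assumes the model.gguf just produced above and the gguf-py package from the pinned llama.cpp commit, and mirrors only the calls the PR itself uses.

# Minimal sketch verifying the offsets used above; run after create_model.py
# so that model.gguf exists.
import gguf

key = "general.data_offset"
r = gguf.GGUFReader("model.gguf")

# The GGUF header is u8[4] magic + u32 version + u64 n_tensors + u64 n_kv
# = 24 bytes, so the first kv entry (our key) starts at byte 24.
assert r.fields[key].offset == 24

# Its i32 value sits after the u64 key length (8 bytes), the key itself
# (len("general.data_offset") == 19 bytes), and the u32 value type (4 bytes):
offset_offset = 24 + 8 + len(key) + 4
assert offset_offset == 55  # the same constant driver.f90 builds field by field

# After the second save_gguf() pass, the value stored there equals the start
# of the tensor-data section:
print("data section starts at byte", r.tensors[0].data_offset)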
57 changes: 51 additions & 6 deletions driver.f90
@@ -31,17 +31,55 @@ subroutine load_input(filename, input_txt, n_tokens_to_generate)
 close(u)
 end subroutine

+! Aligns the file position in `u` to a 32 byte boundary after `A` was read,
+! where `A` is an i32 array (4 bytes per element)
+subroutine align_i4(u, A)
+integer, intent(in) :: u
+integer, intent(in) :: A(..)
+integer :: n, alignment
+alignment = 32
+n = size(A)*4
+if (modulo(n, alignment) /= 0) then
+    call fseek(u, alignment-modulo(n,alignment), 1)
+end if
+end subroutine
+
+! The same for a character array (1 byte per element)
+subroutine align_str(u, A)
+integer, intent(in) :: u
+character, intent(in) :: A(:)
+integer :: n, alignment
+alignment = 32
+n = size(A)
+if (modulo(n, alignment) /= 0) then
+    call fseek(u, alignment-modulo(n,alignment), 1)
+end if
+end subroutine
+
 subroutine load_model(filename, m)
 character(*), intent(in) :: filename
 type(model_t), intent(out) :: m
 ! We use the following fastGPT model type number
 ! fastGPT (digits look similar to the letters they represent)
 ! 0xfa51697 = 262477463
+
+! We read the offset to the data section at this position; it is the first
+! variable in the metadata, named "general.data_offset", of type i32.
+integer, parameter :: offset_offset = &
+    ! header
+    4 + &  ! u8[4] magic
+    4 + &  ! u32 version
+    8 + &  ! u64 n_arrays
+    8 + &  ! u64 n_kv
+    ! kv
+    8 + &  ! u64 n_str
+    19 + & ! len("general.data_offset")
+    4      ! u32 type of value
 integer, parameter :: current_model_mark = 262477463
-integer, parameter :: current_model_version = 1
+integer, parameter :: current_model_version = 2
 integer :: model_mark
 integer :: u
+integer :: data_offset
 open(newunit=u, file=filename, form="unformatted", access="stream", status="old")
+call fseek(u, offset_offset, 0)
+read(u) data_offset
+call fseek(u, data_offset, 0)
 read(u) model_mark
 if (model_mark /= current_model_mark) then
     print *, "Found:", model_mark
@@ -56,6 +94,7 @@ subroutine load_model(filename, m)
 end if
 read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &
     m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder
+call fseek(u, 16, 1) ! Pad the 12 element i32 array to a 32 byte boundary
 allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &
     m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &
     m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &
@@ -75,9 +114,15 @@ subroutine load_model(filename, m)
     m%ln1_b, m%ln1_g, &
     m%ln2_b, m%ln2_g, &
     m%lnf_b, m%lnf_g, &
-    m%decoder_idx, m%decoder_txt, &
-    m%vocab_idx, m%vocab_txt, &
-    m%byte_encoder
+    m%decoder_idx
+call align_i4(u, m%decoder_idx)
+read(u) m%decoder_txt
+call align_str(u, m%decoder_txt)
+read(u) m%vocab_idx
+call align_i4(u, m%vocab_idx)
+read(u) m%vocab_txt
+call align_str(u, m%vocab_txt)
+read(u) m%byte_encoder
 close(u)
 end subroutine

@@ -92,7 +137,7 @@ subroutine gpt2_driver(input, output, m)
 ! Load the model
 print "(a)", "Loading the model..."
 call cpu_time(t1)
-call load_model("model.dat", m)
+call load_model("model.gguf", m)
 call cpu_time(t2)
 print "(a,f8.3,a,i2)", " done. Time:", t2-t1, "s, Model file version:", m%model_file_version
 print *
@@ -235,7 +280,7 @@ subroutine chat(inputs)
 type(model_t) :: m
 character(:), allocatable :: prompt, input, output
 integer :: i, n_prompts
-call load_model("model.dat", m)
+call load_model("model.gguf", m)
 prompt = "Your name is fastGPT and you are an AI bot. The user will ask you &
     &questions and you answer in a nice, truthful, short way." // LF // "&
     &User: What is the capital of Czechia?" // LF // "&
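The Fortran loader above never parses GGUF metadata generically; it relies on exactly two facts about the layout: the i32 value of general.data_offset sits at byte 55, and each array's payload is padded to a 32 byte boundary (which align_i4/align_str skip over, and which no padding follows when the size is already a multiple of 32). Below is a minimal Python sketch of the same raw read path, assuming a model.gguf written by create_model.py; it uses only struct, not gguf-py.

# Sketch of driver.f90's load path over raw bytes (no gguf-py needed).
import struct

def pad32(n, alignment=32):
    # GGUF pads each tensor's payload to a 32-byte boundary; nothing is
    # skipped when n is already a multiple of 32 (cf. align_i4/align_str).
    return (alignment - n % alignment) % alignment

with open("model.gguf", "rb") as f:
    # 4 (magic) + 4 (version) + 8 (n_tensors) + 8 (n_kv) = 24 header bytes,
    # then 8 (key length) + 19 (key "general.data_offset") + 4 (value type):
    f.seek(24 + 8 + 19 + 4)
    data_offset, = struct.unpack("<i", f.read(4))

    f.seek(data_offset)
    header = struct.unpack("<12i", f.read(48))  # the 12-element i32 header
    f.seek(pad32(48), 1)                        # skip 16 bytes, like fseek(u, 16, 1)
    print("model_version:", header[1])          # expect 2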
2 changes: 1 addition & 1 deletion tests/test_more_inputs.f90
@@ -9,7 +9,7 @@ program test_more_inputs
     25370, 254, 368, 83, 6557, 81, 11]
 integer, allocatable :: input(:), output(:)

-call load_model("model.dat", m)
+call load_model("model.gguf", m)

 call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
 print *