Use GGUF to store model weights #69

Merged · 10 commits · Mar 17, 2024
Changes from 6 commits
10 changes: 10 additions & 0 deletions .github/workflows/CI.yml
@@ -35,6 +35,16 @@ jobs:
variant: sccache
key: ${{ github.job }}-${{ matrix.os }}

- name: Install GGUF
shell: bash -e -x -l {0}
run: |
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
cd gguf-py
pip install .
cd ../..

- name: Build and run
shell: bash -l {0}
run: |
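The step above installs the gguf-py package from a pinned llama.cpp commit so CI stays reproducible. A quick sanity check that the install worked and exposes the classes this PR relies on (a hypothetical snippet, not part of the workflow):

import gguf  # installed from llama.cpp/gguf-py at the pinned commit

# GGUFWriter is what create_model.py uses below; GGUFReader can be
# used to inspect the resulting model.gguf.
print(gguf.GGUFWriter)
print(gguf.GGUFReader)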
4 changes: 2 additions & 2 deletions ci/build.sh
@@ -15,9 +15,9 @@ cmake -DFASTGPT_BLAS=OpenBLAS .
make
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm model.dat
rm model.gguf
curl -o model.dat -L https://huggingface.co/datasets/certik/fastGPT/resolve/main/model_fastgpt_124M_v1.dat
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
#time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm gpt2
python pt.py
46 changes: 28 additions & 18 deletions create_model.py
@@ -39,6 +39,7 @@
from shutil import copyfile

import numpy as np
import gguf
import requests
import tensorflow as tf
from tqdm import tqdm
@@ -157,29 +158,38 @@ def convert(params, n_head, n_ctx, idx, decoder_txt,

model_type = 0xfa51697 # fastGPT
model_version = 1

# Save the model
f = open("model.dat", "w")
np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
len(idx),len(decoder_txt.encode("utf-8")),
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
wte.tofile(f); wpe.tofile(f)
mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
attn_w.tofile(f); attn_b.tofile(f)
attn_proj_w.tofile(f); attn_proj_b.tofile(f)
ln1_b.tofile(f); ln1_g.tofile(f)
ln2_b.tofile(f); ln2_g.tofile(f)
lnf_b.tofile(f); lnf_g.tofile(f)
idx.tofile(f)
f.write(decoder_txt)
vocab_idx.tofile(f)
f.write(vocab_txt)
byte_decoder.tofile(f)
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32)

# Save the model to GGUF
g = gguf.GGUFWriter("model.gguf", None)
g.add_tensor("header", header)
g.add_tensor("wte", wte); g.add_tensor("wpe", wpe)
g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b)
g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b)
g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b)
g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b",
attn_proj_b)
g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g)
g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g)
g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g)
g.add_tensor("idx", idx)
g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"),
dtype=np.int8))
g.add_tensor("vocab_idx", vocab_idx)
g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"),
dtype=np.int8))
g.add_tensor("byte_decoder", byte_decoder)
g.write_header_to_file()
g.write_kv_data_to_file()
g.write_tensors_to_file()
g.close()

t2 = clock()
print("Save time: ", t2-t1)


def load_decoder(filename):
D = json.load(open(filename))
D2 = {v: k for k, v in D.items()}
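For context on the writer calls above: gguf-py's GGUFWriter only buffers tensors in add_tensor; nothing is serialized until write_header_to_file, write_kv_data_to_file, and write_tensors_to_file run, in that order, which is what produces the fixed layout the Fortran reader expects. A minimal read-back sketch to verify what create_model.py wrote (an illustration assuming gguf-py's GGUFReader API at the pinned commit; not part of this PR):

import numpy as np
from gguf import GGUFReader

reader = GGUFReader("model.gguf")
for t in reader.tensors:
    # t.data is a numpy view into the memory-mapped file
    print(t.name, t.data.dtype, t.data.shape)

# The 12-element i32 "header" tensor carries the fastGPT metadata:
# model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head, ...
header = next(t for t in reader.tensors if t.name == "header")
print(np.asarray(header.data, dtype=np.int32))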
42 changes: 37 additions & 5 deletions driver.f90
@@ -31,6 +31,27 @@ subroutine load_input(filename, input_txt, n_tokens_to_generate)
close(u)
end subroutine

! Aligns the file position in unit `u` to a 32-byte boundary after `A` has been read
subroutine align_i4(u, A)
integer, intent(in) :: u
integer, intent(in) :: A(..)
integer :: n, alignment
alignment = 32
n = size(A)*4
call fseek(u, alignment-modulo(n,alignment), 1)
end subroutine

subroutine align_str(u, A)
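! Like align_i4, but for character data; seeks only when padding is present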
integer, intent(in) :: u
character, intent(in) :: A(:)
integer :: n, alignment
alignment = 32
n = size(A)
if (modulo(n, alignment) /= 0) then
call fseek(u, alignment-modulo(n,alignment), 1)
end if
end subroutine

subroutine load_model(filename, m)
character(*), intent(in) :: filename
type(model_t), intent(out) :: m
Expand All @@ -41,7 +62,11 @@ subroutine load_model(filename, m)
integer, parameter :: current_model_version = 1
integer :: model_mark
integer :: u
integer :: data_offset
open(newunit=u, file=filename, form="unformatted", access="stream", status="old")
! TODO: We need an easy way to extract this data offset from the gguf file (a possible approach is sketched after this diff)
data_offset = 1056
call fseek(u, data_offset, 0)
read(u) model_mark
if (model_mark /= current_model_mark) then
print *, "Found:", model_mark
@@ -56,6 +81,7 @@
end if
read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &
m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder
call fseek(u, 16, 1) ! Skip the padding after the 12-element i32 header (48 bytes -> 16 bytes to the next 32-byte boundary)
allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &
m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &
m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &
@@ -75,9 +101,15 @@
m%ln1_b, m%ln1_g, &
m%ln2_b, m%ln2_g, &
m%lnf_b, m%lnf_g, &
m%decoder_idx, m%decoder_txt, &
m%vocab_idx, m%vocab_txt, &
m%byte_encoder
m%decoder_idx
call align_i4(u, m%decoder_idx)
read(u) m%decoder_txt
call align_str(u, m%decoder_txt)
read(u) m%vocab_idx
call align_i4(u, m%vocab_idx)
read(u) m%vocab_txt
call align_str(u, m%vocab_txt)
read(u) m%byte_encoder
close(u)
end subroutine

Expand All @@ -92,7 +124,7 @@ subroutine gpt2_driver(input, output, m)
! Load the model
print "(a)", "Loading the model..."
call cpu_time(t1)
call load_model("model.dat", m)
call load_model("model.gguf", m)
call cpu_time(t2)
print "(a,f8.3,a,i2)", " done. Time:", t2-t1, "s, Model file version:", m%model_file_version
print *
@@ -235,7 +267,7 @@ subroutine chat(inputs)
type(model_t) :: m
character(:), allocatable :: prompt, input, output
integer :: i, n_prompts
call load_model("model.dat", m)
call load_model("model.gguf", m)
prompt = "Your name is fastGPT and you are an AI bot. The user will ask you &
&questions and you answer in a nice, truthful, short way." // LF // "&
&User: What is the capital of Czechia?" // LF // "&
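Two notes on the bookkeeping this diff adds to load_model. First, GGUF aligns every tensor's data to a 32-byte boundary, while the Fortran reader walks the data section as one raw stream, so align_i4 and align_str exist to skip the padding GGUF inserted after each blob. Second, data_offset = 1056 is where the data section starts in this particular file (the GGUF header, key/value data, and tensor-info table, rounded up to the alignment); it is hard-coded because the reader never parses the GGUF metadata itself. A hypothetical helper showing how both numbers could be derived with gguf-py instead (not part of this PR; assumes ReaderTensor exposes an absolute data_offset):

from gguf import GGUFReader

ALIGNMENT = 32  # GGUF default; matches `alignment = 32` in driver.f90

def padding_after(n_bytes: int) -> int:
    # Padding GGUF inserts after a blob of n_bytes, i.e. the distance
    # align_i4/align_str skip with fseek.
    return -n_bytes % ALIGNMENT

reader = GGUFReader("model.gguf")

# Absolute file offset of the first tensor's data; for this model it
# should print 1056, the value hard-coded in load_model above.
first = min(reader.tensors, key=lambda t: t.data_offset)
print(first.data_offset)

print(padding_after(12 * 4))  # header tensor: 48 bytes -> skip 16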
2 changes: 1 addition & 1 deletion tests/test_more_inputs.f90
@@ -9,7 +9,7 @@ program test_more_inputs
25370, 254, 368, 83, 6557, 81, 11]
integer, allocatable :: input(:), output(:)

call load_model("model.dat", m)
call load_model("model.gguf", m)

call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
print *