Commit
Update llama.cpp
abetlen committed Sep 9, 2023
1 parent a7fb07a commit d3f6321
Showing 2 changed files with 99 additions and 72 deletions.
169 changes: 98 additions & 71 deletions llama_cpp/llama_cpp.py
@@ -506,7 +506,7 @@ def llama_mlock_supported() -> bool:
_lib.llama_mlock_supported.restype = c_bool


# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
# LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
def llama_n_vocab(ctx: llama_context_p) -> int:
return _lib.llama_n_vocab(ctx)

@@ -515,7 +515,7 @@ def llama_n_vocab(ctx: llama_context_p) -> int:
_lib.llama_n_vocab.restype = c_int


# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
def llama_n_ctx(ctx: llama_context_p) -> int:
return _lib.llama_n_ctx(ctx)

@@ -524,7 +524,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
_lib.llama_n_ctx.restype = c_int


# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
# LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
def llama_n_ctx_train(ctx: llama_context_p) -> int:
return _lib.llama_n_ctx_train(ctx)


_lib.llama_n_ctx_train.argtypes = [llama_context_p]
_lib.llama_n_ctx_train.restype = c_int


# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
def llama_n_embd(ctx: llama_context_p) -> int:
return _lib.llama_n_embd(ctx)

@@ -542,7 +551,7 @@ def llama_vocab_type(ctx: llama_context_p) -> int:
_lib.llama_vocab_type.restype = c_int


# LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
# LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
def llama_model_n_vocab(model: llama_model_p) -> int:
return _lib.llama_model_n_vocab(model)

@@ -551,7 +560,7 @@ def llama_model_n_vocab(model: llama_model_p) -> int:
_lib.llama_model_n_vocab.restype = c_int


# LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
# LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
def llama_model_n_ctx(model: llama_model_p) -> int:
return _lib.llama_model_n_ctx(model)

@@ -560,7 +569,16 @@ def llama_model_n_ctx(model: llama_model_p) -> int:
_lib.llama_model_n_ctx.restype = c_int


# LLAMA_API int llama_model_n_embd (const struct llama_model * model);
# LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
def llama_model_n_ctx_train(model: llama_model_p) -> int:
return _lib.llama_model_n_ctx_train(model)


_lib.llama_model_n_ctx_train.argtypes = [llama_model_p]
_lib.llama_model_n_ctx_train.restype = c_int


# LLAMA_API int llama_model_n_embd (const struct llama_model * model);
def llama_model_n_embd(model: llama_model_p) -> int:
return _lib.llama_model_n_embd(model)
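The two n_ctx_train bindings added in this hunk expose the context length the model was trained with, alongside the existing runtime n_ctx. A minimal usage sketch, not part of the diff, assuming ctx is a llama_context_p obtained through the existing low-level API:

from llama_cpp import llama_cpp

# Compare the runtime context window with the model's training context window.
n_ctx = llama_cpp.llama_n_ctx(ctx)
n_ctx_train = llama_cpp.llama_n_ctx_train(ctx)
if n_ctx > n_ctx_train:
    print(f"warning: n_ctx ({n_ctx}) exceeds training context ({n_ctx_train})")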

@@ -1046,74 +1064,14 @@ def llama_grammar_free(grammar: llama_grammar_p):
_lib.llama_grammar_free.argtypes = [llama_grammar_p]
_lib.llama_grammar_free.restype = None

# //
# // Beam search
# //


# struct llama_beam_view {
# const llama_token * tokens;
# size_t n_tokens;
# float p; // Cumulative beam probability (renormalized relative to all beams)
# bool eob; // Callback should set this to true when a beam is at end-of-beam.
# };
class llama_beam_view(ctypes.Structure):
_fields_ = [
("tokens", llama_token_p),
("n_tokens", c_size_t),
("p", c_float),
("eob", c_bool),
]

# LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
def llama_grammar_copy(grammar: llama_grammar_p) -> llama_grammar_p:
return _lib.llama_grammar_copy(grammar)

# // Passed to beam_search_callback function.
# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
# // These pointers are valid only during the synchronous callback, so should not be saved.
# struct llama_beams_state {
# struct llama_beam_view * beam_views;
# size_t n_beams; // Number of elements in beam_views[].
# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
# bool last_call; // True iff this is the last callback invocation.
# };
class llama_beams_state(ctypes.Structure):
_fields_ = [
("beam_views", POINTER(llama_beam_view)),
("n_beams", c_size_t),
("common_prefix_length", c_size_t),
("last_call", c_bool),
]


# // Type of pointer to the beam_search_callback function.
# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)


# /// @details Deterministically returns entire sentence constructed by a beam search.
# /// @param ctx Pointer to the llama_context.
# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
# /// @param callback_data A pointer that is simply passed back to callback.
# /// @param n_beams Number of beams to use.
# /// @param n_past Number of tokens already evaluated.
# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
# /// @param n_threads Number of threads as passed to llama_eval().
# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
def llama_beam_search(
ctx: llama_context_p,
callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore
callback_data: c_void_p,
n_beams: c_size_t,
n_past: c_int,
n_predict: c_int,
n_threads: c_int,
):
return _lib.llama_beam_search(
ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
)

_lib.llama_grammar_copy.argtypes = [llama_grammar_p]
_lib.llama_grammar_copy.restype = llama_grammar_p
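llama_grammar_copy duplicates a grammar's current parse state. A hedged sketch of how it might be used, continuing with the module import above and assuming grammar was created with llama_grammar_init:

# Fork the grammar state so a second sequence can be constrained independently.
grammar2 = llama_cpp.llama_grammar_copy(grammar)
# ... accept tokens on grammar and grammar2 separately ...
llama_cpp.llama_grammar_free(grammar2)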

# //
# // Sampling functions
@@ -1436,6 +1394,74 @@ def llama_grammar_accept_token(
llama_token,
]
_lib.llama_grammar_accept_token.restype = None
# //
# // Beam search
# //


# struct llama_beam_view {
# const llama_token * tokens;
# size_t n_tokens;
# float p; // Cumulative beam probability (renormalized relative to all beams)
# bool eob; // Callback should set this to true when a beam is at end-of-beam.
# };
class llama_beam_view(ctypes.Structure):
_fields_ = [
("tokens", llama_token_p),
("n_tokens", c_size_t),
("p", c_float),
("eob", c_bool),
]


# // Passed to beam_search_callback function.
# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
# // These pointers are valid only during the synchronous callback, so should not be saved.
# struct llama_beams_state {
# struct llama_beam_view * beam_views;
# size_t n_beams; // Number of elements in beam_views[].
# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
# bool last_call; // True iff this is the last callback invocation.
# };
class llama_beams_state(ctypes.Structure):
_fields_ = [
("beam_views", POINTER(llama_beam_view)),
("n_beams", c_size_t),
("common_prefix_length", c_size_t),
("last_call", c_bool),
]


# // Type of pointer to the beam_search_callback function.
# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)


# /// @details Deterministically returns entire sentence constructed by a beam search.
# /// @param ctx Pointer to the llama_context.
# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
# /// @param callback_data A pointer that is simply passed back to callback.
# /// @param n_beams Number of beams to use.
# /// @param n_past Number of tokens already evaluated.
# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
# /// @param n_threads Number of threads as passed to llama_eval().
# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
def llama_beam_search(
ctx: llama_context_p,
callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore
callback_data: c_void_p,
n_beams: c_size_t,
n_past: c_int,
n_predict: c_int,
n_threads: c_int,
):
return _lib.llama_beam_search(
ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
)
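A hedged sketch of driving the beam-search binding from Python; ctx and n_past are assumed to exist, the beam/token/thread counts are illustrative, and the callback only inspects the beams:

from llama_cpp import llama_cpp

@llama_cpp.llama_beam_search_callback_fn_t
def show_beams(callback_data, beams_state):
    # beams_state is a llama_beams_state passed by value; report each beam's
    # length and cumulative probability. A real callback would also mark
    # finished beams by setting eob through beams_state.beam_views.
    for i in range(beams_state.n_beams):
        view = beams_state.beam_views[i]
        print(f"beam {i}: n_tokens={view.n_tokens} p={view.p:.4f}")

llama_cpp.llama_beam_search(
    ctx,         # existing llama_context_p (assumed)
    show_beams,  # invoked once per iteration of the beam-search loop
    None,        # callback_data, passed back to the callback unchanged
    4,           # n_beams
    n_past,      # number of tokens already evaluated (assumed)
    64,          # n_predict
    4,           # n_threads
)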


# Performance information

@@ -1494,6 +1520,7 @@ def llama_log_set(
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
return _lib.llama_dump_timing_info_yaml(stream, ctx)


_lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
_lib.llama_dump_timing_info_yaml.restype = None

2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 54 files
+5 −0 .clang-tidy
+1 −1 .devops/full-cuda.Dockerfile
+1 −1 .devops/main-cuda.Dockerfile
+3 −0 .editorconfig
+0 −1 .github/workflows/build.yml
+36 −0 .github/workflows/code-coverage.yml
+23 −14 .gitignore
+91 −25 CMakeLists.txt
+195 −124 Makefile
+35 −4 Package.swift
+49 −48 README.md
+14 −0 codecov.yml
+244 −94 common/common.cpp
+40 −1 common/common.h
+1 −0 common/grammar-parser.cpp
+14 −14 common/log.h
+4 −4 convert-falcon-hf-to-gguf.py
+133 −35 convert-llama-ggml-to-gguf.py
+35 −12 convert.py
+1 −0 examples/CMakeLists.txt
+0 −4 examples/beam-search/beam-search.cpp
+4 −4 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+1 −6 examples/embd-input/embd-input-lib.cpp
+7 −6 examples/embedding/embedding.cpp
+21 −21 examples/gguf/gguf.cpp
+34 −34 examples/gptneox-wip/falcon-main.cpp
+39 −38 examples/gptneox-wip/gptneox-main.cpp
+26 −21 examples/llama-bench/llama-bench.cpp
+38 −143 examples/main/main.cpp
+8 −7 examples/perplexity/perplexity.cpp
+1 −1 examples/quantize-stats/quantize-stats.cpp
+3 −4 examples/quantize/quantize.cpp
+2 −2 examples/save-load-state/save-load-state.cpp
+2,005 −1,939 examples/server/index.html.hpp
+37 −4 examples/server/public/index.html
+54 −44 examples/server/server.cpp
+0 −4 examples/simple/simple.cpp
+8 −0 examples/speculative/CMakeLists.txt
+288 −0 examples/speculative/speculative.cpp
+10 −36 examples/train-text-from-scratch/train-text-from-scratch.cpp
+4 −0 flake.nix
+101 −24 ggml-alloc.c
+97 −41 ggml-cuda.cu
+56 −15 ggml-metal.m
+232 −136 ggml-metal.metal
+7 −7 ggml-opencl.cpp
+31 −55 ggml.c
+2 −2 gguf-py/gguf/gguf.py
+1 −1 gguf-py/pyproject.toml
+34 −0 grammars/json_arr.gbnf
+43 −6 k_quants.c
+135 −67 llama.cpp
+10 −6 llama.h
+1 −1 tests/test-quantize-perf.cpp
