Commit
Update llama.cpp
abetlen committed Sep 9, 2023
1 parent a7fb07a commit d3f6321
Showing 2 changed files with 99 additions and 72 deletions.
169 changes: 98 additions & 71 deletions llama_cpp/llama_cpp.py
@@ -506,7 +506,7 @@ def llama_mlock_supported() -> bool:
_lib.llama_mlock_supported.restype = c_bool


# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
# LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
def llama_n_vocab(ctx: llama_context_p) -> int:
return _lib.llama_n_vocab(ctx)

@@ -515,7 +515,7 @@ def llama_n_vocab(ctx: llama_context_p) -> int:
_lib.llama_n_vocab.restype = c_int


# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
def llama_n_ctx(ctx: llama_context_p) -> int:
return _lib.llama_n_ctx(ctx)

@@ -524,7 +524,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
_lib.llama_n_ctx.restype = c_int


# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
# LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
def llama_n_ctx_train(ctx: llama_context_p) -> int:
return _lib.llama_n_ctx_train(ctx)


_lib.llama_n_ctx_train.argtypes = [llama_context_p]
_lib.llama_n_ctx_train.restype = c_int


# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
def llama_n_embd(ctx: llama_context_p) -> int:
return _lib.llama_n_embd(ctx)

@@ -542,7 +551,7 @@ def llama_vocab_type(ctx: llama_context_p) -> int:
_lib.llama_vocab_type.restype = c_int


# LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
# LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
def llama_model_n_vocab(model: llama_model_p) -> int:
return _lib.llama_model_n_vocab(model)

@@ -551,7 +560,7 @@ def llama_model_n_vocab(model: llama_model_p) -> int:
_lib.llama_model_n_vocab.restype = c_int


# LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
# LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
def llama_model_n_ctx(model: llama_model_p) -> int:
return _lib.llama_model_n_ctx(model)

@@ -560,7 +569,16 @@ def llama_model_n_ctx(model: llama_model_p) -> int:
_lib.llama_model_n_ctx.restype = c_int


# LLAMA_API int llama_model_n_embd (const struct llama_model * model);
# LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
def llama_model_n_ctx_train(model: llama_model_p) -> int:
return _lib.llama_model_n_ctx_train(model)


_lib.llama_model_n_ctx_train.argtypes = [llama_model_p]
_lib.llama_model_n_ctx_train.restype = c_int


# LLAMA_API int llama_model_n_embd (const struct llama_model * model);
def llama_model_n_embd(model: llama_model_p) -> int:
return _lib.llama_model_n_embd(model)
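The two n_ctx_train bindings added in this hunk expose the context length the model was trained with, alongside the existing runtime n_ctx. A minimal usage sketch, not part of the diff, assuming ctx is a llama_context_p obtained through the existing low-level API:

from llama_cpp import llama_cpp

# Compare the runtime context window with the model's training context window.
n_ctx = llama_cpp.llama_n_ctx(ctx)
n_ctx_train = llama_cpp.llama_n_ctx_train(ctx)
if n_ctx > n_ctx_train:
    print(f"warning: n_ctx ({n_ctx}) exceeds training context ({n_ctx_train})")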

@@ -1046,74 +1064,14 @@ def llama_grammar_free(grammar: llama_grammar_p):
_lib.llama_grammar_free.argtypes = [llama_grammar_p]
_lib.llama_grammar_free.restype = None

# //
# // Beam search
# //


# struct llama_beam_view {
# const llama_token * tokens;
# size_t n_tokens;
# float p; // Cumulative beam probability (renormalized relative to all beams)
# bool eob; // Callback should set this to true when a beam is at end-of-beam.
# };
class llama_beam_view(ctypes.Structure):
_fields_ = [
("tokens", llama_token_p),
("n_tokens", c_size_t),
("p", c_float),
("eob", c_bool),
]

# LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
def llama_grammar_copy(grammar: llama_grammar_p) -> llama_grammar_p:
return _lib.llama_grammar_copy(grammar)

# // Passed to beam_search_callback function.
# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
# // These pointers are valid only during the synchronous callback, so should not be saved.
# struct llama_beams_state {
# struct llama_beam_view * beam_views;
# size_t n_beams; // Number of elements in beam_views[].
# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
# bool last_call; // True iff this is the last callback invocation.
# };
class llama_beams_state(ctypes.Structure):
_fields_ = [
("beam_views", POINTER(llama_beam_view)),
("n_beams", c_size_t),
("common_prefix_length", c_size_t),
("last_call", c_bool),
]


# // Type of pointer to the beam_search_callback function.
# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)


# /// @details Deterministically returns entire sentence constructed by a beam search.
# /// @param ctx Pointer to the llama_context.
# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
# /// @param callback_data A pointer that is simply passed back to callback.
# /// @param n_beams Number of beams to use.
# /// @param n_past Number of tokens already evaluated.
# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
# /// @param n_threads Number of threads as passed to llama_eval().
# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
def llama_beam_search(
ctx: llama_context_p,
callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore
callback_data: c_void_p,
n_beams: c_size_t,
n_past: c_int,
n_predict: c_int,
n_threads: c_int,
):
return _lib.llama_beam_search(
ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
)

_lib.llama_grammar_copy.argtypes = [llama_grammar_p]
_lib.llama_grammar_copy.restype = llama_grammar_p
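llama_grammar_copy duplicates a grammar's current parse state. A hedged sketch of how it might be used, continuing with the module import above and assuming grammar was created with llama_grammar_init:

# Fork the grammar state so a second sequence can be constrained independently.
grammar2 = llama_cpp.llama_grammar_copy(grammar)
# ... accept tokens on grammar and grammar2 separately ...
llama_cpp.llama_grammar_free(grammar2)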

# //
# // Sampling functions
@@ -1436,6 +1394,74 @@ def llama_grammar_accept_token(
llama_token,
]
_lib.llama_grammar_accept_token.restype = None
# //
# // Beam search
# //


# struct llama_beam_view {
# const llama_token * tokens;
# size_t n_tokens;
# float p; // Cumulative beam probability (renormalized relative to all beams)
# bool eob; // Callback should set this to true when a beam is at end-of-beam.
# };
class llama_beam_view(ctypes.Structure):
_fields_ = [
("tokens", llama_token_p),
("n_tokens", c_size_t),
("p", c_float),
("eob", c_bool),
]


# // Passed to beam_search_callback function.
# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
# // These pointers are valid only during the synchronous callback, so should not be saved.
# struct llama_beams_state {
# struct llama_beam_view * beam_views;
# size_t n_beams; // Number of elements in beam_views[].
# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
# bool last_call; // True iff this is the last callback invocation.
# };
class llama_beams_state(ctypes.Structure):
_fields_ = [
("beam_views", POINTER(llama_beam_view)),
("n_beams", c_size_t),
("common_prefix_length", c_size_t),
("last_call", c_bool),
]


# // Type of pointer to the beam_search_callback function.
# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)


# /// @details Deterministically returns entire sentence constructed by a beam search.
# /// @param ctx Pointer to the llama_context.
# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
# /// @param callback_data A pointer that is simply passed back to callback.
# /// @param n_beams Number of beams to use.
# /// @param n_past Number of tokens already evaluated.
# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
# /// @param n_threads Number of threads as passed to llama_eval().
# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
def llama_beam_search(
ctx: llama_context_p,
callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore
callback_data: c_void_p,
n_beams: c_size_t,
n_past: c_int,
n_predict: c_int,
n_threads: c_int,
):
return _lib.llama_beam_search(
ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
)
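A hedged sketch of driving the beam-search binding from Python; ctx and n_past are assumed to exist, the beam/token/thread counts are illustrative, and the callback only inspects the beams:

from llama_cpp import llama_cpp

@llama_cpp.llama_beam_search_callback_fn_t
def show_beams(callback_data, beams_state):
    # beams_state is a llama_beams_state passed by value; report each beam's
    # length and cumulative probability. A real callback would also mark
    # finished beams by setting eob through beams_state.beam_views.
    for i in range(beams_state.n_beams):
        view = beams_state.beam_views[i]
        print(f"beam {i}: n_tokens={view.n_tokens} p={view.p:.4f}")

llama_cpp.llama_beam_search(
    ctx,         # existing llama_context_p (assumed)
    show_beams,  # invoked once per iteration of the beam-search loop
    None,        # callback_data, passed back to the callback unchanged
    4,           # n_beams
    n_past,      # number of tokens already evaluated (assumed)
    64,          # n_predict
    4,           # n_threads
)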


# Performance information

@@ -1494,6 +1520,7 @@ def llama_log_set(
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
return _lib.llama_dump_timing_info_yaml(stream, ctx)


_lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
_lib.llama_dump_timing_info_yaml.restype = None

2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 54 files
+5 −0 .clang-tidy
+1 −1 .devops/full-cuda.Dockerfile
+1 −1 .devops/main-cuda.Dockerfile
+3 −0 .editorconfig
+0 −1 .github/workflows/build.yml
+36 −0 .github/workflows/code-coverage.yml
+23 −14 .gitignore
+91 −25 CMakeLists.txt
+195 −124 Makefile
+35 −4 Package.swift
+49 −48 README.md
+14 −0 codecov.yml
+244 −94 common/common.cpp
+40 −1 common/common.h
+1 −0 common/grammar-parser.cpp
+14 −14 common/log.h
+4 −4 convert-falcon-hf-to-gguf.py
+133 −35 convert-llama-ggml-to-gguf.py
+35 −12 convert.py
+1 −0 examples/CMakeLists.txt
+0 −4 examples/beam-search/beam-search.cpp
+4 −4 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+1 −6 examples/embd-input/embd-input-lib.cpp
+7 −6 examples/embedding/embedding.cpp
+21 −21 examples/gguf/gguf.cpp
+34 −34 examples/gptneox-wip/falcon-main.cpp
+39 −38 examples/gptneox-wip/gptneox-main.cpp
+26 −21 examples/llama-bench/llama-bench.cpp
+38 −143 examples/main/main.cpp
+8 −7 examples/perplexity/perplexity.cpp
+1 −1 examples/quantize-stats/quantize-stats.cpp
+3 −4 examples/quantize/quantize.cpp
+2 −2 examples/save-load-state/save-load-state.cpp
+2,005 −1,939 examples/server/index.html.hpp
+37 −4 examples/server/public/index.html
+54 −44 examples/server/server.cpp
+0 −4 examples/simple/simple.cpp
+8 −0 examples/speculative/CMakeLists.txt
+288 −0 examples/speculative/speculative.cpp
+10 −36 examples/train-text-from-scratch/train-text-from-scratch.cpp
+4 −0 flake.nix
+101 −24 ggml-alloc.c
+97 −41 ggml-cuda.cu
+56 −15 ggml-metal.m
+232 −136 ggml-metal.metal
+7 −7 ggml-opencl.cpp
+31 −55 ggml.c
+2 −2 gguf-py/gguf/gguf.py
+1 −1 gguf-py/pyproject.toml
+34 −0 grammars/json_arr.gbnf
+43 −6 k_quants.c
+135 −67 llama.cpp
+10 −6 llama.h
+1 −1 tests/test-quantize-perf.cpp
