From 5516278831bc5118bf3fb93203cdbca09ea0bfc3 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Thu, 23 Jan 2025 12:40:37 +0000
Subject: [PATCH] Introduce ramalama bench

Allows benchmarking of models, GPU stacks, etc.

Signed-off-by: Eric Curtin
---
 docs/ramalama-bench.1.md | 44 ++++++++++++++++++++++++++++++++++++++++
 docs/ramalama.1.md | 1 +
 ramalama/cli.py | 12 +++++++++++
 ramalama/model.py | 31 ++++++++++++++++++++++------
 4 files changed, 82 insertions(+), 6 deletions(-)
 create mode 100644 docs/ramalama-bench.1.md

diff --git a/docs/ramalama-bench.1.md b/docs/ramalama-bench.1.md
new file mode 100644
index 00000000..498c1750
--- /dev/null
+++ b/docs/ramalama-bench.1.md
@@ -0,0 +1,44 @@
+% ramalama-bench 1
+
+## NAME
+ramalama\-bench - benchmark specified AI Model
+
+## SYNOPSIS
+**ramalama bench** [*options*] *model* [arg ...]
+
+## MODEL TRANSPORTS
+
+| Transports | Prefix | Web Site |
+| ------------- | ------ | --------------------------------------------------- |
+| URL based | https://, http://, file:// | `https://web.site/ai.model`, `file://tmp/ai.model`|
+| HuggingFace | huggingface://, hf://, hf.co/ | [`huggingface.co`](https://www.huggingface.co) |
+| Ollama | ollama:// | [`ollama.com`](https://www.ollama.com) |
+| OCI Container Registries | oci:// | [`opencontainers.org`](https://opencontainers.org)|
+||| Examples: [`quay.io`](https://quay.io), [`Docker Hub`](https://docker.io), [`Artifactory`](https://artifactory.com)|
+
+RamaLama defaults to the Ollama registry transport. This default can be overridden in the `ramalama.conf` file or via the RAMALAMA_TRANSPORT
+environment variable. For example, `export RAMALAMA_TRANSPORT=huggingface` switches RamaLama to the Hugging Face transport.
+
+Modify an individual model's transport by prepending the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, or `file://` prefix to the model name.
+
+URL support means that a model hosted on a web site, or stored on the local system, can be run directly.
+
+## OPTIONS
+
+#### **--help**, **-h**
+show this help message and exit
+
+## DESCRIPTION
+Benchmark specified AI Model.
+
+## EXAMPLES
+
+```
+ramalama bench granite-moe3
+```
+
+## SEE ALSO
+**[ramalama(1)](ramalama.1.md)**
+
+## HISTORY
+Jan 2025, Originally compiled by Eric Curtin
diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index a26830d8..35f87c4f 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -129,6 +129,7 @@ show RamaLama version
 | Command | Description |
 | ------------------------------------------------- | ---------------------------------------------------------- |
 | [ramalama-containers(1)](ramalama-containers.1.md)| list all RamaLama containers |
+| [ramalama-bench(1)](ramalama-bench.1.md) | benchmark specified AI Model |
 | [ramalama-convert(1)](ramalama-convert.1.md) | convert AI Models from local storage to OCI Image |
 | [ramalama-info(1)](ramalama-info.1.md) | Display RamaLama configuration information |
 | [ramalama-list(1)](ramalama-list.1.md) | list all downloaded AI Models |
diff --git a/ramalama/cli.py b/ramalama/cli.py
index b7c69cf2..e069bc9c 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -221,6 +221,7 @@ def configure_subcommands(parser):
     subparsers = parser.add_subparsers(dest="subcommand")
     subparsers.required = False
     help_parser(subparsers)
+    bench_parser(subparsers)
     containers_parser(subparsers)
     convert_parser(subparsers)
     info_parser(subparsers)
@@ -370,6 +371,17 @@ def list_files_by_modification():
     return sorted(models, key=lambda p: os.path.getmtime(p), reverse=True)
 
 
+def bench_cli(args):
+    model = New(args.MODEL, args)
+    model.bench(args)
+
+
+def bench_parser(subparsers):
+    parser = subparsers.add_parser("bench", aliases=["benchmark"], help="benchmark specified AI Model")
+    parser.add_argument("MODEL")  # positional argument
+    parser.set_defaults(func=bench_cli)
+
+
 def containers_parser(subparsers):
     parser = subparsers.add_parser("containers", aliases=["ps"], help="list all RamaLama containers")
     parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS)
diff --git a/ramalama/model.py b/ramalama/model.py
index 742e77bc..775af5f5 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -188,7 +188,7 @@ def setup_container(self, args):
             conman_args += ["-e", f"{k}={v}"]
         return conman_args
 
-    def gpu_args(self, force=False, server=False):
+    def gpu_args(self, force=False, runner=False):
         gpu_args = []
         if (
             force
@@ -197,10 +197,10 @@
             or os.getenv("CUDA_VISIBLE_DEVICES")
             or platform.machine() in {"aarch64", "arm64"}  # linux and macOS report aarch64 differently
         ):
-            if server:
-                gpu_args += ["-ngl"]  # single dash
+            if runner:
+                gpu_args += ["--ngl"]  # double dash
             else:
-                gpu_args += ["--ngl"]  # double dash
+                gpu_args += ["-ngl"]  # single dash
 
         gpu_args += ["999"]
 
@@ -228,6 +228,12 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         exec_cmd(conman_args, debug=args.debug)
         return True
 
+    def bench(self, args):
+        self.check_name_and_container(args)
+        model_path = self.get_model_path(args)
+        exec_args = self.build_exec_args_bench(args, model_path)
+        self.execute_model(model_path, exec_args, args)
+
     def run(self, args):
         self.check_name_and_container(args)
         prompt = self.build_prompt(args)
@@ -261,6 +267,19 @@ def get_model_path(self, args):
 
         return model_path
 
+    def build_exec_args_bench(self, args, model_path):
+        exec_model_path = MNT_FILE if args.container else model_path
+        exec_args = ["llama-bench"]
+
+        get_gpu()
+        gpu_args = self.gpu_args(force=args.gpu)
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
+
+        exec_args += ["-m", exec_model_path]
+
+        return exec_args
+
     def build_exec_args_run(self, args, model_path, prompt):
         exec_model_path = model_path if not args.container else MNT_FILE
         exec_args = ["llama-run", "-c", f"{args.context}", "--temp", f"{args.temp}"]
@@ -272,7 +291,7 @@
             exec_args += ["-v"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(force=args.gpu, runner=True)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -337,7 +356,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
             exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
         else:
             get_gpu()
-            gpu_args = self.gpu_args(force=args.gpu, server=True)
+            gpu_args = self.gpu_args(force=args.gpu)
             if gpu_args is not None:
                 exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
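
Review note: a minimal standalone sketch of the argument assembly that the new `build_exec_args_bench()` performs, useful for eyeballing the resulting command line without running the CLI. The helper name `build_bench_args`, the `/mnt/models/model.file` mount path, and the sample model path are illustrative assumptions, not values taken from this patch; the real code path also goes through `get_gpu()`, container setup, and `exec_cmd()`, which are omitted here.

```python
# Standalone sketch (not part of the patch): mirrors how the bench
# command line is assembled. MNT_FILE's real value comes from ramalama's
# constants; "/mnt/models/model.file" is only a placeholder.
MNT_FILE = "/mnt/models/model.file"


def build_bench_args(model_path, container, gpu_offload):
    # When running inside a container, the model is bind-mounted at MNT_FILE.
    exec_model_path = MNT_FILE if container else model_path

    exec_args = ["llama-bench"]
    if gpu_offload:
        # llama-bench takes the single-dash -ngl spelling; llama-run uses --ngl,
        # which is why gpu_args() grows the runner= switch in this patch.
        exec_args += ["-ngl", "999"]

    exec_args += ["-m", exec_model_path]
    return exec_args


if __name__ == "__main__":
    # e.g. ['llama-bench', '-ngl', '999', '-m', '/mnt/models/model.file']
    print(build_bench_args("/path/to/granite-moe3.gguf", container=True, gpu_offload=True))
```

With that in place, `ramalama bench granite-moe3` resolves the model through the usual transports and hands the assembled `llama-bench` invocation to the same container/exec plumbing that `ramalama run` already uses.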