From 5516278831bc5118bf3fb93203cdbca09ea0bfc3 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Thu, 23 Jan 2025 12:40:37 +0000
Subject: [PATCH] Introduce ramalama bench

Allows benchmarking of models, GPU stacks, etc.

Signed-off-by: Eric Curtin
---
 docs/ramalama-bench.1.md | 44 ++++++++++++++++++++++++++++++++++++++++
 docs/ramalama.1.md | 1 +
 ramalama/cli.py | 12 +++++++++++
 ramalama/model.py | 31 ++++++++++++++++++++++------
 4 files changed, 82 insertions(+), 6 deletions(-)
 create mode 100644 docs/ramalama-bench.1.md

diff --git a/docs/ramalama-bench.1.md b/docs/ramalama-bench.1.md
new file mode 100644
index 00000000..498c1750
--- /dev/null
+++ b/docs/ramalama-bench.1.md
@@ -0,0 +1,44 @@
+% ramalama-bench 1
+
+## NAME
+ramalama\-bench - benchmark specified AI Model
+
+## SYNOPSIS
+**ramalama bench** [*options*] *model* [arg ...]
+
+## MODEL TRANSPORTS
+
+| Transports | Prefix | Web Site |
+| ------------- | ------ | --------------------------------------------------- |
+| URL based | https://, http://, file:// | `https://web.site/ai.model`, `file://tmp/ai.model`|
+| HuggingFace | huggingface://, hf://, hf.co/ | [`huggingface.co`](https://www.huggingface.co) |
+| Ollama | ollama:// | [`ollama.com`](https://www.ollama.com) |
+| OCI Container Registries | oci:// | [`opencontainers.org`](https://opencontainers.org)|
+||| Examples: [`quay.io`](https://quay.io), [`Docker Hub`](https://docker.io), [`Artifactory`](https://artifactory.com)|
+
+RamaLama defaults to the Ollama registry transport. This default can be overridden in the `ramalama.conf` file or via the RAMALAMA_TRANSPORT
+environment variable. For example, `export RAMALAMA_TRANSPORT=huggingface` switches RamaLama to the Hugging Face transport.
+
+Modify an individual model's transport by prepending the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, or `file://` prefix to the model name.
+
+URL support means that a model hosted on a web site, or stored on the local system, can be run directly.
+
+## OPTIONS
+
+#### **--help**, **-h**
+show this help message and exit
+
+## DESCRIPTION
+Benchmark specified AI Model.
+
+## EXAMPLES
+
+```
+ramalama bench granite-moe3
+```
+
+## SEE ALSO
+**[ramalama(1)](ramalama.1.md)**
+
+## HISTORY
+Jan 2025, Originally compiled by Eric Curtin
diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index a26830d8..35f87c4f 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -129,6 +129,7 @@ show RamaLama version
 | Command | Description |
 | ------------------------------------------------- | ---------------------------------------------------------- |
 | [ramalama-containers(1)](ramalama-containers.1.md)| list all RamaLama containers |
+| [ramalama-bench(1)](ramalama-bench.1.md) | benchmark specified AI Model |
 | [ramalama-convert(1)](ramalama-convert.1.md) | convert AI Models from local storage to OCI Image |
 | [ramalama-info(1)](ramalama-info.1.md) | Display RamaLama configuration information |
 | [ramalama-list(1)](ramalama-list.1.md) | list all downloaded AI Models |
diff --git a/ramalama/cli.py b/ramalama/cli.py
index b7c69cf2..e069bc9c 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -221,6 +221,7 @@ def configure_subcommands(parser):
     subparsers = parser.add_subparsers(dest="subcommand")
     subparsers.required = False
     help_parser(subparsers)
+    bench_parser(subparsers)
     containers_parser(subparsers)
     convert_parser(subparsers)
     info_parser(subparsers)
@@ -370,6 +371,17 @@ def list_files_by_modification():
     return sorted(models, key=lambda p: os.path.getmtime(p), reverse=True)
 
 
+def bench_cli(args):
+    model = New(args.MODEL, args)
+    model.bench(args)
+
+
+def bench_parser(subparsers):
+    parser = subparsers.add_parser("bench", aliases=["benchmark"], help="benchmark specified AI Model")
+    parser.add_argument("MODEL")  # positional argument
+    parser.set_defaults(func=bench_cli)
+
+
 def containers_parser(subparsers):
     parser = subparsers.add_parser("containers", aliases=["ps"], help="list all RamaLama containers")
     parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS)
diff --git a/ramalama/model.py b/ramalama/model.py
index 742e77bc..775af5f5 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -188,7 +188,7 @@ def setup_container(self, args):
             conman_args += ["-e", f"{k}={v}"]
         return conman_args
 
-    def gpu_args(self, force=False, server=False):
+    def gpu_args(self, force=False, runner=False):
         gpu_args = []
         if (
             force
@@ -197,10 +197,10 @@
             or os.getenv("CUDA_VISIBLE_DEVICES")
             or platform.machine() in {"aarch64", "arm64"}  # linux and macOS report aarch64 differently
         ):
-            if server:
-                gpu_args += ["-ngl"]  # single dash
+            if runner:
+                gpu_args += ["--ngl"]  # double dash
             else:
-                gpu_args += ["--ngl"]  # double dash
+                gpu_args += ["-ngl"]  # single dash
 
         gpu_args += ["999"]
 
@@ -228,6 +228,12 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         exec_cmd(conman_args, debug=args.debug)
         return True
 
+    def bench(self, args):
+        self.check_name_and_container(args)
+        model_path = self.get_model_path(args)
+        exec_args = self.build_exec_args_bench(args, model_path)
+        self.execute_model(model_path, exec_args, args)
+
     def run(self, args):
         self.check_name_and_container(args)
         prompt = self.build_prompt(args)
@@ -261,6 +267,19 @@ def get_model_path(self, args):
 
         return model_path
 
+    def build_exec_args_bench(self, args, model_path):
+        exec_model_path = MNT_FILE if args.container else model_path
+        exec_args = ["llama-bench"]
+
+        get_gpu()
+        gpu_args = self.gpu_args(force=args.gpu)
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
+
+        exec_args += ["-m", exec_model_path]
+
+        return exec_args
+
     def build_exec_args_run(self, args, model_path, prompt):
         exec_model_path = model_path if not args.container else MNT_FILE
         exec_args = ["llama-run", "-c", f"{args.context}", "--temp", f"{args.temp}"]
@@ -272,7 +291,7 @@
             exec_args += ["-v"]
 
         get_gpu()
-        gpu_args = self.gpu_args(force=args.gpu)
+        gpu_args = self.gpu_args(force=args.gpu, runner=True)
         if gpu_args is not None:
             exec_args.extend(gpu_args)
 
@@ -337,7 +356,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
             exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
         else:
             get_gpu()
-            gpu_args = self.gpu_args(force=args.gpu, server=True)
+            gpu_args = self.gpu_args(force=args.gpu)
             if gpu_args is not None:
                 exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
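
Review note: a minimal standalone sketch of the argument assembly that the new `build_exec_args_bench()` performs, useful for eyeballing the resulting command line without running the CLI. The helper name `build_bench_args`, the `/mnt/models/model.file` mount path, and the sample model path are illustrative assumptions, not values taken from this patch; the real code path also goes through `get_gpu()`, container setup, and `exec_cmd()`, which are omitted here.

```python
# Standalone sketch (not part of the patch): mirrors how the bench
# command line is assembled. MNT_FILE's real value comes from ramalama's
# constants; "/mnt/models/model.file" is only a placeholder.
MNT_FILE = "/mnt/models/model.file"


def build_bench_args(model_path, container, gpu_offload):
    # When running inside a container, the model is bind-mounted at MNT_FILE.
    exec_model_path = MNT_FILE if container else model_path

    exec_args = ["llama-bench"]
    if gpu_offload:
        # llama-bench takes the single-dash -ngl spelling; llama-run uses --ngl,
        # which is why gpu_args() grows the runner= switch in this patch.
        exec_args += ["-ngl", "999"]

    exec_args += ["-m", exec_model_path]
    return exec_args


if __name__ == "__main__":
    # e.g. ['llama-bench', '-ngl', '999', '-m', '/mnt/models/model.file']
    print(build_bench_args("/path/to/granite-moe3.gguf", container=True, gpu_offload=True))
```

With that in place, `ramalama bench granite-moe3` resolves the model through the usual transports and hands the assembled `llama-bench` invocation to the same container/exec plumbing that `ramalama run` already uses.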