Merge pull request #720 from basetenlabs/bump-version-0.7.15
Release 0.7.15
bolasim authored Nov 7, 2023
2 parents dec7081 + d58107d commit 8966035
Showing 32 changed files with 710 additions and 352 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml
@@ -33,7 +33,7 @@ jobs:
matrix:
python_version: ["3.8", "3.9", "3.10", "3.11"]
use_gpu: ["y", "n"]
job_type: ["server", "training"]
job_type: ["server"]
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -36,7 +36,7 @@ jobs:
matrix:
python_version: ["3.8", "3.9", "3.10", "3.11"]
use_gpu: ["y", "n"]
job_type: ["server", "training"]
job_type: ["server"]
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
7 changes: 6 additions & 1 deletion bin/generate_base_images.py
@@ -104,13 +104,18 @@ def _build(
"docker",
"buildx",
"build",
"--platform=linux/amd64",
"--platform=linux/arm64,linux/amd64",
".",
"-t",
image_with_tag,
]
if push:
cmd.append("--push")

# Needed to support multi-arch build.
subprocess.run(
["docker", "buildx", "create", "--use"], cwd=build_ctx_path, check=True
)
subprocess.run(cmd, cwd=build_ctx_path, check=True)


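For reference, here is a condensed, standalone sketch of the multi-arch build flow this change enables. The image tag and build-context path below are placeholders, not values from the script.

```python
# Condensed sketch of the updated build flow (placeholder tag and context path).
import subprocess

image_with_tag = "example/truss-base:latest"  # placeholder
build_ctx_path = "."                          # placeholder

# A buildx builder instance must be active for multi-arch builds.
subprocess.run(["docker", "buildx", "create", "--use"], cwd=build_ctx_path, check=True)

# Build for both arm64 and amd64; append "--push" to publish the manifest list.
subprocess.run(
    [
        "docker", "buildx", "build",
        "--platform=linux/arm64,linux/amd64",
        ".", "-t", image_with_tag,
    ],
    cwd=build_ctx_path,
    check=True,
)
```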
20 changes: 10 additions & 10 deletions docker/base_images/base_image.Dockerfile.jinja
@@ -1,19 +1,19 @@
{% if use_gpu %}
FROM nvidia/cuda:11.2.1-base-ubuntu20.04
ENV CUDNN_VERSION=8.1.0.77
ENV CUDA=11.2
FROM nvidia/cuda:12.2.2-base-ubuntu20.04
ENV CUDNN_VERSION=8.9.5.29
ENV CUDA=12.2
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH

RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
cuda-command-line-tools-11-2 \
libcublas-11-2 \
libcublas-dev-11-2 \
libcufft-11-2 \
libcurand-11-2 \
libcusolver-11-2 \
libcusparse-11-2 \
cuda-command-line-tools-12-2 \
libcublas-12-2 \
libcublas-dev-12-2 \
libcufft-12-2 \
libcurand-12-2 \
libcusolver-12-2 \
libcusparse-12-2 \
libcudnn8=${CUDNN_VERSION}-1+cuda${CUDA} \
libgomp1 \
&& \
14 changes: 7 additions & 7 deletions docs/_snippets/config-params.mdx
@@ -231,36 +231,36 @@ Either `VLLM` for vLLM, or `TGI` for TGI.
The arguments for the model server. This includes information such as which model you intend to load, and
which endpoint from the server you'd like to use.

### `hf_cache`
### `model_cache`

The `hf_cache` section is used for caching model weights at build-time. This is one of the biggest levers
The `model_cache` section is used for caching model weights at build-time. This is one of the biggest levers
for decreasing cold start times, as downloading weights can be one of the lengthiest parts of starting a new
model instance. Using this section ensures that model weights are cached at _build_ time.

See the [model cache guide](guides/model-cache) for the full details on how to use this field.

<Note>
Despite the fact that this field is called the `hf_cache`, there are multiple backends supported, not just Hugging Face. You can
Despite the fact that this field is called the `model_cache`, there are multiple backends supported, not just Hugging Face. You can
also cache weights stored on GCS, for instance.
</Note>

#### `hf_cache.<list_item>.repo_id`
#### `model_cache.<list_item>.repo_id`

The endpoint for your cloud bucket. Currently, we support Hugging Face and Google Cloud Storage.

Example: `madebyollin/sdxl-vae-fp16-fix` for a Hugging Face repo, or `gcs://path-to-my-bucket` for
a GCS bucket.

#### `hf_cache.<list_item>.revision`
#### `model_cache.<list_item>.revision`

Points to your revision. This is only relevant if you want to pin a specific revision; by default, it refers to `main`.

#### `hf_cache.<list_item>.allow_patterns`
#### `model_cache.<list_item>.allow_patterns`

Only cache files that match specified patterns. Utilize Unix shell-style wildcards to denote these patterns.
By default, all paths are included.

#### `hf_cache.<list_item>.ignore_patterns`
#### `model_cache.<list_item>.ignore_patterns`

Conversely, you can also denote file patterns to ignore, hence streamlining the caching process.
By default, nothing is ignored.
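As an aside, these pattern fields follow standard Hugging Face download filters. The sketch below illustrates how `repo_id`, `revision`, `allow_patterns`, and `ignore_patterns` combine, expressed with `huggingface_hub.snapshot_download` — not necessarily what Truss runs internally — and the pattern values are examples only.

```python
# Illustration of the model_cache fields for a Hugging Face repo, expressed with
# huggingface_hub. The patterns shown are examples, not required values.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="madebyollin/sdxl-vae-fp16-fix",          # model_cache.<list_item>.repo_id
    revision="main",                                  # model_cache.<list_item>.revision
    allow_patterns=["config.json", "*.safetensors"],  # only fetch matching files
    ignore_patterns=["*.bin"],                        # skip these even if allowed above
)
```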
2 changes: 1 addition & 1 deletion docs/examples/04-image-generation.mdx
@@ -215,7 +215,7 @@ subsequently.

To enable caching, add the following to the config:
```yaml
hf_cache:
model_cache:
- repo_id: madebyollin/sdxl-vae-fp16-fix
allow_patterns:
- config.json
8 changes: 4 additions & 4 deletions docs/examples/06-high-performance-cached-weights.mdx
@@ -89,9 +89,9 @@ requirements:
- sentencepiece==0.1.99
- protobuf==4.24.4
```
# Configuring the hf_cache
# Configuring the model_cache
To cache model weights, set the `hf_cache` key.
To cache model weights, set the `model_cache` key.
The `repo_id` field allows you to specify a Huggingface
repo to pull down and cache at build-time, and the `ignore_patterns`
field allows you to specify files to ignore. If this is specified, then
@@ -100,7 +100,7 @@ this repo won't have to be pulled during runtime.
Check out the [guide](https://truss.baseten.co/guides/model-cache) for more info.

```yaml config.yaml
hf_cache:
model_cache:
- repo_id: "NousResearch/Llama-2-7b-chat-hf"
ignore_patterns:
- "*.bin"
@@ -197,7 +197,7 @@ requirements:
- transformers==4.34.0
- sentencepiece==0.1.99
- protobuf==4.24.4
hf_cache:
model_cache:
- repo_id: "NousResearch/Llama-2-7b-chat-hf"
ignore_patterns:
- "*.bin"
8 changes: 4 additions & 4 deletions docs/examples/performance/cached-weights.mdx
@@ -3,7 +3,7 @@ title: Deploy Llama 2 with Caching
description: "Enable fast cold starts for a model with private Hugging Face weights"
---

In this example, we will cover how you can use the `hf_cache` key in your Truss's `config.yml` to automatically bundle model weights from a private Hugging Face repo.
In this example, we will cover how you can use the `model_cache` key in your Truss's `config.yml` to automatically bundle model weights from a private Hugging Face repo.

<Tip>
Bundling model weights can significantly reduce cold start times because your instance won't waste time downloading the model weights from Hugging Face's servers.
@@ -116,10 +116,10 @@ Always pin exact versions for your Python dependencies. The ML/AI space moves fa
### Step 3: Configure Hugging Face caching
Finally, we can configure Hugging Face caching in `config.yaml` by adding the `hf_cache` key. When building the image for your Llama 2 deployment, the Llama 2 model weights will be downloaded and cached for future use.
Finally, we can configure Hugging Face caching in `config.yaml` by adding the `model_cache` key. When building the image for your Llama 2 deployment, the Llama 2 model weights will be downloaded and cached for future use.

```yaml config.yaml
hf_cache:
model_cache:
- repo_id: "meta-llama/Llama-2-7b-chat-hf"
ignore_patterns:
- "*.bin"
@@ -163,7 +163,7 @@ requirements:
- safetensors==0.3.2
- torch==2.0.1
- transformers==4.30.2
hf_cache:
model_cache:
- repo_id: "NousResearch/Llama-2-7b-chat-hf"
ignore_patterns:
- "*.bin"
92 changes: 84 additions & 8 deletions docs/guides/model-cache.mdx
@@ -18,17 +18,20 @@ In practice, this reduces the cold start for large models to just a few seconds.

## Enabling Caching for a Model

To enable caching, simply add `hf_cache` to your `config.yml` with a valid `repo_id`. The `hf_cache` has a few key configurations:
To enable caching, simply add `model_cache` to your `config.yml` with a valid `repo_id`. The `model_cache` has a few key configurations:
- `repo_id` (required): The endpoint for your cloud bucket. Currently, we support Hugging Face and Google Cloud Storage.
- `revision`: Points to your revision. This is only relevant if you want to pin a specific revision; by default, it refers to `main`.
- `allow_patterns`: Only cache files that match specified patterns. Utilize Unix shell-style wildcards to denote these patterns.
- `ignore_patterns`: Conversely, you can also denote file patterns to ignore, hence streamlining the caching process.

Here is an example of a well written `hf_cache` for Stable Diffusion XL. Note how it only pulls the model weights that it needs using `allow_patterns`.
<Info>We recently renamed `hf_cache` to `model_cache`, but don't worry! If you're using `hf_cache` in any of your projects, it will automatically be aliased to `model_cache`.</Info>


Here is an example of a well written `model_cache` for Stable Diffusion XL. Note how it only pulls the model weights that it needs using `allow_patterns`.

```yaml config.yml
...
hf_cache:
model_cache:
- repo_id: madebyollin/sdxl-vae-fp16-fix
allow_patterns:
- config.json
@@ -51,7 +54,7 @@ Many Hugging Face repos have model weights in different formats (`.bin`, `.safet
There are also some additional steps depending on the cloud bucket you want to query.

### Hugging Face 🤗
For any public Hugging Face repo, you don't need to do anything else. Adding the `hf_cache` key with an appropriate `repo_id` should be enough.
For any public Hugging Face repo, you don't need to do anything else. Adding the `model_cache` key with an appropriate `repo_id` should be enough.

However, if you want to deploy a model from a gated repo like [Llama 2](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) to Baseten, there are a few steps you need to take:
<Steps>
@@ -86,15 +89,17 @@ Weights will be cached in the default Hugging Face cache directory, `~/.cache/hu
### Google Cloud Storage
Google Cloud Storage is a great alternative to Hugging Face when you have a custom model or fine-tune you want to gate, especially if you are already using GCP and care about security and compliance.

Your `hf_cache` should look something like this:
Your `model_cache` should look something like this:

```yaml config.yml
...
hf_cache:
model_cache:
repo_id: gcs://path-to-my-bucket
...
```

If you are accessing a public GCS bucket, you can ignore the following steps, but make sure you set appropriate permissions on your bucket. Users should be able to list and view all files. Otherwise, the model build will fail.

For a private GCS bucket, first export your service account key. Rename it to be `service_account.json` and add it to the `data` directory of your Truss.

Your file structure should look something like this:
@@ -111,9 +116,80 @@ your-truss
If you are using version control, like git, for your Truss, make sure to add `service_account.json` to your `.gitignore` file. You don't want to accidentally expose your service account key.
</Warning>

Weights will be cached at `/app/hf_cache/{your_bucket_name}`.
Weights will be cached at `/app/model_cache/{your_bucket_name}`.
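As an optional sanity check — not something Truss requires — you can confirm the exported key can actually list the bucket before building. The bucket name below is a placeholder, and the snippet assumes the `google-cloud-storage` package is installed.

```python
# Sanity-check the service account before building (placeholder bucket name).
from google.cloud import storage

client = storage.Client.from_service_account_json("data/service_account.json")
for blob in client.list_blobs("path-to-my-bucket"):  # bucket name only, no gcs:// prefix
    print(blob.name)
```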


### Amazon Web Services S3

Another popular cloud storage option for hosting model weights is AWS S3, especially if you're already using AWS services.

Your `model_cache` should look something like this:

```yaml config.yml
...
model_cache:
repo_id: s3://path-to-my-bucket
...
```

If you are accessing a public S3 bucket, you can ignore the subsequent steps, but make sure you set an appropriate policy on your bucket. Users should be able to list and view all files. Otherwise, the model build will fail.

However, for a private S3 bucket, you need to first find your `aws_access_key_id`, `aws_secret_access_key`, and `aws_region` in your AWS dashboard. Create a file named `s3_credentials.json`. Inside this file, add the credentials that you identified earlier as shown below. Place this file into the `data` directory of your Truss.
The key `aws_session_token` can be included, but is optional.

Here is an example of how your `s3_credentials.json` file should look:

```json
{
  "aws_access_key_id": "YOUR-ACCESS-KEY",
  "aws_secret_access_key": "YOUR-SECRET-ACCESS-KEY",
  "aws_region": "YOUR-REGION"
}
```

Your overall file structure should now look something like this:

```
your-truss
|--model
| └── model.py
|--data
| └── s3_credentials.json
```

When you are generating credentials, make sure that the resulting keys have at minimum the following IAM policy:

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "s3:GetObject",
        "s3:ListObjects"
      ],
      "Effect": "Allow",
      "Resource": ["arn:aws:s3:::S3_BUCKET/PATH_TO_MODEL/*"]
    },
    {
      "Action": [
        "s3:ListBucket"
      ],
      "Effect": "Allow",
      "Resource": ["arn:aws:s3:::S3_BUCKET"]
    }
  ]
}
```


<Warning>
If you are using version control, like git, for your Truss, make sure to add `s3_credentials.json` to your `.gitignore` file. You don't want to accidentally expose your AWS credentials.
</Warning>

Weights will be cached at `/app/model_cache/{your_bucket_name}`.
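Similarly, as an optional sanity check — not something Truss requires — you can verify the credentials can list the bucket before building. The bucket name below is a placeholder, and the snippet assumes `boto3` is installed.

```python
# Sanity-check the S3 credentials before building (placeholder bucket name).
import json

import boto3

with open("data/s3_credentials.json") as f:
    creds = json.load(f)

s3 = boto3.client(
    "s3",
    aws_access_key_id=creds["aws_access_key_id"],
    aws_secret_access_key=creds["aws_secret_access_key"],
    region_name=creds["aws_region"],
)

# Requires s3:ListBucket on the bucket; objects need s3:GetObject to be readable.
for obj in s3.list_objects_v2(Bucket="path-to-my-bucket").get("Contents", []):
    print(obj["Key"])
```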


### Other Buckets

We're currently workign on adding support for more bucket types, including AWS S3. If you have any suggestions, please [leave an issue](https://github.com/basetenlabs/truss/issues) on our GitHub repo.
We can work with you to support additional bucket types if needed. If you have any suggestions, please [leave an issue](https://github.com/basetenlabs/truss/issues) on our GitHub repo.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "truss"
version = "0.7.14"
version = "0.7.15"
description = "A seamless bridge from model development to model delivery"
license = "MIT"
readme = "README.md"