diff --git a/stable_audio_open_colab.ipynb b/stable_audio_open_colab.ipynb new file mode 100644 index 0000000..bfa9a98 --- /dev/null +++ b/stable_audio_open_colab.ipynb @@ -0,0 +1,548 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "authorship_tag": "ABX9TyOQLO4/4GSQapVQeg1vezzz", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "d621e5ab73d24c0f85aca1e0a20d3e89": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6c7808a91a3e4f72a1248832b985020c", + "IPY_MODEL_13e9dac474494304b279d662de87d8f4", + "IPY_MODEL_5d116c165c3841248f360b00ba35fffe" + ], + "layout": "IPY_MODEL_d700622dab8e41e4b3d3acc10ff42ba2" + } + }, + "6c7808a91a3e4f72a1248832b985020c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d88d7dc67534a1fb6e1e15c9c5b937b", + "placeholder": "​", + "style": "IPY_MODEL_480f4371362742d2952ffbbd986ec729", + "value": "100%" + } + }, + "13e9dac474494304b279d662de87d8f4": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f6aef59a2db049c39ee874ba3cfc4798", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_54b55dbf5a5d4a2e9afe5c7dc1e26d7d", + "value": 100 + } + }, + "5d116c165c3841248f360b00ba35fffe": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5b2378a037846528a84c0a39547ffc2", + "placeholder": "​", + "style": "IPY_MODEL_f7659f9f7fe14dff8577d6f904e3a108", + "value": " 100/100 [00:34<00:00,  2.91it/s]" + } + }, + "d700622dab8e41e4b3d3acc10ff42ba2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d88d7dc67534a1fb6e1e15c9c5b937b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "480f4371362742d2952ffbbd986ec729": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": 
"1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f6aef59a2db049c39ee874ba3cfc4798": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "54b55dbf5a5d4a2e9afe5c7dc1e26d7d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": 
null, + "description_width": "" + } + }, + "e5b2378a037846528a84c0a39547ffc2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7659f9f7fe14dff8577d6f904e3a108": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install -q einops stable_audio_tools" 
+ ], + "metadata": { + "id": "ULbF5y9eoeCN" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "GM27lXQuobHw" + }, + "outputs": [], + "source": [ + "import torch\n", + "import torchaudio\n", + "from einops import rearrange\n", + "from stable_audio_tools import get_pretrained_model\n", + "from stable_audio_tools.inference.generation import generate_diffusion_cond" + ] + }, + { + "cell_type": "code", + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "# Download model\n", + "model, model_config = get_pretrained_model(\"stabilityai/stable-audio-open-1.0\")\n", + "sample_rate = model_config[\"sample_rate\"]\n", + "sample_size = model_config[\"sample_size\"]\n", + "\n", + "model = model.to(device)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JQSox2_JrjSm", + "outputId": "d08c9d02-4223-4ea3-b078-ce1ae384fb34" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Set up text and timing conditioning\n", + "conditioning = [{\n", + " \"prompt\": \"The sound of the piano keys being pressed, the soft melody that follows, and the gentle hum of the bass create a soothing atmosphere that envelops the listener. It's as if the music is a warm embrace, inviting you to relax and unwind. The rhythm is slow and steady, like a heartbeat, and the notes dance together in perfect harmony. 
It's a symphony of peace and tranquility, a lullaby for the soul.\",\n", + " \"seconds_start\": 0,\n", + " \"seconds_total\": 45\n", + "}]" + ], + "metadata": { + "id": "K87sVH9jtBB9" + }, + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Generate stereo audio\n", + "output = generate_diffusion_cond(\n", + " model,\n", + " steps=100,\n", + " cfg_scale=7,\n", + " conditioning=conditioning,\n", + " sample_size=sample_size,\n", + " sigma_min=0.3,\n", + " sigma_max=500,\n", + " sampler_type=\"dpmpp-3m-sde\",\n", + " device=device\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 181, + "referenced_widgets": [ + "d621e5ab73d24c0f85aca1e0a20d3e89", + "6c7808a91a3e4f72a1248832b985020c", + "13e9dac474494304b279d662de87d8f4", + "5d116c165c3841248f360b00ba35fffe", + "d700622dab8e41e4b3d3acc10ff42ba2", + "9d88d7dc67534a1fb6e1e15c9c5b937b", + "480f4371362742d2952ffbbd986ec729", + "f6aef59a2db049c39ee874ba3cfc4798", + "54b55dbf5a5d4a2e9afe5c7dc1e26d7d", + "e5b2378a037846528a84c0a39547ffc2", + "f7659f9f7fe14dff8577d6f904e3a108" + ] + }, + "id": "vV1V-15ktC07", + "outputId": "e319f4a7-8db7-4c0c-e582-a0777f90868c" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "384734133\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/100 [00:00<?, ?it/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "d621e5ab73d24c0f85aca1e0a20d3e89" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Rearrange audio batch to a single sequence\n", + "output = rearrange(output, \"b d n -> d (b n)\")" + ], + "metadata": { + "id": "jNH-zKlZtEk5" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Peak normalize, clip, convert to int16, and save to file\n", + "output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()\n", + "torchaudio.save(\"output.wav\", output, sample_rate)" + ], + "metadata": { + "id": "iwvWXYMjtF0B" + }, + "execution_count": 13, + "outputs": [] + } + ] +} \ No newline at end of file