riffusion · hmartiro · Jan 16, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,10 @@ ignore_missing_imports = true
 module = "diffusers.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "lora_diffusion.*"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = "numpy.*"
 ignore_missing_imports = true

diff --git a/requirements.txt b/requirements.txt
@@ -18,3 +18,4 @@ torch
 torchaudio
 torchvision
 transformers
+git+https://github.com/cloneofsimo/lora.git
diff --git a/riffusion/external/lora/__init__.py b/riffusion/external/lora/__init__.py
diff --git a/riffusion/external/lora/run_lora_db_unet_only.sh b/riffusion/external/lora/run_lora_db_unet_only.sh
@@ -0,0 +1,24 @@
+export MODEL_NAME="riffusion/riffusion-model-v1"
+export INSTANCE_DIR="/tmp/sample_clips_tdlcqdfi/images"
+export OUTPUT_DIR="/home/ubuntu/lora_dreambooth_waterfalls_2k"
+
+accelerate launch\
+  --num_machines 1 \
+  --num_processes 8 \
+  --dynamo_backend=no \
+  --mixed_precision="fp16" \
+  riffusion/external/lora/train_lora_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="style of sks" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=2000
+
+# TODO try mixed_precision=fp16
+# TODO try num_processes = 8
diff --git a/riffusion/external/lora/train_lora.py b/riffusion/external/lora/train_lora.py
@@ -0,0 +1,49 @@
+from lora_diffusion.cli_lora_pti import train
+from lora_diffusion.dataset import STYLE_TEMPLATE
+
+MODEL_NAME = "riffusion/riffusion-model-v1"
+INSTANCE_DIR = "/tmp/sample_clips_xzv8p57g/images"
+OUTPUT_DIR = "./lora_output_acoustic"
+
+if __name__ == "__main__":
+    entries = [
+        "music in the style of {}",
+        "sound in the style of {}",
+        "vibe in the style of {}",
+        "audio in the style of {}",
+        "groove in the style of {}",
+    ]
+    for i in range(len(STYLE_TEMPLATE)):
+        STYLE_TEMPLATE[i] = entries[i % len(entries)]
+    print(STYLE_TEMPLATE)
+
+    train(
+        pretrained_model_name_or_path=MODEL_NAME,
+        instance_data_dir=INSTANCE_DIR,
+        output_dir=OUTPUT_DIR,
+        train_text_encoder=True,
+        resolution=512,
+        train_batch_size=1,
+        gradient_accumulation_steps=4,
+        scale_lr=True,
+        learning_rate_unet=1e-4,
+        learning_rate_text=1e-5,
+        learning_rate_ti=5e-4,
+        color_jitter=False,
+        lr_scheduler="linear",
+        lr_warmup_steps=0,
+        placeholder_tokens="<s1>|<s2>",
+        use_template="style",
+        save_steps=100,
+        max_train_steps_ti=1000,
+        max_train_steps_tuning=1000,
+        perform_inversion=True,
+        clip_ti_decay=True,
+        weight_decay_ti=0.000,
+        weight_decay_lora=0.001,
+        continue_inversion=True,
+        continue_inversion_lr=1e-4,
+        device="cuda:0",
+        lora_rank=1,
+        use_face_segmentation_condition=False,
+    )
diff --git a/riffusion/external/lora/train_lora.sh b/riffusion/external/lora/train_lora.sh
@@ -0,0 +1,37 @@
+export MODEL_NAME="riffusion/riffusion-model-v1"
+export INSTANCE_DIR="/tmp/sample_clips_xzv8p57g/images"
+export OUTPUT_DIR="./lora_output_acoustic"
+
+# Run the lora_pti training CLI with the settings below.
+# Keep comments out of the command: a comment line between backslash
+# continuations ends it, and the remaining flags would run as commands.
+# Notes: train_batch_size started as 1; lora_rank: 1 or 4?
+# Disabled: --color_jitter, --use_template="style",
+#   --use_face_segmentation_condition
+# Not set: initializer tokens, class prompt
+lora_pti \
+  --pretrained_model_name_or_path="$MODEL_NAME" \
+  --instance_data_dir="$INSTANCE_DIR" \
+  --output_dir="$OUTPUT_DIR" \
+  --train_text_encoder \
+  --resolution=512 \
+  --train_batch_size=4 \
+  --gradient_accumulation_steps=4 \
+  --scale_lr \
+  --learning_rate_unet=1e-4 \
+  --learning_rate_text=1e-5 \
+  --learning_rate_ti=5e-4 \
+  --lr_scheduler="linear" \
+  --lr_warmup_steps=0 \
+  --placeholder_tokens="<s>" \
+  --save_steps=100 \
+  --max_train_steps_ti=1000 \
+  --max_train_steps_tuning=1000 \
+  --perform_inversion=True \
+  --clip_ti_decay \
+  --weight_decay_ti=0.000 \
+  --weight_decay_lora=0.001 \
+  --continue_inversion \
+  --continue_inversion_lr=1e-4 \
+  --device="cuda:0" \
+  --lora_rank=4