From ba7188995a749b405a3fad7d8610c1784de6ceb9 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Sun, 30 Apr 2023 19:39:52 +0300
Subject: [PATCH 1/9] add support for diffusers>=0.18 with xformers

---
 library/train_util.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/library/train_util.py b/library/train_util.py
index 8c6e34371..fbddaed7f 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -1766,7 +1766,7 @@ def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditio
     if mem_eff_attn:
         replace_unet_cross_attn_to_memory_efficient()
     elif xformers:
-        replace_unet_cross_attn_to_xformers()
+        replace_unet_cross_attn_to_xformers(unet)
 
 
 def replace_unet_cross_attn_to_memory_efficient():
@@ -1809,7 +1809,7 @@ def forward_flash_attn(self, x, context=None, mask=None):
     diffusers.models.attention.CrossAttention.forward = forward_flash_attn
 
 
-def replace_unet_cross_attn_to_xformers():
+def replace_unet_cross_attn_to_xformers(unet):
     print("Replace CrossAttention.forward to use xformers")
     try:
         import xformers.ops
@@ -1849,7 +1849,15 @@ def forward_xformers(self, x, context=None, mask=None):
         out = self.to_out[1](out)
         return out
 
-    diffusers.models.attention.CrossAttention.forward = forward_xformers
+    print( hasattr(unet, "enable_xformers_memory_efficient_attention"))
+    if hasattr(diffusers.models.attention, "CrossAttention") and \
+        hasattr(diffusers.models.attention.CrossAttention, "forward"):
+        diffusers.models.attention.CrossAttention.forward = forward_xformers
+    elif hasattr(unet, "enable_xformers_memory_efficient_attention"):
+        unet.enable_xformers_memory_efficient_attention(attention_op=xformers.ops.MemoryEfficientAttentionFlashAttentionOp)
+        print(unet.enable_xformers_memory_efficient_attention())
+    else:
+        print('Do nothing...')
 
 
 # endregion

From 41e5d33d91d57df0079cb7664300b434ba6043d8 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Sun, 30 Apr 2023 19:41:53 +0300
Subject: [PATCH 2/9] update

---
 library/train_util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/library/train_util.py b/library/train_util.py
index fbddaed7f..c8c2d232c 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -1849,7 +1849,6 @@ def forward_xformers(self, x, context=None, mask=None):
         out = self.to_out[1](out)
         return out
 
-    print( hasattr(unet, "enable_xformers_memory_efficient_attention"))
     if hasattr(diffusers.models.attention, "CrossAttention") and \
         hasattr(diffusers.models.attention.CrossAttention, "forward"):
         diffusers.models.attention.CrossAttention.forward = forward_xformers

From 0a489ae4cbb6970cac6b907ba710902f5e308052 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Sun, 30 Apr 2023 20:15:55 +0300
Subject: [PATCH 3/9] bugfix to support diffusers 0.15-0.16.1

---
 library/train_util.py | 7 +++----
 networks/lora.py      | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/library/train_util.py b/library/train_util.py
index c8c2d232c..fd8ec3304 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -1849,12 +1849,11 @@ def forward_xformers(self, x, context=None, mask=None):
         out = self.to_out[1](out)
         return out
 
-    if hasattr(diffusers.models.attention, "CrossAttention") and \
+    if diffusers.__version__ >= "0.12":
+        unet.enable_xformers_memory_efficient_attention(attention_op=xformers.ops.MemoryEfficientAttentionFlashAttentionOp)
+    elif hasattr(diffusers.models.attention, "CrossAttention") and \
         hasattr(diffusers.models.attention.CrossAttention, "forward"):
         diffusers.models.attention.CrossAttention.forward = forward_xformers
-    elif hasattr(unet, "enable_xformers_memory_efficient_attention"):
-        unet.enable_xformers_memory_efficient_attention(attention_op=xformers.ops.MemoryEfficientAttentionFlashAttentionOp)
-        print(unet.enable_xformers_memory_efficient_attention())
     else:
         print('Do nothing...')
 
diff --git a/networks/lora.py b/networks/lora.py
index 353b1f5ac..5e4c2e080 100644
--- a/networks/lora.py
+++ b/networks/lora.py
@@ -605,9 +605,9 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh
 
 class LoRANetwork(torch.nn.Module):
     NUM_OF_BLOCKS = 12  # フルモデル相当でのup,downの層の数
-
+    import diffusers
     # is it possible to apply conv_in and conv_out? -> yes, newer LoCon supports it (^^;)
-    UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"]
+    UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"] if diffusers.__version__ < "0.15.0" else ["Transformer2DModel"]
     UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
     TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
     LORA_PREFIX_UNET = "lora_unet"

From 9ec594b421be5698af81f9e415fa9dcad2756837 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Sun, 30 Apr 2023 20:47:02 +0300
Subject: [PATCH 4/9] tested from diffusers v0.10 to v0.16.1

---
 library/train_util.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/library/train_util.py b/library/train_util.py
index fd8ec3304..7e104adad 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -1849,13 +1849,13 @@ def forward_xformers(self, x, context=None, mask=None):
         out = self.to_out[1](out)
         return out
 
-    if diffusers.__version__ >= "0.12":
-        unet.enable_xformers_memory_efficient_attention(attention_op=xformers.ops.MemoryEfficientAttentionFlashAttentionOp)
+    print("diffusers version:", diffusers.__version__)
+    if diffusers.__version__ >= "0.11.0":
+        # let xformers decide which op is more suitable, see _dispatch_fwd in xformers.ops
+        unet.enable_xformers_memory_efficient_attention()
     elif hasattr(diffusers.models.attention, "CrossAttention") and \
         hasattr(diffusers.models.attention.CrossAttention, "forward"):
         diffusers.models.attention.CrossAttention.forward = forward_xformers
-    else:
-        print('Do nothing...')
 
 
 # endregion

From a3152795ad4edbde9c5ee02130fd9f1a4ad1932f Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Tue, 2 May 2023 02:34:52 +0300
Subject: [PATCH 5/9] sync random seed for multi-process (multi-GPU) training

---
 train_network.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/train_network.py b/train_network.py
index 5c4d5ad19..95e82ce92 100644
--- a/train_network.py
+++ b/train_network.py
@@ -61,7 +61,6 @@ def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_sche
     return logs
 
 
-
 def train(args):
     session_id = random.randint(0, 2**32)
     training_started_at = time.time()
@@ -73,9 +72,15 @@ def train(args):
     use_user_config = args.dataset_config is not None
 
     if args.seed is None:
-        args.seed = random.randint(0, 2**32)
-    set_seed(args.seed)
+        import psutil
+        ppid = os.getppid()
+        parent_process = psutil.Process(ppid)
+        if len(parent_process.children()) > 1:
+            args.seed = ppid
+        else:
+            args.seed = random.randint(0, 2**32)
 
+    set_seed(args.seed)
     tokenizer = train_util.load_tokenizer(args)
 
     # データセットを準備する
@@ -138,8 +143,8 @@ def train(args):
 
     # acceleratorを準備する
     print("prepare accelerator")
     accelerator, unwrap_model = train_util.prepare_accelerator(args)
-    is_main_process = accelerator.is_main_process
 
+    is_main_process = accelerator.is_main_process
     # mixed precisionに対応した型を用意しておき適宜castする
     weight_dtype, save_dtype = train_util.prepare_dtype(args)
@@ -733,7 +738,7 @@ def remove_model(old_ckpt_name):
     if is_main_process:
         ckpt_name = train_util.get_last_ckpt_name(args, "." + args.save_model_as)
         save_model(ckpt_name, network, global_step, num_train_epochs, force_sync_upload=True)
-
+
     print("model saved.")
 
 

From 03fee84ad82bcf82d2ae3a8a8c1e1a9a2f0ea759 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Tue, 2 May 2023 04:16:14 +0300
Subject: [PATCH 6/9] add mini-batch support for DDP

---
 train_network.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/train_network.py b/train_network.py
index 95e82ce92..e15d6cb4d 100644
--- a/train_network.py
+++ b/train_network.py
@@ -574,6 +574,9 @@ def remove_model(old_ckpt_name):
             print(f"removing old checkpoint: {old_ckpt_file}")
             os.remove(old_ckpt_file)
 
+    mini_batch_size = int(args.train_batch_size) / accelerator.num_processes
+    mini_batch_offset = int(accelerator.process_index) * mini_batch_size
+
     # training loop
     for epoch in range(num_train_epochs):
         if is_main_process:
@@ -586,6 +589,11 @@ def remove_model(old_ckpt_name):
 
         for step, batch in enumerate(train_dataloader):
             current_step.value = global_step
+            # cut each batch down to this process's mini batch
+            for k in batch.keys():
+                if batch[k] is None: continue
+                batch[k] = batch[k][int(mini_batch_offset):int(mini_batch_offset+mini_batch_size)]
+
             with accelerator.accumulate(network):
                 # on_step_start(text_encoder, unet)
 

From d3664f56c0d73005ccd334d499d396e475a43c6a Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Tue, 2 May 2023 12:11:05 +0300
Subject: [PATCH 7/9] remove the train_dataloader from accelerate prepare

---
 notes | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 notes

diff --git a/notes b/notes
new file mode 100644
index 000000000..5de4353dd
--- /dev/null
+++ b/notes
@@ -0,0 +1,37 @@
+cd /proj/suchka/image-generation/sd-scripts
+accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="8" --max_train_steps="1000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --max_grad_norm=2
+
+tensorboard --logdir /proj/suchka/image-generation/lora/jixian/log --bind_all
+
+
+
+================================================================================
+8 x 1
+running training / 学習開始
+  num train images * repeats / 学習画像の数×繰り返し回数: 3760
+  num reg images / 正則化画像の数: 0
+  num batches per epoch / 1epochのバッチ数: 470
+  num epochs / epoch数: 5
+  batch size per device / バッチサイズ: 4
+  gradient accumulation steps / 勾配を合計するステップ数 = 1
+  total optimization steps / 学習ステップ数: 2000
+steps:
+
+8 x 2
+running training / 学習開始
+  num train images * repeats / 学習画像の数×繰り返し回数: 3760
+  num reg images / 正則化画像の数: 0
+  num batches per epoch / 1epochのバッチ数: 235
+  num epochs / epoch数: 9
+  batch size per device / バッチサイズ: 8
+  gradient accumulation steps / 勾配を合計するステップ数 = 1
+  total optimization steps / 学習ステップ数: 2000
+
+
+accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="4" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
+
+
+accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="8" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
+
+accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="4" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
+

From 65ad5d9d32c1ca142d0665bee1e9e3804bb69624 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Tue, 2 May 2023 12:11:30 +0300
Subject: [PATCH 8/9] update data loader

---
 train_network.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/train_network.py b/train_network.py
index e15d6cb4d..28ab2c1c5 100644
--- a/train_network.py
+++ b/train_network.py
@@ -270,19 +270,19 @@ def train(args):
 
     # acceleratorがなんかよろしくやってくれるらしい
     if train_unet and train_text_encoder:
-        unet, text_encoder, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, text_encoder, network, optimizer, train_dataloader_, lr_scheduler = accelerator.prepare(
             unet, text_encoder, network, optimizer, train_dataloader, lr_scheduler
         )
     elif train_unet:
-        unet, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, network, optimizer, train_dataloader_, lr_scheduler = accelerator.prepare(
             unet, network, optimizer, train_dataloader, lr_scheduler
         )
     elif train_text_encoder:
-        text_encoder, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, network, optimizer, train_dataloader_, lr_scheduler = accelerator.prepare(
             text_encoder, network, optimizer, train_dataloader, lr_scheduler
         )
     else:
-        network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler)
+        network, optimizer, train_dataloader_, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler)
 
     unet.requires_grad_(False)
     unet.to(accelerator.device, dtype=weight_dtype)
@@ -646,7 +646,7 @@ def remove_model(old_ckpt_name):
                 loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
                 loss = loss.mean([1, 2, 3])
 
-                loss_weights = batch["loss_weights"]  # 各sampleごとのweight
+                loss_weights = batch["loss_weights"].to(accelerator.device)  # 各sampleごとのweight
                 loss = loss * loss_weights
 
                 if args.min_snr_gamma:

From 411bf9f75f80a5a99b9ea78d0586914645ae1d47 Mon Sep 17 00:00:00 2001
From: Jixian Wang
Date: Tue, 2 May 2023 17:14:52 +0300
Subject: [PATCH 9/9] remove notes

---
 notes | 37 -------------------------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 notes

diff --git a/notes b/notes
deleted file mode 100644
index 5de4353dd..000000000
--- a/notes
+++ /dev/null
@@ -1,37 +0,0 @@
-cd /proj/suchka/image-generation/sd-scripts
-accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="8" --max_train_steps="1000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --max_grad_norm=2
-
-tensorboard --logdir /proj/suchka/image-generation/lora/jixian/log --bind_all
-
-
-
-================================================================================
-8 x 1
-running training / 学習開始
-  num train images * repeats / 学習画像の数×繰り返し回数: 3760
-  num reg images / 正則化画像の数: 0
-  num batches per epoch / 1epochのバッチ数: 470
-  num epochs / epoch数: 5
-  batch size per device / バッチサイズ: 4
-  gradient accumulation steps / 勾配を合計するステップ数 = 1
-  total optimization steps / 学習ステップ数: 2000
-steps:
-
-8 x 2
-running training / 学習開始
-  num train images * repeats / 学習画像の数×繰り返し回数: 3760
-  num reg images / 正則化画像の数: 0
-  num batches per epoch / 1epochのバッチ数: 235
-  num epochs / epoch数: 9
-  batch size per device / バッチサイズ: 8
-  gradient accumulation steps / 勾配を合計するステップ数 = 1
-  total optimization steps / 学習ステップ数: 2000
-
-
-accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="4" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
-
-
-accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="8" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
-
-accelerate launch --num_cpu_threads_per_process=2 "train_network.py" --enable_bucket --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" --train_data_dir="/proj/suchka/image-generation/lora/jixian/img" --resolution=512,512 --output_dir="/proj/suchka/image-generation/lora/jixian/model" --logging_dir="/proj/suchka/image-generation/lora/jixian/log" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --text_encoder_lr=5e-7 --unet_lr=5e-6 --network_dim=128 --output_name="jixianwang_v3" --lr_scheduler_num_cycles="10" --learning_rate="5e-6" --lr_scheduler="constant" --train_batch_size="4" --max_train_steps="2000" --save_every_n_epochs="1" --mixed_precision="bf16" --save_precision="bf16" --caption_extension=".txt" --cache_latents --optimizer_type="AdamW8bit" --max_data_loader_n_workers="0" --bucket_reso_steps=64 --xformers --bucket_no_upscale --noise_offset=0.1 --seed=1234
-
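Note on PATCH 6: the added mini-batch logic simply has every DDP rank take one contiguous shard of the global batch yielded by the un-prepared dataloader. Below is a minimal, hedged sketch of that idea as standalone Python; the function name shard_batch, the dummy batch contents, and the hard-coded two ranks are illustrative assumptions and not part of the patches, which read the equivalent values from accelerator.process_index and accelerator.num_processes.

# Illustrative sketch only, not code from the patch series.
import torch

def shard_batch(batch: dict, process_index: int, num_processes: int) -> dict:
    """Return the contiguous slice of each batch entry owned by one process."""
    sharded = {}
    for key, value in batch.items():
        if value is None:  # skip unset entries, as the patched loop does
            sharded[key] = None
            continue
        per_process = len(value) // num_processes  # assumes the batch divides evenly
        start = process_index * per_process
        sharded[key] = value[start:start + per_process]
    return sharded

if __name__ == "__main__":
    global_batch = {"latents": torch.randn(8, 4, 64, 64), "loss_weights": torch.ones(8)}
    for rank in range(2):  # pretend to be each of 2 DDP ranks in turn
        local = shard_batch(global_batch, rank, 2)
        print(rank, tuple(local["latents"].shape))  # -> (4, 4, 64, 64) on both ranks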