From 61b7e793a3c4ecf899d79735dfa5fa3f8560303c Mon Sep 17 00:00:00 2001 From: the-database <25811902+the-database@users.noreply.github.com> Date: Tue, 7 Jan 2025 18:56:32 -0500 Subject: [PATCH] default to bf16 for rcan, moesr --- options/test/MoESR/MoESR2.yml | 2 +- options/test/RCAN/RCAN.yml | 2 +- ...SR2_OTF_bicubic_ms_ssim_l1_fromscratch.yml | 2 +- options/train/MoESR/MoESR2_OTF_finetune.yml | 2 +- .../train/MoESR/MoESR2_OTF_fromscratch.yml | 2 +- options/train/MoESR/MoESR2_finetune.yml | 2 +- options/train/MoESR/MoESR2_fromscratch.yml | 2 +- ...CAN_OTF_bicubic_ms_ssim_l1_fromscratch.yml | 2 +- options/train/RCAN/RCAN_OTF_finetune.yml | 2 +- options/train/RCAN/RCAN_OTF_fromscratch.yml | 2 +- options/train/RCAN/RCAN_finetune.yml | 2 +- options/train/RCAN/RCAN_fromscratch.yml | 2 +- test_scripts/test_precision.py | 16 ++++++++++++---- traiNNer/archs/arch_info.py | 2 ++ traiNNer/archs/artcnn_arch.py | 19 +++++++++++++++++++ 15 files changed, 45 insertions(+), 16 deletions(-) diff --git a/options/test/MoESR/MoESR2.yml b/options/test/MoESR/MoESR2.yml index d31128fd..0fefa17e 100644 --- a/options/test/MoESR/MoESR2.yml +++ b/options/test/MoESR/MoESR2.yml @@ -5,7 +5,7 @@ name: 4x_MoESR2 scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. num_gpu: auto diff --git a/options/test/RCAN/RCAN.yml b/options/test/RCAN/RCAN.yml index d0ceb4dd..795f4b25 100644 --- a/options/test/RCAN/RCAN.yml +++ b/options/test/RCAN/RCAN.yml @@ -5,7 +5,7 @@ name: 4x_RCAN scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. num_gpu: auto diff --git a/options/train/MoESR/MoESR2_OTF_bicubic_ms_ssim_l1_fromscratch.yml b/options/train/MoESR/MoESR2_OTF_bicubic_ms_ssim_l1_fromscratch.yml index 5b9d09e2..71e28fc6 100644 --- a/options/train/MoESR/MoESR2_OTF_bicubic_ms_ssim_l1_fromscratch.yml +++ b/options/train/MoESR/MoESR2_OTF_bicubic_ms_ssim_l1_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_MoESR2_OTF_bicubic_ms_ssim_l1 scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: false # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/MoESR/MoESR2_OTF_finetune.yml b/options/train/MoESR/MoESR2_OTF_finetune.yml index f1446b3a..3f1b21db 100644 --- a/options/train/MoESR/MoESR2_OTF_finetune.yml +++ b/options/train/MoESR/MoESR2_OTF_finetune.yml @@ -6,7 +6,7 @@ name: 4x_MoESR2_OTF_finetune scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: false # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/MoESR/MoESR2_OTF_fromscratch.yml b/options/train/MoESR/MoESR2_OTF_fromscratch.yml index 81b9bcf5..ea4c0af6 100644 --- a/options/train/MoESR/MoESR2_OTF_fromscratch.yml +++ b/options/train/MoESR/MoESR2_OTF_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_MoESR2_OTF_fromscratch scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: false # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/MoESR/MoESR2_finetune.yml b/options/train/MoESR/MoESR2_finetune.yml index f41aef03..3ae4ecd7 100644 --- a/options/train/MoESR/MoESR2_finetune.yml +++ b/options/train/MoESR/MoESR2_finetune.yml @@ -6,7 +6,7 @@ name: 4x_MoESR2 scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: false # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/MoESR/MoESR2_fromscratch.yml b/options/train/MoESR/MoESR2_fromscratch.yml index 9559b236..8a2d9143 100644 --- a/options/train/MoESR/MoESR2_fromscratch.yml +++ b/options/train/MoESR/MoESR2_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_MoESR2 scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: false # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/RCAN/RCAN_OTF_bicubic_ms_ssim_l1_fromscratch.yml b/options/train/RCAN/RCAN_OTF_bicubic_ms_ssim_l1_fromscratch.yml index 8e79edcf..7089ef45 100644 --- a/options/train/RCAN/RCAN_OTF_bicubic_ms_ssim_l1_fromscratch.yml +++ b/options/train/RCAN/RCAN_OTF_bicubic_ms_ssim_l1_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_RCAN_OTF_bicubic_ms_ssim_l1 scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: true # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/RCAN/RCAN_OTF_finetune.yml b/options/train/RCAN/RCAN_OTF_finetune.yml index 17354942..61ca84f5 100644 --- a/options/train/RCAN/RCAN_OTF_finetune.yml +++ b/options/train/RCAN/RCAN_OTF_finetune.yml @@ -6,7 +6,7 @@ name: 4x_RCAN_OTF_finetune scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: true # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/RCAN/RCAN_OTF_fromscratch.yml b/options/train/RCAN/RCAN_OTF_fromscratch.yml index ca28af68..5b9e04ea 100644 --- a/options/train/RCAN/RCAN_OTF_fromscratch.yml +++ b/options/train/RCAN/RCAN_OTF_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_RCAN_OTF_fromscratch scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: true # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/RCAN/RCAN_finetune.yml b/options/train/RCAN/RCAN_finetune.yml index 4fd0047b..1288030b 100644 --- a/options/train/RCAN/RCAN_finetune.yml +++ b/options/train/RCAN/RCAN_finetune.yml @@ -6,7 +6,7 @@ name: 4x_RCAN scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: true # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/options/train/RCAN/RCAN_fromscratch.yml b/options/train/RCAN/RCAN_fromscratch.yml index bc6be879..571fb696 100644 --- a/options/train/RCAN/RCAN_fromscratch.yml +++ b/options/train/RCAN/RCAN_fromscratch.yml @@ -6,7 +6,7 @@ name: 4x_RCAN scale: 4 # 1, 2, 3, 4, 8 use_amp: true # Speed up training and reduce VRAM usage. NVIDIA only. -amp_bf16: false # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. +amp_bf16: true # Use bf16 instead of fp16 for AMP, RTX 3000 series or newer only. Only recommended if fp16 doesn't work. use_channels_last: true # Enable channels last memory format while using AMP. Reduces VRAM and speeds up training for most architectures, but some architectures are slower with channels last. fast_matmul: false # Trade precision for performance. num_gpu: auto diff --git a/test_scripts/test_precision.py b/test_scripts/test_precision.py index e212bc53..79a90a94 100644 --- a/test_scripts/test_precision.py +++ b/test_scripts/test_precision.py @@ -52,12 +52,12 @@ def format_extra_params(extra_arch_params: dict[str, Any]) -> str: def compare_precision( net: nn.Module, input_tensor: Tensor, criterion: nn.Module ) -> tuple[float, float]: - with torch.no_grad(): + with torch.inference_mode(): fp32_output = net(input_tensor) fp16_loss = None try: - with autocast(dtype=torch.float16, device_type="cuda"): + with autocast(dtype=torch.float16, device_type="cuda"), torch.inference_mode(): fp16_output = net(input_tensor) fp16_loss = criterion(fp16_output.float(), fp32_output).item() except Exception as e: @@ -66,7 +66,7 @@ def compare_precision( bf16_loss = None try: - with autocast(dtype=torch.bfloat16, device_type="cuda"): + with autocast(dtype=torch.bfloat16, device_type="cuda"), torch.inference_mode(): bf16_output = net(input_tensor) bf16_loss = criterion(bf16_output.float(), fp32_output).item() except Exception as e: @@ -82,7 +82,15 @@ def compare_precision( label = f"{name} {format_extra_params(extra_arch_params)} {scale}x" try: - if "realplksr" not in name: + if name not in { + "rcan", + "esrgan", + "compact", + "span", + "dat_2", + "spanplus", + "realplksr", + }: continue net: nn.Module = arch(scale=scale, **extra_arch_params).eval().to("cuda") diff --git a/traiNNer/archs/arch_info.py b/traiNNer/archs/arch_info.py index c82ad02b..d3e34efb 100644 --- a/traiNNer/archs/arch_info.py +++ b/traiNNer/archs/arch_info.py @@ -26,6 +26,8 @@ "hit_sir", "hit_sng", "hit_srf", + "rcan", + "moesr2", "rgt_s", "rgt", "seemore_t", diff --git a/traiNNer/archs/artcnn_arch.py b/traiNNer/archs/artcnn_arch.py index f2c81067..95221b86 100644 --- a/traiNNer/archs/artcnn_arch.py +++ b/traiNNer/archs/artcnn_arch.py @@ -105,3 +105,22 @@ def artcnn_r8f64( kernel_size=kernel_size, act=act, ) + + +@ARCH_REGISTRY.register() +def artcnn_r8f48( + in_ch: int = 3, + scale: int = 4, + filters: int = 48, + n_block: int = 8, + kernel_size: int = 3, + act: type[nn.Module] = nn.ReLU, +) -> ArtCNN: + return ArtCNN( + scale=scale, + in_ch=in_ch, + n_block=n_block, + filters=filters, + kernel_size=kernel_size, + act=act, + )