From c1aa59a5da0b59dab98d73d1b4a343870e13e3a1 Mon Sep 17 00:00:00 2001
From: hooke007 <hooke007@qq.com>
Date: Thu, 11 May 2023 22:00:32 +0100
Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E4=B8=8E=E6=95=B4=E5=90=88?=
 =?UTF-8?q?=E4=B8=8A=E6=B8=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

核心：
mpv 缓存选项细化
input_easy 快捷键中的个别数值修改为浮点

脚本：
osc_plus 的局部渲染优化
thumbfast 修正预览精度，增加预览质量的选择
uosc 启用空闲界面；翻译词条增补修正；修复一个原因未知的崩溃

着色器：
nlmeans_hqx 取代 nlmeans_hq；增加 nlmeans_2x
---
 portable_config/input_easy.conf               |   20 +-
 portable_config/mpv.conf                      |    9 +-
 portable_config/script-opts.conf              |   45 +-
 portable_config/script-opts/console.conf      |    9 +-
 portable_config/script-opts/thumbfast.conf    |    8 +-
 portable_config/script-opts/uosc.conf         |    3 +
 portable_config/script-opts/uosc_lang.conf    |   70 +-
 portable_config/script-opts/ytdl_hook.conf    |    2 +-
 portable_config/scripts/osc_plus.lua          |   67 +-
 portable_config/scripts/thumbfast.lua         |  375 ++-
 .../scripts/uosc/elements/Logo.lua            |   61 +
 .../scripts/uosc/elements/Menu.lua            |    4 +-
 .../scripts/uosc/elements/Timeline.lua        |    2 +-
 .../scripts/uosc/elements/TopBar.lua          |    4 +
 portable_config/scripts/uosc/lib/ass.lua      |    2 +-
 portable_config/scripts/uosc/lib/lang.lua     |   72 +-
 portable_config/scripts/uosc/lib/menus.lua    |   10 +-
 portable_config/scripts/uosc/lib/text.lua     |    4 +-
 portable_config/scripts/uosc/lib/utils.lua    |    2 +-
 portable_config/scripts/uosc/main.lua         |  103 +-
 portable_config/shaders/guided.glsl           |   11 +-
 portable_config/shaders/guided_lgc.glsl       |    7 +-
 portable_config/shaders/guided_s.glsl         |    2 +-
 portable_config/shaders/nlmeans.glsl          |  680 +++---
 portable_config/shaders/nlmeans_2x.glsl       | 1247 ++++++++++
 portable_config/shaders/nlmeans_hq.glsl       | 2161 -----------------
 portable_config/shaders/nlmeans_hqx.glsl      | 1288 ++++++++++
 portable_config/shaders/nlmeans_lgc.glsl      |  671 ++---
 portable_config/shaders/nlmeans_lq.glsl       |  674 ++---
 portable_config/shaders/nlmeans_temporal.glsl |  701 +++---
 30 files changed, 4650 insertions(+), 3664 deletions(-)
 create mode 100644 portable_config/scripts/uosc/elements/Logo.lua
 create mode 100644 portable_config/shaders/nlmeans_2x.glsl
 delete mode 100644 portable_config/shaders/nlmeans_hq.glsl
 create mode 100644 portable_config/shaders/nlmeans_hqx.glsl

diff --git a/portable_config/input_easy.conf b/portable_config/input_easy.conf
index d5acb2df..9db0dc69 100644
--- a/portable_config/input_easy.conf
+++ b/portable_config/input_easy.conf
@@ -32,16 +32,16 @@
  o                    show-progress                # 临时显示时间码/进度条
  O                    no-osd cycle-values osd-level 3 1 # 常驻显示时间码
 
- 1                    add contrast -1              # 对比度 -
- 2                    add contrast  1              # 对比度 +
- 3                    add brightness -1            # 明度 -
- 4                    add brightness  1            # 明度 +
- 5                    add gamma -1                 # 伽马 -
- 6                    add gamma  1                 # 伽马 +
- 7                    add saturation -1            # 饱和度/纯度 -
- 8                    add saturation  1            # 饱和度/纯度 +
- 9                    add hue -1                   # 色相 -
- 0                    add hue  1                   # 色相 +
+ 1                    add contrast -0.5              # 对比度 -
+ 2                    add contrast  0.5              # 对比度 +
+ 3                    add brightness -0.5            # 明度 -
+ 4                    add brightness  0.5            # 明度 +
+ 5                    add gamma -0.5                 # 伽马 -
+ 6                    add gamma  0.5                 # 伽马 +
+ 7                    add saturation -0.5            # 饱和度/纯度 -
+ 8                    add saturation  0.5            # 饱和度/纯度 +
+ 9                    add hue -0.5                   # 色相 -
+ 0                    add hue  0.5                   # 色相 +
  Ctrl+BS              set contrast 0 ; set brightness 0 ; set gamma 0 ; set saturation 0 ; set hue 0 # 重置（视频）均衡器
 
 ##当输出旋转之后，垂直方向因跟随输出方向的变化而改变。输出旋转操作会启动缩略图重建进程
diff --git a/portable_config/mpv.conf b/portable_config/mpv.conf
index 3eceb65b..85699d72 100644
--- a/portable_config/mpv.conf
+++ b/portable_config/mpv.conf
@@ -83,11 +83,14 @@
 ##⇘⇘缓存相关
 
  demuxer-max-bytes      = 150MiB      # 播放网络视频时的向后缓存大小（KiB或MiB），默认 150MiB
- icc-cache-dir          =
-                                      # 指定目录存储和加载从ICC配置文件创建的3dlut缓存（例值 "~~/_cache/icc"），默认为空（内存）
+ icc-cache              = no          # 是否在本地存储ICC配置文件的3dlut缓存，默认 no
                                       # 可以用来加快加载速度，未压缩的LUT的大小取决于 --icc-3dlut-size
+ icc-cache-dir          =
+                                      # 指定ICC配置文件的3dlut缓存目录（例值 "~~/_cache/icc"），WIN平台默认为主设置路径
+
+ gpu-shader-cache       = no          # 是否在本地存储GLSL着色器的编译缓存，可以提高启动性能，默认 no
  gpu-shader-cache-dir   =
-                                      # 在此目录中存储和加载已编译的GLSL着色器缓存，可以提高启动性能（例值 "~~/_cache/shader"），默认为空（内存）
+                                      # 指定GLSL着色器的编译缓存目录（例值 "~~/_cache/shader"），WIN平台默认为主设置路径
  watch-later-directory  =
                                       # 稍后观看功能的缓存目录，其中的文件记录 --watch-later-options 指定的项。默认为空（实际路径为 "~~/watch_later"）
 
diff --git a/portable_config/script-opts.conf b/portable_config/script-opts.conf
index 2bcd88a0..7a5d4344 100644
--- a/portable_config/script-opts.conf
+++ b/portable_config/script-opts.conf
@@ -10,9 +10,12 @@
 # 控制台 #
 ##########
 
- script-opts-append = console-scale=1                # 字体缩放。当 --hidpi-window-scale=no 时，不再考量显示缩放率
- script-opts-append = console-font=                  # 指定控制台的字体
- script-opts-append = console-font_size=24           # 字体大小默认16。最终大小将与缩放率相乘
+ script-opts-append = console-scale=1                # 字体缩放（当 --hidpi-window-scale=no 时，不再考量DPI的影响），默认 1
+ script-opts-append = console-font=                  # 指定控制台的字体，默认为空
+ script-opts-append = console-font_size=16           # 字体大小，默认 16（最终大小将与前项之中的 scale 相乘）
+ script-opts-append = console-border_size=1          # 字体边框宽度，默认 1
+
+ script-opts-append = console-history_dedup=yes      # 移除历史记录中的重复条目，只保留最新的一项，默认 yes
 
 
 
@@ -149,7 +152,7 @@
 
  script-opts-append = ytdl_hook-try_ytdl_first=no     # 首选尝试用YTDL解析（默认 no）
 
- script-opts-append = ytdl_hook-exclude="%.avi$|%.flac$|%.flv$|%.mp3$|%.m3u$|%.m3u8$|%.m4a$|%.m4v$|%.mkv$|%.mp4$|%.ts$|%.VOB$|%.wav$|%.webm$|%.wmw$"
+ script-opts-append = ytdl_hook-exclude="%.avi$|%.flac$|%.flv$|%.mp3$|%.m3u$|%.m3u8$|%.m4a$|%.m4v$|%.mkv$|%.mp4$|%.ts$|%.VOB$|%.wav$|%.webm$|%.wmv$"
                                                       # 解析地址黑名单，格式解释见 https://mpv.io/manual/master/#options-exclude
                                                       # 推荐在 try_ytdl_first=yes 的情况下使用，可合理加速部分地址的解析
  script-opts-append = ytdl_hook-all_formats=no        # 默认 no https://mpv.io/manual/master/#options-all-formats
@@ -425,26 +428,28 @@
 ################
 
  script-opts-append = thumbfast-socket=
-                                                 # Socket 路径，留空即自动
+                                                          # Socket 路径，留空即自动
  script-opts-append = thumbfast-tnpath=
-                                                 # 缩略图缓存路径，留空即自动
- script-opts-append = thumbfast-max_height=300   # 缩略图的尺寸，以像素为单位，默认 300 300
+                                                          # 缩略图缓存路径，留空即自动
+ script-opts-append = thumbfast-max_height=300            # 缩略图的尺寸，以像素为单位，默认 300 300
  script-opts-append = thumbfast-max_width=300
 
- script-opts-append = thumbfast-overlay_id=42    # 勿改
+ script-opts-append = thumbfast-overlay_id=42             # 勿改
 
- script-opts-append = thumbfast-spawn_first=no   # 加载文件时就开始生成缩略图，默认 no
- script-opts-append = thumbfast-network=no       # 是否对流媒体启用，默认 no
- script-opts-append = thumbfast-audio=no         # 是否对音频文件启用，默认 no
- script-opts-append = thumbfast-hwdec=yes        # 是否使用硬解加速，默认 yes
- script-opts-append = thumbfast-direct_io=yes    # [仅Windows且LuaJIT] 使用Windows的原生API来写入pipe。默认 yes
+ script-opts-append = thumbfast-spawn_first=no            # 加载文件时就开始生成缩略图，默认 no
+ script-opts-append = thumbfast-quit_after_inactivity=0   # 是否退出超时未活动的缩略图进程（秒），默认 0 即禁用
+ script-opts-append = thumbfast-network=no                # 是否对流媒体启用，默认 no
+ script-opts-append = thumbfast-audio=no                  # 是否对音频文件启用，默认 no
+ script-opts-append = thumbfast-hwdec=yes                 # 是否使用硬解加速，默认 yes
+ script-opts-append = thumbfast-direct_io=yes             # [仅Windows且LuaJIT] 使用Windows的原生API来写入pipe。默认 yes
 
- script-opts-append = thumbfast-sw_threads=2     # 软解线程数，默认 2
- script-opts-append = thumbfast-binpath=mpv      # 自定义mpv路径，Mac使用bundle-app用户可选值为bundle，默认 mpv
- script-opts-append = thumbfast-min_duration=0   # 是否只对时长高于该值的视频启用（秒），默认 0 即禁用
- script-opts-append = thumbfast-precise=auto     # <默认auto|yes|no> 启用高精度预览，yes即始终精确帧，no即始终关键帧，默认即仅光标静止时为精确帧
- script-opts-append = thumbfast-frequency=0.1    # 解码频率（秒），默认 0.1
- script-opts-append = thumbfast-auto_run=yes     # 自动运行，默认 yes
+ script-opts-append = thumbfast-sw_threads=2              # 软解线程数，默认 2
+ script-opts-append = thumbfast-binpath=mpv               # 自定义mpv路径，Mac使用bundle-app用户可选值为bundle，默认 mpv
+ script-opts-append = thumbfast-min_duration=0            # 是否只对时长高于该值的视频启用（秒），默认 0 即禁用
+ script-opts-append = thumbfast-precise=0                 # <默认0|1|2> 预览精度。0 为自动（仅光标静止时为精确帧），1 为始终关键帧，2 为始终精确帧
+ script-opts-append = thumbfast-quality=1                 # <0|默认1|2> 预览质量。0 为自动，1 为不映射hdr，2 为减少锯齿并支持将一般hdr源映射到sdr
+ script-opts-append = thumbfast-frequency=0.1             # 解码频率（秒），默认 0.1
+ script-opts-append = thumbfast-auto_run=yes              # 自动运行，默认 yes
 
 
 
@@ -562,5 +567,7 @@
  script-opts-append = uosc-chapter_range_patterns=openings:オープニング;endings:エンディング
                                                                   # 补充额外的lua模式来识别简单章节范围的起始点（除 ads 外的所有章节）。示例即默认值
 
+ script-opts-append = uosc-idlescreen=yes                         # 空闲是否显示图标，默认 yes
+ script-opts-append = uosc-idlemsg=default                        # 空闲显示的文字信息，默认 default
  script-opts-append = uosc-idle_call_menu=0                       # 空闲自动弹出上下文菜单。设置为 <0.02-2> 之间的数为延迟触发的时间，否则禁用（默认 0）
  script-opts-append = custom_font=default                         # 自定义界面字体，默认值 default 即使用 mpv.conf 中 --osd-font 的字体
diff --git a/portable_config/script-opts/console.conf b/portable_config/script-opts/console.conf
index 55e9ffe4..99d1c1a3 100644
--- a/portable_config/script-opts/console.conf
+++ b/portable_config/script-opts/console.conf
@@ -3,9 +3,12 @@
 
 scale=1
 ##字体缩放率。当 --hidpi-window-scale=no 时，不再考量显示缩放率
-
 font=
 ##指定控制台的字体
+font_size=16
+##字体大小，默认16。最终大小将与 --scale 相乘
+border_size=1
+##字体边框宽度，默认1。
 
-font_size=24
-##字体大小默认16。最终大小将与 --scale 相乘
+history_dedup=yes
+##移除历史记录中的重复条目，只保留最新的一项，默认yes。
diff --git a/portable_config/script-opts/thumbfast.conf b/portable_config/script-opts/thumbfast.conf
index b363b6b9..73c73511 100644
--- a/portable_config/script-opts/thumbfast.conf
+++ b/portable_config/script-opts/thumbfast.conf
@@ -16,6 +16,8 @@ overlay_id=42
 
 # 加载文件时就开始生成缩略图，默认 no
 spawn_first=no
+# 是否退出超时未活动的缩略图进程（秒），默认 0 即禁用
+quit_after_inactivity=0
 # 是否对流媒体启用，默认 no
 network=no
 # 是否对音频文件启用，默认 no
@@ -31,8 +33,10 @@ sw_threads=2
 binpath=mpv
 # 是否只对时长高于该值的视频启用（秒），默认 0 即禁用
 min_duration=0
-# <默认auto|yes|no> 启用高精度预览，yes即始终精确帧，no即始终关键帧，默认即仅光标静止时为精确帧
-precise=auto
+# <默认0|1|2> 预览精度。0 为自动（仅光标静止时为精确帧），1 为始终关键帧，2 为始终精确帧
+precise=0
+# <0|默认1|2> 预览质量。0 为自动，1 为不映射hdr，2 为减少锯齿并支持将一般hdr源映射到sdr
+quality=1
 # 解码频率（秒），默认 0.1
 frequency=0.1
 # 自动运行，默认 yes
diff --git a/portable_config/script-opts/uosc.conf b/portable_config/script-opts/uosc.conf
index b8536db4..ee3660d9 100644
--- a/portable_config/script-opts/uosc.conf
+++ b/portable_config/script-opts/uosc.conf
@@ -148,6 +148,9 @@ adjust_osd_margins=no
 chapter_ranges=openings:30ABF964,endings:30ABF964,ads:C54E4E80
 chapter_range_patterns=openings:オープニング;endings:エンディング
 
+# 空闲是否显示logo（默认 yes），和自定义的文字信息（默认值 default 即不显示文字）
+idlescreen=yes
+idlemsg=default
 # 空闲自动弹出上下文菜单。设置为 <0.02-2> 之间的数为延迟触发的时间，否则禁用（默认）
 idle_call_menu=0
 # 自定义界面字体，默认值 default 即使用主设置中 --osd-font 的字体
diff --git a/portable_config/script-opts/uosc_lang.conf b/portable_config/script-opts/uosc_lang.conf
index 37f441f3..d153a5d1 100644
--- a/portable_config/script-opts/uosc_lang.conf
+++ b/portable_config/script-opts/uosc_lang.conf
@@ -1,34 +1,37 @@
 
 ## context menu default
-_load=load
-_file_browser=file browser
-_import_sid=import sid
-_navigation=navigation
-_playlist=playlist
-_edition_list=edition list
-_chapter_list=chapter list
-_vid_list=vid list
-_aid_list=aid list
-_sid_list=sid list
-_playlist_shuffle=playlist shuffle
-_ushot=uscreenshot
-_VIDEO=VIDEO
-_decoding_api=hwdec cycle
-_deband_toggle=deband toggle
-_deint_toggle=deint toggle
-_icc_toggle=icc auto toggle
-_corpts_toggle=correct pts toggle
-_TOOLS=TOOLS
-_stats_toggle=stats toggle
-_console_on=console on
-_border_toggle=border toggle
-_ontop_toggle=ontop toggle
-_audio_device=audio device
-_stream_quality=stream quality
-_show_file_dir=show file dir
-_show_config_dir=show config dir
-_stop=stop
-_quit=quit
+_cm_load=Load
+_cm_file_browser=File Browser
+_cm_import_sid=Import SID
+_cm_navigation=Navigation
+_cm_playlist=Playlist
+_cm_edition_list=Edition-list
+_cm_chapter_list=Chapter-list
+_cm_vid_list=VID-list
+_cm_aid_list=AID-list
+_cm_sid_list=SID-list
+_cm_playlist_shuffle=Playlist Shuffle
+_cm_ushot=uScreenshot
+_cm_video=VIDEO
+_cm_decoding_api=hwdec cycle
+_cm_deband_toggle=deband toggle
+_cm_deint_toggle=deint toggle
+_cm_icc_toggle=icc auto toggle
+_cm_corpts_toggle=correct pts toggle
+_cm_tools=TOOLS
+_cm_stats_toggle=stats toggle
+_cm_console_on=console on
+_cm_border_toggle=border toggle
+_cm_ontop_toggle=ontop toggle
+_cm_audio_device=audio device
+_cm_stream_quality=Stream Quality
+_cm_show_file_dir=show file dir
+_cm_show_config_dir=show config dir
+_cm_stop=Stop
+_cm_quit=Quit
+
+## no_border_title
+_border_title=No File
 
 ## track_loaders sub_menu
 _sid_menu=subtitle track
@@ -41,22 +44,25 @@ _aid_submenu_title=aid list
 _vid_submenu_title=vid list
 _playlist_submenu_title=playlist
 _chapter_list_submenu_title=chapter list
+_chapter_list_submenu_item_title=unnamed chapter 
 _edition_list_submenu_title=edition list
 _edition_list_submenu_item_title=edition
 _stream_quality_submenu_title=stream quality list
 _audio_device_submenu_title=audio device list
+_audio_device_submenu_item_title=Autoselect device
 
 _submenu_import=import
 _submenu_load_file=load file
 _submenu_id_disabled=disabled
+_submenu_id_hint=channel(s)
 _submenu_id_forced=forced
 _submenu_id_default=default
 _submenu_id_external=external
 _submenu_id_title=track 
+_submenu_file_browser_item_hint=driver list
+_submenu_file_browser_item_hint2=parent dir
+_submenu_file_browser_item2_hint=driver
 _submenu_file_browser_title=driver list
-_submenu_file_browser_item_title=parent dir
-_submenu_file_browser_item2_title=driver
-_submenu_file_browser_item3_title=driver list
 
 ## built-in_shortcut
 _button01=MENU
diff --git a/portable_config/script-opts/ytdl_hook.conf b/portable_config/script-opts/ytdl_hook.conf
index 5893731e..9b60db3e 100644
--- a/portable_config/script-opts/ytdl_hook.conf
+++ b/portable_config/script-opts/ytdl_hook.conf
@@ -6,7 +6,7 @@ try_ytdl_first=no
 
 ##解析地址黑名单，格式解释见 https://mpv.io/manual/master/#options-exclude
 ##推荐在 try_ytdl_first=yes 的情况下使用，可合理加速网络地址的解析
-exclude="%.avi$|%.flac$|%.flv$|%.mp3$|%.m3u$|%.m3u8$|%.m4a$|%.m4v$|%.mkv$|%.mp4$|%.ts$|%.VOB$|%.wav$|%.webm$|%.wmw$"
+exclude="%.avi$|%.flac$|%.flv$|%.mp3$|%.m3u$|%.m3u8$|%.m4a$|%.m4v$|%.mkv$|%.mp4$|%.ts$|%.VOB$|%.wav$|%.webm$|%.wmv$"
 
 ##https://mpv.io/manual/master/#options-all-formats
 all_formats=no
diff --git a/portable_config/scripts/osc_plus.lua b/portable_config/scripts/osc_plus.lua
index ad9f0182..9108658b 100644
--- a/portable_config/scripts/osc_plus.lua
+++ b/portable_config/scripts/osc_plus.lua
@@ -1,10 +1,10 @@
 --[[
 SOURCE_ https://github.com/mpv-player/mpv/blob/master/player/lua/osc.lua
-COMMIT_ 292a5868cb60c481ae9eaed7d21e67dcff41938f
+COMMIT_ b7ffe0d16eec8153d9609382997baaf6a29e5e4f
 文档_ https://github.com/hooke007/MPV_lazy/discussions/18
 
 改进版本的OSC，不兼容其它OSC类脚本（实现全部功能需搭配 新缩略图引擎 thumbfast ）
-（可选）mpv.conf的前置条件 --osc=no （否则个别功能不可用，例如 启动时显示OSC）
+（可选）mpv.conf的前置条件 --osc=no （否则个别功能可能不可用）
 
 示例在 input.conf 中写入：
 SHIFT+DEL   script-binding osc_plus/visibility   # 切换 osc_plus 的可见性
@@ -2408,8 +2408,8 @@ function osc_init()
             -- mouse move events may pile up during seeking and may still get
             -- sent when the user is done seeking, so we need to throw away
             -- identical seeks
-            thumbfast.pause = false --暂停渲染缩略图
-            mp.commandv("script-message-to", "thumbfast", "clear")
+            thumbfast.pause = false -- 暂停渲染缩略图
+            -- mp.commandv("script-message-to", "thumbfast", "clear") -- 会有高几率冻结
             local seekto = get_slider_value(element)
             if (element.state.lastseek == nil) or
                 (not (element.state.lastseek == seekto)) then
@@ -2639,6 +2639,12 @@ end
 
 function hide_osc()
     msg.trace("hide_osc")
+
+    -- 关联 thumbfast.lua
+    if thumbfast.width ~= 0 or thumbfast.height ~= 0 then
+        mp.commandv("script-message-to", "thumbfast", "clear")
+    end
+
     if not state.enabled then
         -- typically hide happens at render() from tick(), but now tick() is
         -- no-op and won't render again to remove the osc, so do that manually.
@@ -2800,14 +2806,14 @@ function render()
 
     --mouse show/hide area
     for k,cords in pairs(osc_param.areas["showhide"]) do
-        set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "showhide")
+        set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "showhide_osc_plus")
     end
     if osc_param.areas["showhide_wc"] then
         for k,cords in pairs(osc_param.areas["showhide_wc"]) do
-            set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "showhide_wc")
+            set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "showhide_wc_osc_plus")
         end
     else
-        set_virt_mouse_area(0, 0, 0, 0, "showhide_wc")
+        set_virt_mouse_area(0, 0, 0, 0, "showhide_wc_osc_plus")
     end
     do_enable_keybindings()
 
@@ -2816,13 +2822,13 @@ function render()
 
     for _,cords in ipairs(osc_param.areas["input"]) do
         if state.osc_visible then -- activate only when OSC is actually visible
-            set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "input")
+            set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "input_osc_plus")
         end
         if state.osc_visible ~= state.input_enabled then
             if state.osc_visible then
-                mp.enable_key_bindings("input")
+                mp.enable_key_bindings("input_osc_plus")
             else
-                mp.disable_key_bindings("input")
+                mp.disable_key_bindings("input_osc_plus")
             end
             state.input_enabled = state.osc_visible
         end
@@ -2835,13 +2841,13 @@ function render()
     if osc_param.areas["window-controls"] then
         for _,cords in ipairs(osc_param.areas["window-controls"]) do
             if state.osc_visible then -- activate only when OSC is actually visible
-                set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "window-controls")
+                set_virt_mouse_area(cords.x1, cords.y1, cords.x2, cords.y2, "window-controls_osc_plus")
             end
             if state.osc_visible ~= state.windowcontrols_buttons then
                 if state.osc_visible then
-                    mp.enable_key_bindings("window-controls")
+                    mp.enable_key_bindings("window-controls_osc_plus")
                 else
-                    mp.disable_key_bindings("window-controls")
+                    mp.disable_key_bindings("window-controls_osc_plus")
                 end
                 state.windowcontrols_buttons = state.osc_visible
             end
@@ -3018,6 +3024,9 @@ function tick()
         -- render idle message
         msg.trace("idle message")
         local _, _, display_aspect = mp.get_osd_size()
+        if display_aspect == 0 then
+            return
+        end
         local display_h = 360
         local display_w = display_h * display_aspect
         -- logo is rendered at 2^(6-1) = 32 times resolution with size 1800x1800
@@ -3050,8 +3059,8 @@ function tick()
         set_osd(display_w, display_h, ass.text)
 
         if state.showhide_enabled then
-            mp.disable_key_bindings("showhide")
-            mp.disable_key_bindings("showhide_wc")
+            mp.disable_key_bindings("showhide_osc_plus")
+            mp.disable_key_bindings("showhide_wc_osc_plus")
             state.showhide_enabled = false
         end
 
@@ -3086,8 +3095,8 @@ end
 function do_enable_keybindings()
     if state.enabled then
         if not state.showhide_enabled then
-            mp.enable_key_bindings("showhide", "allow-vo-dragging+allow-hide-cursor")
-            mp.enable_key_bindings("showhide_wc", "allow-vo-dragging+allow-hide-cursor")
+            mp.enable_key_bindings("showhide_osc_plus", "allow-vo-dragging+allow-hide-cursor")
+            mp.enable_key_bindings("showhide_wc_osc_plus", "allow-vo-dragging+allow-hide-cursor")
         end
         state.showhide_enabled = true
     end
@@ -3100,8 +3109,8 @@ function enable_osc(enable)
     else
         hide_osc() -- acts immediately when state.enabled == false
         if state.showhide_enabled then
-            mp.disable_key_bindings("showhide")
-            mp.disable_key_bindings("showhide_wc")
+            mp.disable_key_bindings("showhide_osc_plus")
+            mp.disable_key_bindings("showhide_wc_osc_plus")
         end
         state.showhide_enabled = false
     end
@@ -3204,11 +3213,11 @@ end)
 mp.set_key_bindings({
     {"mouse_move",              function(e) process_event("mouse_move", nil) end},
     {"mouse_leave",             mouse_leave},
-}, "showhide", "force")
+}, "showhide_osc_plus", "force")
 mp.set_key_bindings({
     {"mouse_move",              function(e) process_event("mouse_move", nil) end},
     {"mouse_leave",             mouse_leave},
-}, "showhide_wc", "force")
+}, "showhide_wc_osc_plus", "force")
 do_enable_keybindings()
 
 --mouse input bindings
@@ -3227,14 +3236,14 @@ mp.set_key_bindings({
     {"mbtn_left_dbl",       "ignore"},
     {"shift+mbtn_left_dbl", "ignore"},
     {"mbtn_right_dbl",      function(e) process_event("mbtn_right_dbl", "press") end}, -- 右键双击检查
-}, "input", "force")
-mp.enable_key_bindings("input")
+}, "input_osc_plus", "force")
+mp.enable_key_bindings("input_osc_plus")
 
 mp.set_key_bindings({
     {"mbtn_left",           function(e) process_event("mbtn_left", "up") end,
                             function(e) process_event("mbtn_left", "down")  end},
-}, "window-controls", "force")
-mp.enable_key_bindings("window-controls")
+}, "window-controls_osc_plus", "force")
+mp.enable_key_bindings("window-controls_osc_plus")
 
 function get_hidetimeout()
     if user_opts.visibility == "always" then
@@ -3290,8 +3299,8 @@ function visibility_mode(mode, no_osd)
     -- Reset the input state on a mode change. The input state will be
     -- recalculated on the next render cycle, except in 'never' mode where it
     -- will just stay disabled.
-    mp.disable_key_bindings("input")
-    mp.disable_key_bindings("window-controls")
+    mp.disable_key_bindings("input_osc_plus")
+    mp.disable_key_bindings("window-controls_osc_plus")
     state.input_enabled = false
 
     update_margins()
@@ -3386,5 +3395,5 @@ mp.register_script_message("thumbfast-info", function(json)
     end
 end)
 
-set_virt_mouse_area(0, 0, 0, 0, "input")
-set_virt_mouse_area(0, 0, 0, 0, "window-controls")
+set_virt_mouse_area(0, 0, 0, 0, "input_osc_plus")
+set_virt_mouse_area(0, 0, 0, 0, "window-controls_osc_plus")
diff --git a/portable_config/scripts/thumbfast.lua b/portable_config/scripts/thumbfast.lua
index 77a86f2f..47ace55c 100644
--- a/portable_config/scripts/thumbfast.lua
+++ b/portable_config/scripts/thumbfast.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/po5/thumbfast/blob/master/thumbfast.lua
-COMMIT_ ddc61957ce38b62283c5d7ef99a7252c7499cc8b
+COMMIT_ 8aa6faf10adad899e05cc9b850cde904d37515be
 
 适配多个OSC类脚本的新缩略图引擎
 
@@ -13,26 +13,28 @@ COMMIT_ ddc61957ce38b62283c5d7ef99a7252c7499cc8b
 
 local options = {
 
-    socket = "",           -- Socket path (leave empty for auto)
-    tnpath = "",           -- 缩略图缓存路径（确保目录真实存在），留空即自动
+    socket = "",                 -- Socket path (leave empty for auto)
+    tnpath = "",                 -- Thumbnail cache path (the directory must already exist); leave empty for auto
 
-    max_height = 300,      -- Maximum thumbnail size in pixels (scaled down to fit) Values are scaled when hidpi is enabled
+    max_height = 300,            -- Maximum thumbnail size in pixels (scaled down to fit) Values are scaled when hidpi is enabled
     max_width = 300,
 
-    overlay_id = 42,       -- Overlay id
+    overlay_id = 42,             -- Overlay id
 
-    spawn_first = false,   -- Spawn thumbnailer on file load for faster initial thumbnails
-    network = false,       -- Enable on network playback
-    audio = false,         -- Enable on audio playback
-    hwdec = true,          -- 启用硬解加速
-    direct_io = true,      -- Windows only: use native Windows API to write to pipe (requires LuaJIT)
+    spawn_first = false,         -- Spawn thumbnailer on file load for faster initial thumbnails
+    quit_after_inactivity = 0,   -- Close thumbnailer process after an inactivity period in seconds, 0 to disable
+    network = false,             -- Enable on network playback
+    audio = false,               -- Enable on audio playback
+    hwdec = true,                -- Enable hardware decoding acceleration
+    direct_io = true,            -- Windows only: use native Windows API to write to pipe (requires LuaJIT)
 
-    sw_threads = 2,        -- 软解线程
-    binpath = "mpv",       -- 自定义mpv路径
-    min_duration = 0,      -- 对短视频关闭预览（秒）
-    precise = "auto",      -- 预览精度
-    frequency = 0.1,       -- 解码频率（秒）
-    auto_run = true,       -- 自动运行
+    sw_threads = 2,              -- Software decoding thread count
+    binpath = "mpv",             -- Custom mpv path; the value "bundle" is special-cased on macOS
+    min_duration = 0,            -- Disable previews for videos no longer than this many seconds; 0 disables the check
+    precise = 0,                 -- Preview precision: 0 = auto (exact frames only while the cursor rests), 1 = always keyframes, 2 = always exact frames
+    quality = 1,                 -- Preview quality: 0 = auto, 1 = no HDR mapping, 2 = less aliasing plus HDR-to-SDR mapping
+    frequency = 0.1,             -- Decode frequency (seconds)
+    auto_run = true,             -- Run automatically
 
 }
 
@@ -40,6 +42,8 @@ mp.utils = require "mp.utils"
 mp.options = require "mp.options"
 mp.options.read_options(options)
 
+local properties = {}
+
 function subprocess(args, async, callback)
     callback = callback or function() end
 
@@ -102,10 +106,15 @@ if options.direct_io then
     end
 end
 
+local file = nil
+local file_bytes = 0
 local spawned = false
-local network = false
 local disabled = false
 local spawn_waiting = false
+local spawn_working = false
+local script_written = false
+
+local dirty = false
 
 local x = nil
 local y = nil
@@ -130,26 +139,8 @@ local has_vid = 0
 
 local file_timer = nil
 local file_check_period = 1/60
-local first_file = false
-
-local function debounce(func, wait)
-    func = type(func) == "function" and func or function() end
-    wait = type(wait) == "number" and wait / 1000 or 0
 
-    local timer = nil
-    local timer_end = function ()
-        timer:kill()
-        timer = nil
-        func()
-    end
-
-    return function ()
-        if timer then
-            timer:kill()
-        end
-        timer = mp.add_timeout(wait, timer_end)
-    end
-end
+local mac_bundle_mode = false
 
 local client_script = [=[
 #!/usr/bin/env bash
@@ -179,23 +170,22 @@ local function get_os()
     raw_os_name = (raw_os_name):lower()
 
     local os_patterns = {
-        ["windows"] = "Windows",
-
-        ["linux"]   = "Linux",
+        ["windows"] = "windows",
+        ["linux"]   = "linux",
 
-        ["osx"]     = "Mac",
-        ["mac"]     = "Mac",
-        ["darwin"]  = "Mac",
+        ["osx"]     = "darwin",
+        ["mac"]     = "darwin",
+        ["darwin"]  = "darwin",
 
-        ["^mingw"]  = "Windows",
-        ["^cygwin"] = "Windows",
+        ["^mingw"]  = "windows",
+        ["^cygwin"] = "windows",
 
-        ["bsd$"]    = "Mac",
-        ["sunos"]   = "Mac"
+        ["bsd$"]    = "darwin",
+        ["sunos"]   = "darwin"
     }
 
-    -- Default to linux
-    local str_os_name = "Linux"
+    -- 默认为WIN
+    local str_os_name = "windows"
 
     for pattern, name in pairs(os_patterns) do
         if raw_os_name:match(pattern) then
@@ -207,10 +197,10 @@ local function get_os()
     return str_os_name
 end
 
-local os_name = get_os()
+local os_name = mp.get_property("platform") or get_os()
 
 if options.socket == "" then
-    if os_name == "Windows" then
+    if os_name == "windows" then
         options.socket = "thumbfast"
     else
         options.socket = "/tmp/thumbfast"
@@ -218,7 +208,7 @@ if options.socket == "" then
 end
 
 if options.tnpath == "" then
-    if os_name == "Windows" then
+    if os_name == "windows" then
         options.tnpath = os.getenv("TEMP").."\\thumbfast.out"
     else
         options.tnpath = "/tmp/thumbfast.out"
@@ -231,7 +221,7 @@ options.socket = options.socket .. unique
 options.tnpath = options.tnpath .. unique
 
 if options.direct_io then
-    if os_name == "Windows" then
+    if os_name == "windows" then
         winapi.socket_wc = winapi.MultiByteToWideChar("\\\\.\\pipe\\" .. options.socket)
     end
 
@@ -242,17 +232,18 @@ end
 
 local mpv_path = options.binpath
 
-if os_name == "Mac" and options.binpath == "bundle" and unique then
+if os_name == "darwin" and options.binpath == "bundle" and unique then
     mpv_path = string.gsub(subprocess({"ps", "-o", "comm=", "-p", tostring(unique)}).stdout, "[\n\r]", "")
     mpv_path = string.gsub(mpv_path, "/mpv%-bundle$", "/mpv")
+    mac_bundle_mode = true
 end
 
 local function calc_dimensions()
-    local width = mp.get_property_number("video-params/w")
-    local height = mp.get_property_number("video-params/h")
+    local width = properties["video-params"] and properties["video-params"]["w"]
+    local height = properties["video-params"] and properties["video-params"]["h"]
     if not width or not height then return end
 
-    local scale = mp.get_property_number("display-hidpi-scale", 1)
+    local scale = properties["display-hidpi-scale"] or 1
 
     if width / height > options.max_width / options.max_height then
         effective_w = math.floor(options.max_width * scale + 0.5)
@@ -268,15 +259,13 @@ local info_timer = nil
 local auto_run = options.auto_run
 
 local function info(w, h)
-    local display_w, display_h = w, h
+    local short_video = mp.get_property_number("duration", 0) <= options.min_duration
+    local image = properties["current-tracks"] and properties["current-tracks"]["video"] and properties["current-tracks"]["video"]["image"]
+    local albumart = image and properties["current-tracks"]["video"]["albumart"]
 
-    network = mp.get_property_bool("demuxer-via-network", false)
-    local image = mp.get_property_native("current-tracks/video/image", true)
-    local albumart = image and mp.get_property_native("current-tracks/video/albumart", false)
-    local short_video = mp.get_property_native("duration", 0) <= options.min_duration
     disabled = (w or 0) == 0 or (h or 0) == 0 or
         has_vid == 0 or
-        (network and not options.network) or
+        (properties["demuxer-via-network"] and not options.network) or
         (albumart and not options.audio) or
         (image and not albumart) or
         (short_video and options.min_duration > 0)
@@ -292,71 +281,140 @@ local function info(w, h)
         info_timer = mp.add_timeout(0.05, function() info(w, h) end)
     end
 
-    local json, err = mp.utils.format_json({width=display_w, height=display_h, disabled=disabled, available=true, socket=options.socket, tnpath=options.tnpath, overlay_id=options.overlay_id})
-    mp.commandv("script-message", "thumbfast-info", json)
+    local json, err = mp.utils.format_json({width=w, height=h, disabled=disabled, available=true, socket=options.socket, tnpath=options.tnpath, overlay_id=options.overlay_id})
+    mp.command_native_async({"script-message", "thumbfast-info", json}, function() end)
 end
 
 local function remove_thumbnail_files()
+    if file then
+        file:close()
+        file = nil
+        file_bytes = 0
+    end
     os.remove(options.tnpath)
     os.remove(options.tnpath..".bgra")
 end
 
+local activity_timer
+
+local scale_sw = "fast-bilinear"
+local vf_str
+
+-- quality 0 means "auto": derive it from precise first, then from sw_threads.
+if options.quality == 0 then
+    local precise, threads = options.precise, options.sw_threads
+    if precise == 2 then
+        options.quality = 2
+    elseif precise == 0 or precise == 1 then
+        options.quality = 1
+    end
+    -- the thread count is checked last, so it overrides the precise-based pick
+    if threads >= 4 then
+        options.quality = 2
+    elseif threads == 1 then
+        options.quality = 1
+    end
+end
+
+-- only quality 2 replaces the scaler name previously stored in scale_sw
+if options.quality == 2 then scale_sw = "bicublin" end
+
+-- Builds the --vf chain for the subprocess; stores it in vf_str and returns it.
+local function quality()
+    local vf_str_suffix = "format=fmt=bgra"
+    local vf_str_pre = "scale=w="..effective_w..":h="..effective_h..":flags=bicublin,"
+    if options.quality ~= 2 then
+        -- 1 or any unexpected value: fast scaler, so the result is never nil
+        vf_str = "scale=w="..effective_w..":h="..effective_h..":flags=fast_bilinear,"..vf_str_suffix
+    elseif mp.get_property_number("video-params/sig-peak", 1) > 1 then
+        -- sig-peak above 1: insert the zscale/tonemap stage before the bgra conversion
+        vf_str = vf_str_pre.."format=fmt=gbrapf32,zscale=transfer=linear,tonemap=tonemap=mobius:desat=8.0,zscale=transfer=709,"..vf_str_suffix
+    else
+        vf_str = vf_str_pre..vf_str_suffix
+    end
+    return vf_str
+end
+
 local function spawn(time)
     if disabled then return end
 
-    local path = mp.get_property("path")
+    local path = properties["path"]
     if path == nil then return end
 
-    local open_filename = mp.get_property("stream-open-filename")
-    local ytdl = open_filename and network and path ~= open_filename
+    if options.quit_after_inactivity > 0 then
+        if show_thumbnail or activity_timer:is_enabled() then
+            activity_timer:kill()
+        end
+        activity_timer:resume()
+    end
+
+    local open_filename = properties["stream-open-filename"]
+    local ytdl = open_filename and properties["demuxer-via-network"] and path ~= open_filename
     if ytdl then
         path = open_filename
     end
 
     remove_thumbnail_files()
 
-    local vid = mp.get_property_number("vid")
+    local vid = properties["vid"]
     has_vid = vid or 0
 
     local args = {
-        mpv_path, path, "--config=no", "--terminal=no", "--msg-level=all=no", "--idle=yes", "--keep-open=always","--pause=yes", "--ao=null", "--vo=null",
+        mpv_path, "--config=no", "--terminal=no", "--msg-level=all=no", "--idle=yes", "--keep-open=always","--pause=yes", "--ao=null", "--vo=null",
         "--load-auto-profiles=no", "--load-osd-console=no", "--load-stats-overlay=no", "--osc=no",
         "--vd-lavc-skiploopfilter=all", "--vd-lavc-skipidct=all", "--vd-lavc-software-fallback=1", "--vd-lavc-fast", "--vd-lavc-threads="..options.sw_threads, "--hwdec="..(options.hwdec and "auto" or "no"),
-        "--edition="..(mp.get_property_number("edition") or "auto"), "--vid="..(vid or "auto"), "--sub=no", "--audio=no", "--sub-auto=no", "--audio-file-auto=no",
+        "--edition="..(properties["edition"] or "auto"), "--vid="..(vid or "auto"), "--sub=no", "--audio=no", "--sub-auto=no", "--audio-file-auto=no",
         "--start="..time,
         "--ytdl-format=worst", "--demuxer-readahead-secs=0", "--demuxer-max-bytes=128KiB",
         "--gpu-dumb-mode=yes", "--tone-mapping=clip", "--hdr-compute-peak=no",
-        "--sws-scaler=fast-bilinear", "--sws-fast=yes", "--sws-allow-zimg=no",
+        "--sws-allow-zimg=no", "--sws-fast=yes", "--sws-scaler="..scale_sw,
         "--audio-pitch-correction=no",
-        "--vf=".."scale=w="..effective_w..":h="..effective_h..":flags=fast_bilinear,format=bgra",
+        "--vf="..quality(),
         "--ovc=rawvideo", "--of=image2", "--ofopts=update=1", "--ocopy-metadata=no", "--o="..options.tnpath
     }
 
-    if os_name == "Windows" then
+    if mac_bundle_mode then
+        table.insert(args, "--macos-app-activation-policy=accessory")
+    end
+
+    if os_name == "windows" then
         table.insert(args, "--input-ipc-server="..options.socket)
-    else
+    elseif not script_written then
         local client_script_path = options.socket..".run"
-        local file = io.open(client_script_path, "w+")
-        if file == nil then
+        local script = io.open(client_script_path, "w+")
+        if script == nil then
             mp.msg.error("client script write failed")
             return
         else
-            file:write(string.format(client_script, options.socket))
-            file:close()
+            script_written = true
+            script:write(string.format(client_script, options.socket))
+            script:close()
             subprocess({"chmod", "+x", client_script_path}, true)
-            table.insert(args, "--script="..client_script_path)
+            table.insert(args, "--scripts="..client_script_path)
         end
+    else
+        local client_script_path = options.socket..".run"
+        table.insert(args, "--scripts="..client_script_path)
     end
 
+    table.insert(args, path)
+
     spawned = true
     spawn_waiting = true
 
     subprocess(args, true,
         function(success, result)
-            if spawn_waiting and (success == false or result.status ~= 0) then
+            if spawn_waiting and (success == false or (result.status ~= 0 and result.status ~= -2)) then
+                spawned = false
+                spawn_waiting = false
                 mp.msg.error("mpv subprocess create failed")
+                if not spawn_working then -- notify users of required configuration
+                    mp.commandv("show-text", "thumbfast 子进程创建失败！", 5)
+                end
+            elseif success == true and result.status == 0 then
+                spawn_working = true
+                spawn_waiting = false
             end
-            spawned = false
         end
     )
 end
@@ -376,27 +434,34 @@ local function run(command)
         return
     end
 
-    local file = nil
-    if os_name == "Windows" then
-        file = io.open("\\\\.\\pipe\\"..options.socket, "r+")
-    else
+    local command_n = command.."\n"
+
+    if os_name == "windows" then
+        if file and file_bytes + #command_n >= 4096 then
+            file:close()
+            file = nil
+            file_bytes = 0
+        end
+        if not file then
+            file = io.open("\\\\.\\pipe\\"..options.socket, "r+b")
+        end
+    elseif not file then
         file = io.open(options.socket, "r+")
     end
-    if file ~= nil then
-        file:seek("end")
-        file:write(command.."\n")
-        file:close()
+    if file then
+        file_bytes = file:seek("end")
+        file:write(command_n)
+        file:flush()
     end
 end
 
 local function draw(w, h, script)
     if not w or not show_thumbnail then return end
-    local display_w, display_h = w, h
 
     if x ~= nil then
-        mp.command_native({name = "overlay-add", id=options.overlay_id, x=x, y=y, file=options.tnpath..".bgra", offset=0, fmt="bgra", w=display_w, h=display_h, stride=(4*display_w)})
+        mp.command_native_async({name = "overlay-add", id=options.overlay_id, x=x, y=y, file=options.tnpath..".bgra", offset=0, fmt="bgra", w=w, h=h, stride=(4*w)}, function() end)
     elseif script then
-        local json, err = mp.utils.format_json({width=display_w, height=display_h, x=x, y=y, socket=options.socket, tnpath=options.tnpath, overlay_id=options.overlay_id})
+        local json, err = mp.utils.format_json({width=w, height=h, x=x, y=y, socket=options.socket, tnpath=options.tnpath, overlay_id=options.overlay_id})
         mp.commandv("script-message-to", script, "thumbfast-render", json)
     end
 end
@@ -426,7 +491,7 @@ local function real_res(req_w, req_h, filesize)
 end
 
 local function move_file(from, to)
-    if os_name == "Windows" then
+    if os_name == "windows" then
         os.remove(to)
     end
     -- move the file because it can get overwritten while overlay-add is reading it, and crash the player
@@ -435,9 +500,9 @@ end
 
 local function seek(fast)
     if last_seek_time then
-        if options.precise == true then run("async seek " .. last_seek_time .. " absolute+exact")
-        elseif options.precise == false then run("async seek " .. last_seek_time .. " absolute+keyframes")
-        elseif options.precise == "auto" then
+        if options.precise == 2 then run("async seek " .. last_seek_time .. " absolute+exact")
+        elseif options.precise == 1 then run("async seek " .. last_seek_time .. " absolute+keyframes")
+        elseif options.precise == 0 then
             run("async seek " .. last_seek_time .. (fast and " absolute+keyframes" or " absolute+exact"))
         end
     end
@@ -479,10 +544,6 @@ local function check_new_thumb()
     local finfo = mp.utils.file_info(tmp)
     if not finfo then return false end
     spawn_waiting = false
-    if first_file then
-        request_seek()
-        first_file = false
-    end
     local w, h = real_res(effective_w, effective_h, finfo.size)
     if w then -- only accept valid thumbnails
         move_file(tmp, options.tnpath..".bgra")
@@ -492,6 +553,9 @@ local function check_new_thumb()
             last_real_w, last_real_h = real_w, real_h
             info(real_w, real_h)
         end
+        if not show_thumbnail then
+            file_timer:kill()
+        end
         return true
     end
     return false
@@ -504,6 +568,38 @@ file_timer = mp.add_periodic_timer(file_check_period, function()
 end)
 file_timer:kill()
 
+local function clear() -- Stops both timers, resets the hover/seek state and removes the overlay.
+    file_timer:kill()
+    seek_timer:kill()
+    if options.quit_after_inactivity > 0 then
+        if show_thumbnail or activity_timer:is_enabled() then
+            activity_timer:kill()
+        end
+        activity_timer:resume() -- after a kill, resume starts the countdown from the beginning
+    end
+    last_seek_time = nil
+    show_thumbnail = false
+    last_x = nil
+    last_y = nil
+    if script_name then return end -- overlay removal is skipped whenever script_name is set
+    mp.command_native_async({name = "overlay-remove", id=options.overlay_id}, function() end)
+end
+
+local function quit() -- Sends "quit" through run() and resets state, unless a thumbnail is showing.
+    activity_timer:kill()
+    if show_thumbnail then
+        activity_timer:resume() -- still in use: restart the countdown instead of quitting
+        return
+    end
+    run("quit")
+    spawned = false -- the next request has to spawn a fresh subprocess
+    real_w, real_h = nil, nil
+    clear()
+end
+
+activity_timer = mp.add_timeout(options.quit_after_inactivity, quit)
+activity_timer:kill()
+
 local function thumb(time, r_x, r_y, script)
     if disabled then return end
 
@@ -524,6 +620,13 @@ local function thumb(time, r_x, r_y, script)
         draw(real_w, real_h, script)
     end
 
+    if options.quit_after_inactivity > 0 then
+        if show_thumbnail or activity_timer:is_enabled() then
+            activity_timer:kill()
+        end
+        activity_timer:resume()
+    end
+
     if time == last_seek_time then return end
     last_seek_time = time
     if not spawned then spawn(time) end
@@ -531,18 +634,10 @@ local function thumb(time, r_x, r_y, script)
     if not file_timer:is_enabled() then file_timer:resume() end
 end
 
-local function clear()
-    file_timer:kill()
-    seek_timer:kill()
-    last_seek = 0
-    show_thumbnail = false
-    last_x = nil
-    last_y = nil
-    if script_name then return end
-    mp.command_native({name = "overlay-remove", id=options.overlay_id})
-end
-
 local function watch_changes()
+    if not dirty or not properties["video-params"] then return end
+    dirty = false
+
     local old_w = effective_w
     local old_h = effective_h
 
@@ -559,19 +654,45 @@ local function watch_changes()
     if spawned then
         if resized then
             -- mpv doesn't allow us to change output size
+            local seek_time = last_seek_time
             run("quit")
             clear()
             spawned = false
-            spawn(last_seek_time or mp.get_property_number("time-pos", 0))
+            spawn(seek_time or mp.get_property_number("time-pos", 0))
+            file_timer:resume()
         end
     end
 
     last_has_vid = has_vid
+
+    if not spawned and not disabled and options.spawn_first and resized then
+        spawn(mp.get_property_number("time-pos", 0))
+        file_timer:resume()
+    end
+end
+
+local function update_property(name, value)
+    properties[name] = value
 end
 
-local watch_changes_debounce = debounce(watch_changes, 500)
+local function update_property_dirty(name, value)
+    properties[name] = value
+    dirty = true
+end
+
+local function update_tracklist(name, value)
+    -- current-tracks shim: fills the nested entry that info() reads from properties
+    for _, track in ipairs(value or {}) do -- a nil track-list leaves nothing to mirror
+        if track.type == "video" and track.selected then
+            properties["current-tracks"] = properties["current-tracks"] or {}
+            properties["current-tracks"]["video"] = track
+            return
+        end
+    end
+end
 
 local function sync_changes(prop, val)
+    update_property(prop, val)
     if val == nil then return end
 
     if type(val) == "boolean" then
@@ -592,11 +713,12 @@ local function sync_changes(prop, val)
     if not spawned then return end
 
     run("set "..prop.." "..val)
-    watch_changes_debounce()
+    dirty = true
 end
 
 local function file_load()
     clear()
+    spawned = false
     real_w, real_h = nil, nil
     last_real_w, last_real_h = nil, nil
     last_seek_time = nil
@@ -607,28 +729,27 @@ local function file_load()
 
     calc_dimensions()
     info(effective_w, effective_h)
-    if disabled then return end
-
-    spawned = false
-    if options.spawn_first then
-        mp.add_timeout(0.1, function()
-            spawn(mp.get_property_number("time-pos", 0))
-            first_file = true
-        end)
-    end
 end
 
 local function shutdown()
     run("quit")
     remove_thumbnail_files()
-    if os_name ~= "Windows" then
+    if os_name ~= "windows" then
         os.remove(options.socket)
         os.remove(options.socket..".run")
     end
 end
 
-mp.observe_property("display-hidpi-scale", "native", watch_changes)
-mp.observe_property("video-out-params", "native", watch_changes)
+mp.observe_property("current-tracks", "native", function(name, value)
+    update_property(name, value)
+end)
+
+mp.observe_property("track-list", "native", update_tracklist)
+mp.observe_property("display-hidpi-scale", "native", update_property_dirty)
+mp.observe_property("video-params", "native", update_property_dirty)
+mp.observe_property("demuxer-via-network", "native", update_property)
+mp.observe_property("stream-open-filename", "native", update_property)
+mp.observe_property("path", "native", update_property)
 mp.observe_property("vid", "native", sync_changes)
 mp.observe_property("edition", "native", sync_changes)
 
@@ -657,3 +778,5 @@ mp.add_key_binding(nil, "thumb_toggle", function()
         mp.osd_message("缩略图功能已启用", 2)
     end
 end)
+
+mp.register_idle(watch_changes)
diff --git a/portable_config/scripts/uosc/elements/Logo.lua b/portable_config/scripts/uosc/elements/Logo.lua
new file mode 100644
index 00000000..873794f4
--- /dev/null
+++ b/portable_config/scripts/uosc/elements/Logo.lua
@@ -0,0 +1,61 @@
+-- 存在问题（也许不算）：无法实时自适应缩放
+
+local Element = require('elements/Element')
+
+--[[ Logo ]]
+
+---@class Logo : Element
+local Logo = class(Element)
+
+function Logo:new() return Class.new(self) --[[@as Logo]] end
+function Logo:init()
+	Element.init(self, 'logo')
+	-- same rule as decide_enabled(): a falsy state.idlescreen keeps the logo disabled from the start
+	self.enabled = state.idlescreen and state.is_idle
+
+	self.logo_lines = {
+		-- White border
+		'{\\c&HE5E5E5&\\p5}m 895 10 b 401 10 0 410 0 905 0 1399 401 1800 895 1800 1390 1800 1790 1399 1790 905 1790 410 1390 10 895 10 {\\p0}',
+		-- Purple fill
+		'{\\c&H682167&\\p5}m 925 42 b 463 42 87 418 87 880 87 1343 463 1718 925 1718 1388 1718 1763 1343 1763 880 1763 418 1388 42 925 42{\\p0}',
+		-- Darker fill
+		'{\\c&H430142&\\p5}m 1605 828 b 1605 1175 1324 1456 977 1456 631 1456 349 1175 349 828 349 482 631 200 977 200 1324 200 1605 482 1605 828{\\p0}',
+		-- White fill
+		'{\\c&HDDDBDD&\\p5}m 1296 910 b 1296 1131 1117 1310 897 1310 676 1310 497 1131 497 910 497 689 676 511 897 511 1117 511 1296 689 1296 910{\\p0}',
+		-- Triangle
+		'{\\c&H691F69&\\p5}m 762 1113 l 762 708 b 881 776 1000 843 1119 911 1000 978 881 1046 762 1113{\\p0}',
+	}
+end
+
+function Logo:decide_enabled() self.enabled = state.idlescreen and state.is_idle end -- truthy only when both state flags are truthy
+function Logo:on_prop_is_idle() self:decide_enabled() end -- NOTE(review): presumably dispatched by name when state.is_idle changes; confirm
+function Logo:on_prop_idlescreen() self:decide_enabled() end -- NOTE(review): likewise for state.idlescreen; confirm
+
+-- Draws the logo layers with the idle message below them; returns nothing while a menu is open.
+function Logo:render()
+	if Menu:is_open() then return end
+
+	-- logo is rendered at 2^(5-1) = 16 times resolution with size 1800x1800
+	local logo_size, font_size, spacing = 1800 / 16, 40, 10
+	local left = (display.width - logo_size) / 2
+	local top = (display.height - (logo_size + font_size + spacing)) / 2
+	local position_tags = string.format('{\\rDefault\\an7\\1a&H00&\\bord0\\shad0\\pos(%f,%f)}', left, top)
+
+	-- every logo layer becomes its own event, placed by the same position prefix
+	local ass = assdraw.ass_new()
+	for index = 1, #self.logo_lines do
+		ass:new_event()
+		ass:append(position_tags .. self.logo_lines[index])
+	end
+
+	-- 'default' maps to an empty message; any other option value is used as is
+	local message = options.idlemsg
+	if message == 'default' then message = '' end
+	state.idlemsg = message
+	local text_y = top + logo_size + spacing
+	ass:txt(display.width / 2, text_y, 8, tostring(message), {size = font_size})
+
+	return ass
+end
+
+return Logo
diff --git a/portable_config/scripts/uosc/elements/Menu.lua b/portable_config/scripts/uosc/elements/Menu.lua
index 4048b76b..99d736f3 100644
--- a/portable_config/scripts/uosc/elements/Menu.lua
+++ b/portable_config/scripts/uosc/elements/Menu.lua
@@ -156,7 +156,7 @@ function Menu:update(data)
 
 		-- Update items
 		local first_active_index = nil
-		menu.items = {}
+		menu.items = {} -- {{title = lang._menu_item_empty_title, value = 'ignore', italic = 'true', muted = 'true'}}
 
 		for i, item_data in ipairs(menu_data.items or {}) do
 			if item_data.active and not first_active_index then first_active_index = i end
@@ -506,7 +506,7 @@ function Menu:move_selected_item_to(index)
 	if callback and from and from ~= index and index >= 1 and index <= #self.current.items then
 		callback(from, index, self.current.submenu_path)
 		self.current.selected_index = index
-		request_render()
+		self:set_scroll_by((index - from) * self.scroll_step)
 	end
 end
 
diff --git a/portable_config/scripts/uosc/elements/Timeline.lua b/portable_config/scripts/uosc/elements/Timeline.lua
index 55a72aad..ebcd2684 100644
--- a/portable_config/scripts/uosc/elements/Timeline.lua
+++ b/portable_config/scripts/uosc/elements/Timeline.lua
@@ -394,7 +394,7 @@ function Timeline:render()
 		then
 			local scale_x, scale_y = display.scale_x, display.scale_y
 			local border, margin_x, margin_y = math.ceil(2 * scale_x), round(10 * scale_x), round(5 * scale_y)
-			local thumb_x_margin, thumb_y_margin = border + margin_x, border + margin_y
+			local thumb_x_margin, thumb_y_margin = border + margin_x + bax, border + margin_y
 			local thumb_width, thumb_height = thumbnail.width, thumbnail.height
 			local thumb_x = round(clamp(
 				thumb_x_margin, cursor_x * scale_x - thumb_width / 2,
diff --git a/portable_config/scripts/uosc/elements/TopBar.lua b/portable_config/scripts/uosc/elements/TopBar.lua
index 7258785c..b15121fc 100644
--- a/portable_config/scripts/uosc/elements/TopBar.lua
+++ b/portable_config/scripts/uosc/elements/TopBar.lua
@@ -85,6 +85,10 @@ function TopBar:decide_titles()
 	self.alt_title = state.alt_title ~= '' and state.alt_title or nil
 	self.main_title = state.title ~= '' and state.title or nil
 
+	if (self.main_title == 'No file') then
+		self.main_title = lang._border_title
+	end
+
 	-- Fall back to alt title if main is empty
 	if not self.main_title then
 		self.main_title, self.alt_title = self.alt_title, nil
diff --git a/portable_config/scripts/uosc/lib/ass.lua b/portable_config/scripts/uosc/lib/ass.lua
index 108953f1..9f7132ce 100644
--- a/portable_config/scripts/uosc/lib/ass.lua
+++ b/portable_config/scripts/uosc/lib/ass.lua
@@ -85,7 +85,7 @@ function ass_mt:tooltip(element, value, opts)
 	local align_top = opts.responsive == false or element.ay - offset > opts.size * 2
 	local x = element.ax + (element.bx - element.ax) / 2
 	local y = align_top and element.ay - offset or element.by + offset
-	local margin = (opts.width_overwrite or text_width(value, opts)) / 2 + 10
+	local margin = (opts.width_overwrite or text_width(value, opts)) / 2 + 10 + Elements.window_border.size
 	self:txt(clamp(margin, x, display.width - margin), y, align_top and 2 or 8, value, opts)
 end
 
diff --git a/portable_config/scripts/uosc/lib/lang.lua b/portable_config/scripts/uosc/lib/lang.lua
index 875804b9..fdce8d44 100644
--- a/portable_config/scripts/uosc/lib/lang.lua
+++ b/portable_config/scripts/uosc/lib/lang.lua
@@ -2,35 +2,38 @@
 lang = {
 
 	-- context_menu_default
-	_load = '加载',
-	_file_browser = '※ 文件浏览器',
-	_import_sid = '※ 导入 字幕轨',
-	_navigation = '导航',
-	_playlist = '※ 播放列表',
-	_edition_list = '※ 版本列表',
-	_chapter_list = '※ 章节列表',
-	_vid_list = '※ 视频轨列表',
-	_aid_list = '※ 音频轨列表',
-	_sid_list = '※ 字幕轨列表',
-	_playlist_shuffle = '播放列表乱序重排',
-	_ushot = '※ 截屏',
-	_VIDEO = '视频',
-	_decoding_api = '切换 解码模式',
-	_deband_toggle = '切换 去色带状态',
-	_deint_toggle = '切换 去隔行状态',
-	_icc_toggle = '切换 自动校色',
-	_corpts_toggle = '切换 时间码解析模式',
-	_TOOLS = '工具',
-	_stats_toggle = '开关 常驻统计信息',
-	_console_on = '显示控制台',
-	_border_toggle = '切换 窗口边框',
-	_ontop_toggle = '切换 窗口置顶',
-	_audio_device = '※ 音频输出设备列表',
-	_stream_quality = '※ 流式传输品质',
-	_show_file_dir = '※ 打开 当前文件所在路径',
-	_show_config_dir = '※ 打开 设置目录',
-	_stop = '停止',
-	_quit = '退出mpv',
+	_cm_load = '加载',
+	_cm_file_browser = '※ 文件浏览器',
+	_cm_import_sid = '※ 导入 字幕轨',
+	_cm_navigation = '导航',
+	_cm_playlist = '※ 播放列表',
+	_cm_edition_list = '※ 版本列表',
+	_cm_chapter_list = '※ 章节列表',
+	_cm_vid_list = '※ 视频轨列表',
+	_cm_aid_list = '※ 音频轨列表',
+	_cm_sid_list = '※ 字幕轨列表',
+	_cm_playlist_shuffle = '播放列表乱序重排',
+	_cm_ushot = '※ 截屏',
+	_cm_video = '视频',
+	_cm_decoding_api = '切换 解码模式',
+	_cm_deband_toggle = '切换 去色带状态',
+	_cm_deint_toggle = '切换 去隔行状态',
+	_cm_icc_toggle = '切换 自动校色',
+	_cm_corpts_toggle = '切换 时间码解析模式',
+	_cm_tools = '工具',
+	_cm_stats_toggle = '开关 常驻统计信息',
+	_cm_console_on = '显示控制台',
+	_cm_border_toggle = '切换 窗口边框',
+	_cm_ontop_toggle = '切换 窗口置顶',
+	_cm_audio_device = '※ 音频输出设备列表',
+	_cm_stream_quality = '※ 流式传输品质',
+	_cm_show_file_dir = '※ 打开 当前文件所在路径',
+	_cm_show_config_dir = '※ 打开 设置目录',
+	_cm_stop = '停止',
+	_cm_quit = '退出mpv',
+
+	-- no-border-title
+	_border_title= '未加载文件',
 
 	-- track_loaders sub_menu menu_data
 	_sid_menu = '字幕轨',
@@ -38,27 +41,32 @@ lang = {
 	_vid_menu = '视频轨',
 	_import_id_menu = '导入 ',
 
+	-- _menu_item_empty_title = '空',
+
 	_sid_submenu_title = '字幕轨列表',
 	_aid_submenu_title = '音频轨列表',
 	_vid_submenu_title = '视频轨列表',
 	_playlist_submenu_title = '播放列表',
 	_chapter_list_submenu_title = '章节列表',
+	_chapter_list_submenu_item_title = '未命名章节 ',
 	_edition_list_submenu_title = '版本列表',
 	_edition_list_submenu_item_title = '版本',
 	_stream_quality_submenu_title = '流式传输品质',
 	_audio_device_submenu_title = '音频输出设备列表',
+	_audio_device_submenu_item_title = '自动',
 
 	_submenu_import = '导入',
 	_submenu_load_file = '打开文件',
 	_submenu_id_disabled = '禁用',
+	_submenu_id_hint = '声道',
 	_submenu_id_forced = '强制',
 	_submenu_id_default = '默认',
 	_submenu_id_external = '外挂',
 	_submenu_id_title = '轨道 ',
+	_submenu_file_browser_item_hint = '驱动器列表',
+	_submenu_file_browser_item_hint2 = '上级目录',
+	_submenu_file_browser_item2_hint = '盘符',
 	_submenu_file_browser_title = '驱动器列表',
-	_submenu_file_browser_item_title = '上级目录',
-	_submenu_file_browser_item2_title = '盘符',
-	_submenu_file_browser_item3_title = '驱动器列表',
 
 	-- built-in_shortcut
 	_button01 = '菜单',
diff --git a/portable_config/scripts/uosc/lib/menus.lua b/portable_config/scripts/uosc/lib/menus.lua
index 5855bbd9..17c5c6e2 100644
--- a/portable_config/scripts/uosc/lib/menus.lua
+++ b/portable_config/scripts/uosc/lib/menus.lua
@@ -106,7 +106,7 @@ function create_select_tracklist_type_menu_opener(menu_title, track_type, track_
 				end
 				if track['demux-fps'] then h(string.format('%.5gfps', track['demux-fps'])) end
 				h(track.codec)
-				if track['audio-channels'] then h(track['audio-channels'] .. ' channels') end
+				if track['audio-channels'] then h(track['audio-channels'] .. lang._submenu_id_hint) end
 				if track['demux-samplerate'] then h(string.format('%.3gkHz', track['demux-samplerate'] / 1000)) end
 				if track.forced then h(lang._submenu_id_forced) end
 				if track.default then h(lang._submenu_id_default) end
@@ -181,10 +181,10 @@ function open_file_navigation_menu(directory_path, handle_select, opts)
 
 	if is_root then
 		if state.platform == 'windows' then
-			items[#items + 1] = {title = '..', hint = lang._submenu_file_browser_title, value = '{drives}', separator = true}
+			items[#items + 1] = {title = '..', hint = lang._submenu_file_browser_item_hint, value = '{drives}', separator = true}
 		end
 	else
-		items[#items + 1] = {title = '..', hint = lang._submenu_file_browser_item_title, value = directory.dirname, separator = true}
+		items[#items + 1] = {title = '..', hint = lang._submenu_file_browser_item_hint2, value = directory.dirname, separator = true}
 	end
 
 	local back_path = items[#items] and items[#items].value
@@ -276,7 +276,7 @@ function open_drives_menu(handle_select, opts)
 			if drive then
 				local drive_path = normalize_path(drive)
 				items[#items + 1] = {
-					title = drive, hint = lang._submenu_file_browser_item2_title, value = drive_path, active = opts.active_path == drive_path,
+					title = drive, hint = lang._submenu_file_browser_item2_hint, value = drive_path, active = opts.active_path == drive_path,
 				}
 				if opts.selected_path == drive_path then selected_index = #items end
 			end
@@ -286,7 +286,7 @@ function open_drives_menu(handle_select, opts)
 	end
 
 	return Menu:open(
-		{type = opts.type, title = opts.title or lang._submenu_file_browser_item3_title, items = items, selected_index = selected_index},
+		{type = opts.type, title = opts.title or lang._submenu_file_browser_title, items = items, selected_index = selected_index},
 		handle_select
 	)
 end
diff --git a/portable_config/scripts/uosc/lib/text.lua b/portable_config/scripts/uosc/lib/text.lua
index d573b816..eca4de2a 100644
--- a/portable_config/scripts/uosc/lib/text.lua
+++ b/portable_config/scripts/uosc/lib/text.lua
@@ -87,7 +87,9 @@ local function utf8_to_unicode(str, i)
 		unicode = char_byte * (2 ^ 6) ^ (byte_count - 1)
 	end
 	for j = 2, byte_count do
-		char_byte = str:byte(i + j - 1) - 0x80
+		if i + j - 1 <= #str then -- 临时修复 https://github.com/tomasklaen/uosc/issues/515
+			char_byte = str:byte(i + j - 1) - 0x80
+		end
 		unicode = unicode + char_byte * (2 ^ 6) ^ (byte_count - j)
 	end
 	return round(unicode)
diff --git a/portable_config/scripts/uosc/lib/utils.lua b/portable_config/scripts/uosc/lib/utils.lua
index e07c10d3..43892c98 100644
--- a/portable_config/scripts/uosc/lib/utils.lua
+++ b/portable_config/scripts/uosc/lib/utils.lua
@@ -539,7 +539,7 @@ function normalize_chapters(chapters)
 	table.sort(chapters, function(a, b) return a.time < b.time end)
 	-- Ensure titles
 	for index, chapter in ipairs(chapters) do
-		chapter.title = chapter.title or ('Chapter ' .. index)
+		chapter.title = chapter.title or (lang._chapter_list_submenu_item_title .. index)
 		chapter.lowercase_title = chapter.title:lower()
 	end
 	return chapters
diff --git a/portable_config/scripts/uosc/main.lua b/portable_config/scripts/uosc/main.lua
index 6b0d589f..b411b9c0 100644
--- a/portable_config/scripts/uosc/main.lua
+++ b/portable_config/scripts/uosc/main.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/tomasklaen/uosc/tree/main/scripts
-COMMIT_ ec52252380f896ca709216307e3bf021fbee914b
+COMMIT_ 5e2c93055155bc9aec7534d13804d4f0d7f8a72d
 文档_ https://github.com/hooke007/MPV_lazy/discussions/186
 
 极简主义设计驱动的多功能界面脚本群组，兼容 thumbfast 新缩略图引擎
@@ -112,6 +112,8 @@ defaults = {
 	chapter_ranges = 'openings:30abf964,endings:30abf964,ads:c54e4e80',
 	chapter_range_patterns = 'openings:オープニング;endings:エンディング',
 
+	idlescreen = true,
+	idlemsg = 'default',
 	idle_call_menu = 0,                       -- 空闲自动弹出上下文菜单
 	custom_font = 'default',                  -- 自定义界面字体
 }
@@ -145,45 +147,47 @@ function auto_ui_scale()
 		options.ui_scale = 1
 	end
 end
+-- 设置脚本属性
+mp.set_property_native('user-data/osc', { idlescreen = options.idlescreen })
 
 --[[ CONFIG ]]
 
 -- 上下文菜单的默认内容
 local function create_default_menu()
 	return {
-		{title = lang._load, items = {
-			{title = lang._file_browser, value = 'script-binding uosc/open-file'},
-			{title = lang._import_sid, value = 'script-binding uosc/load-subtitles'},
+		{title = lang._cm_load, items = {
+			{title = lang._cm_file_browser, value = 'script-binding uosc/open-file'},
+			{title = lang._cm_import_sid, value = 'script-binding uosc/load-subtitles'},
 		},},
-		{title = lang._navigation, items = {
-			{title = lang._playlist, value = 'script-binding uosc/playlist'},
-			{title = lang._edition_list, value = 'script-binding uosc/editions'},
-			{title = lang._chapter_list, value = 'script-binding uosc/chapters'},
-			{title = lang._vid_list, value = 'script-binding uosc/video'},
-			{title = lang._aid_list, value = 'script-binding uosc/audio'},
-			{title = lang._sid_list, value = 'script-binding uosc/subtitles'},
-			{title = lang._playlist_shuffle, value = 'playlist-shuffle'},
+		{title = lang._cm_navigation, items = {
+			{title = lang._cm_playlist, value = 'script-binding uosc/playlist'},
+			{title = lang._cm_edition_list, value = 'script-binding uosc/editions'},
+			{title = lang._cm_chapter_list, value = 'script-binding uosc/chapters'},
+			{title = lang._cm_vid_list, value = 'script-binding uosc/video'},
+			{title = lang._cm_aid_list, value = 'script-binding uosc/audio'},
+			{title = lang._cm_sid_list, value = 'script-binding uosc/subtitles'},
+			{title = lang._cm_playlist_shuffle, value = 'playlist-shuffle'},
 		},},
-		{title = lang._ushot, value = 'script-binding uosc/shot'},
-		{title = lang._VIDEO, items = {
-			{title = lang._decoding_api, value = 'cycle-values hwdec no auto auto-copy'},
-			{title = lang._deband_toggle, value = 'cycle deband'},
-			{title = lang._deint_toggle, value = 'cycle deinterlace'},
-			{title = lang._icc_toggle, value = 'cycle icc-profile-auto'},
-			{title = lang._corpts_toggle, value = 'cycle correct-pts'},
+		{title = lang._cm_ushot, value = 'script-binding uosc/shot'},
+		{title = lang._cm_video, items = {
+			{title = lang._cm_decoding_api, value = 'cycle-values hwdec no auto auto-copy'},
+			{title = lang._cm_deband_toggle, value = 'cycle deband'},
+			{title = lang._cm_deint_toggle, value = 'cycle deinterlace'},
+			{title = lang._cm_icc_toggle, value = 'cycle icc-profile-auto'},
+			{title = lang._cm_corpts_toggle, value = 'cycle correct-pts'},
 		},},
-		{title = lang._TOOLS, items = {
-			{title = lang._stats_toggle, value = 'script-binding display-stats-toggle'},
-			{title = lang._console_on, value = 'script-binding console/enable'},
-			{title = lang._border_toggle, value = 'cycle border'},
-			{title = lang._ontop_toggle, value = 'cycle ontop'},
-			{title = lang._audio_device, value = 'script-binding uosc/audio-device'},
-			{title = lang._stream_quality, value = 'script-binding uosc/stream-quality'},
-			{title = lang._show_file_dir, value = 'script-binding uosc/show-in-directory'},
-			{title = lang._show_config_dir, value = 'script-binding uosc/open-config-directory'},
+		{title = lang._cm_tools, items = {
+			{title = lang._cm_stats_toggle, value = 'script-binding display-stats-toggle'},
+			{title = lang._cm_console_on, value = 'script-binding console/enable'},
+			{title = lang._cm_border_toggle, value = 'cycle border'},
+			{title = lang._cm_ontop_toggle, value = 'cycle ontop'},
+			{title = lang._cm_audio_device, value = 'script-binding uosc/audio-device'},
+			{title = lang._cm_stream_quality, value = 'script-binding uosc/stream-quality'},
+			{title = lang._cm_show_file_dir, value = 'script-binding uosc/show-in-directory'},
+			{title = lang._cm_show_config_dir, value = 'script-binding uosc/open-config-directory'},
 		},},
-		{title = lang._stop, value = 'stop'},
-		{title = lang._quit, value = 'quit'},
+		{title = lang._cm_stop, value = 'stop'},
+		{title = lang._cm_quit, value = 'quit'},
 	}
 end
 
@@ -439,6 +443,8 @@ state = {
 	margin_left = 0,
 	margin_right = 0,
 	hidpi_scale = 1,
+	idlescreen = options.idlescreen,
+	idlemsg = options.idlemsg,
 }
 thumbnail = {width = 0, height = 0, disabled = false}
 external = {} -- Properties set by external scripts
@@ -512,17 +518,19 @@ end
 function update_margins()
 	if display.height == 0 then return end
 
-	local function is_persistent(element) return element and element.enabled and element:is_persistent() end
+	local function causes_margin(element)
+		return element and element.enabled and (element:is_persistent() or element.min_visibility > 0.5)
+	end
 	local timeline, top_bar, controls, volume = Elements.timeline, Elements.top_bar, Elements.controls, Elements.volume
 	-- margins are normalized to window size
 	local left, right, top, bottom = 0, 0, 0, 0
 
-	if is_persistent(controls) then bottom = (display.height - controls.ay) / display.height
-	elseif is_persistent(timeline) then bottom = (display.height - timeline.ay) / display.height end
+	if causes_margin(controls) then bottom = (display.height - controls.ay) / display.height
+	elseif causes_margin(timeline) then bottom = (display.height - timeline.ay) / display.height end
 
-	if is_persistent(top_bar) then top = top_bar.title_by / display.height end
+	if causes_margin(top_bar) then top = top_bar.title_by / display.height end
 
-	if is_persistent(volume) then
+	if causes_margin(volume) then
 		if options.volume == 'left' then left = volume.bx / display.width
 		elseif options.volume == 'right' then right = volume.ax / display.width end
 	end
@@ -1041,7 +1049,7 @@ bind_command('show-in-directory', function()
 
 	if state.platform == 'windows' then
 		utils.subprocess_detached({args = {'explorer', '/select,', state.path}, cancellable = false})
-	elseif state.platform == 'macos' then
+	elseif state.platform == 'darwin' then
 		utils.subprocess_detached({args = {'open', '-R', state.path}, cancellable = false})
 	elseif state.platform == 'linux' then
 		local result = utils.subprocess({args = {'nautilus', state.path}, cancellable = false})
@@ -1203,10 +1211,14 @@ bind_command('audio-device', create_self_updating_menu_opener({
 		local items = {}
 		for _, device in ipairs(audio_device_list) do
 			if device.name == 'auto' or string.match(device.name, '^' .. ao) then
+				local title = device.description
+				if title == 'Autoselect device' then
+					title = lang._audio_device_submenu_item_title
+				end
 				local hint = string.match(device.name, ao .. '/(.+)')
 				if not hint then hint = device.name end
 				items[#items + 1] = {
-					title = device.description,
+					title = title,
 					hint = hint,
 					active = device.name == current_device,
 					value = device.name,
@@ -1226,7 +1238,7 @@ bind_command('open-config-directory', function()
 
 		if state.platform == 'windows' then
 			args = {'explorer', '/select,', config.path}
-		elseif state.platform == 'macos' then
+		elseif state.platform == 'darwin' then
 			args = {'open', '-R', config.path}
 		elseif state.platform == 'linux' then
 			args = {'xdg-open', config.dirname}
@@ -1324,9 +1336,24 @@ mp.register_script_message('set-min-visibility', function(visibility, elements)
 end)
 mp.register_script_message('flash-elements', function(elements) Elements:flash(split(elements, ' *, *')) end)
 mp.register_script_message('overwrite-binding', function(name, command) key_binding_overwrites[name] = command end)
+if options.idlescreen then
+	mp.register_script_message('osc-idlescreen', function(mode, no_osd)
+		if mode == 'cycle' then mode = state.idlescreen and 'no' or 'yes' end
+		set_state('idlescreen', mode == 'yes')
+		utils.shared_script_property_set('osc-idlescreen', mode)
+		mp.set_property_native('user-data/osc', { idlescreen = state.idlescreen })
+
+		if not no_osd and mp.get_property_number('osd-level', 1) >= 1 then
+			mp.osd_message('LOGO的可见性：' .. tostring(mode))
+		end
+	end)
+end
 
 --[[ ELEMENTS ]]
 
+if options.idlescreen then
+	require('elements/Logo'):new()
+end
 require('elements/WindowBorder'):new()
 require('elements/BufferingIndicator'):new()
 require('elements/PauseIndicator'):new()
diff --git a/portable_config/shaders/guided.glsl b/portable_config/shaders/guided.glsl
index 3c804da3..bf3f9e2b 100644
--- a/portable_config/shaders/guided.glsl
+++ b/portable_config/shaders/guided.glsl
@@ -16,7 +16,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: Guided filter guided by the downscaled image
+// Description: guided.glsl: Guided by the downscaled image
 
 /* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
@@ -33,10 +33,10 @@
 //!HOOK LUMA
 //!HOOK CHROMA
 //!HOOK RGB
-//!DESC Guided filter (PREI)
 //!BIND HOOKED
 //!WIDTH HOOKED.w 1.25 /
 //!HEIGHT HOOKED.h 1.25 /
+//!DESC Guided filter (PREI)
 //!SAVE PREI
 
 vec4 hook()
@@ -47,10 +47,10 @@ vec4 hook()
 //!HOOK LUMA
 //!HOOK CHROMA
 //!HOOK RGB
-//!DESC Guided filter (I)
 //!BIND PREI
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
+//!WIDTH HOOKED.w
+//!HEIGHT HOOKED.h
+//!DESC Guided filter (I)
 //!SAVE I
 
 vec4 hook()
@@ -58,6 +58,7 @@ vec4 hook()
 	return PREI_texOff(0);
 }
 
+
 //!HOOK LUMA
 //!HOOK CHROMA
 //!HOOK RGB
diff --git a/portable_config/shaders/guided_lgc.glsl b/portable_config/shaders/guided_lgc.glsl
index 816e4511..7ff3de39 100644
--- a/portable_config/shaders/guided_lgc.glsl
+++ b/portable_config/shaders/guided_lgc.glsl
@@ -16,7 +16,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: Luma-guided-chroma denoising.
+// Description: guided_lgc.glsl: Luma-guided-chroma denoising.
 
 /* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
@@ -31,10 +31,10 @@
  */
 
 //!HOOK CHROMA
-//!DESC Guided filter (I)
 //!BIND LUMA
 //!WIDTH LUMA.w
 //!HEIGHT LUMA.h
+//!DESC Guided filter (I, share)
 //!SAVE I
 
 vec4 hook()
@@ -42,6 +42,7 @@ vec4 hook()
 	return LUMA_texOff(0);
 }
 
+
 //!HOOK CHROMA
 //!DESC Guided filter (P)
 //!BIND HOOKED
@@ -57,9 +58,9 @@ vec4 hook()
 //!HOOK CHROMA
 //!DESC Guided filter (MEANI)
 //!BIND I
+//!SAVE MEANI
 //!WIDTH I.w 2.0 /
 //!HEIGHT I.h 2.0 /
-//!SAVE MEANI
 
 vec4 hook()
 {
diff --git a/portable_config/shaders/guided_s.glsl b/portable_config/shaders/guided_s.glsl
index cc8f4467..a1c2c174 100644
--- a/portable_config/shaders/guided_s.glsl
+++ b/portable_config/shaders/guided_s.glsl
@@ -16,7 +16,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: "Self-guided" guided filter
+// Description: guided_s.glsl: Self-guided
 
 /* The radius can be adjusted with the MEANIP stage's downscaling factor. 
  * Higher numbers give a bigger radius.
diff --git a/portable_config/shaders/nlmeans.glsl b/portable_config/shaders/nlmeans.glsl
index ebcc5d35..655da37d 100644
--- a/portable_config/shaders/nlmeans.glsl
+++ b/portable_config/shaders/nlmeans.glsl
@@ -19,7 +19,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-// Profile description: Default profile, general purpose, tuned for low noise
+// Description: nlmeans.glsl: Default profile, general purpose, tuned for low noise
 
 /* The recommended usage of this shader and its variant profiles is to add them 
  * to input.conf and then dispatch the appropriate shader via a keybind during 
@@ -48,8 +48,8 @@
  * of noise.
  *
  * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
  *
  * vf toggle scale=-2:720
  *
@@ -65,12 +65,13 @@
  * may be different for your system.
  *
  * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile.
  *
- * textureGather is LUMA only and limited to the following configurations:
+ * If you plan on tinkering with NLM's settings, read below:
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
  *   - Default, very fast, rotations and reflections should be free
  *   - If this is unusually slow then try changing gpu-api and vo
  *   - If it's still slow, try setting RI/RFI to 0.
@@ -83,6 +84,7 @@
  *
  * Options which always disable textureGather:
  * 	- PD
+ * 	- NG
  */
 
 // The following is shader code injected from guided.glsl
@@ -104,7 +106,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: Guided filter guided by the downscaled image
+// Description: guided.glsl: Guided by the downscaled image
 
 /* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
@@ -120,10 +122,10 @@
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Guided filter (PREI)
 //!BIND HOOKED
 //!WIDTH HOOKED.w 1.25 /
 //!HEIGHT HOOKED.h 1.25 /
+//!DESC Guided filter (PREI)
 //!SAVE _INJ_PREI
 
 vec4 hook()
@@ -133,10 +135,10 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Guided filter (I)
 //!BIND _INJ_PREI
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
+//!WIDTH HOOKED.w
+//!HEIGHT HOOKED.h
+//!DESC Guided filter (I)
 //!SAVE _INJ_I
 
 vec4 hook()
@@ -144,6 +146,7 @@ vec4 hook()
 return _INJ_PREI_texOff(0);
 }
 
+
 //!HOOK LUMA
 //!HOOK CHROMA
 //!DESC Guided filter (P)
@@ -310,69 +313,52 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided.glsl
+// End of source code injected from guided.glsl 
+
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE EP
+//!BIND RF_LUMA
+//!WIDTH RF_LUMA.w
+//!HEIGHT RF_LUMA.h
+//!DESC Non-local means (RF, share)
+//!SAVE RF
 
 vec4 hook()
 {
-	return LUMA_texOff(0);
+	return RF_LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
+//!BIND LUMA
+//!WIDTH LUMA.w 3 /
+//!HEIGHT LUMA.h 3 /
+//!DESC Non-local means (EP)
+//!SAVE EP
 
 vec4 hook()
 {
-	return RF_LUMA_texOff(0);
+	return LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
-//!BIND EP
 //!BIND RF
+//!BIND EP
 //!DESC Non-local means (nlmeans.glsl)
 
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
+// User variables
 
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
 #define S 2.0
-#define P 3
-#define R 5
 #else
 #define S 5.0
-#define P 3
-#define R 5
 #endif
 
 /* Adaptive sharpening
@@ -380,11 +366,16 @@ vec4 hook()
  * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
  * weight map to restrict the sharpening to edges.
  *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
+ * If you just want to increase/decrease sharpness then you want to change ASF.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
  * ASW:
  * 	- 0 to use pre-WD weights
  * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
@@ -396,15 +387,15 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
 #else
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
@@ -412,12 +403,10 @@ vec4 hook()
 
 /* Starting weight
  *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -432,7 +421,7 @@ vec4 hook()
  * result, especially around edges.
  * 
  * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
  * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
  * 	- 0: Disable
  *
@@ -451,12 +440,14 @@ vec4 hook()
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
  * DP: EP strength on dark patches, 0 to fully denoise
@@ -478,25 +469,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
-/* Robust filtering
+/* Patch & research sizes
  *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
+ * Research size should be an odd number greater than or equal to 3. Higher 
+ * values are usually better, but slower and blurrier, with diminishing returns.
  */
 #ifdef LUMA_raw
-#define RF 1
+#define P 3
+#define R 5
 #else
-#define RF 1
+#define P 3
+#define R 5
 #endif
 
-/* Search shape
+/* Patch and research shapes
  *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
  * PS applies applies to patches, RS applies to research zones.
  *
@@ -519,11 +511,22 @@ vec4 hook()
 #define PS 3
 #endif
 
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader
+ */
+#define RF_LUMA 1
+#define RF 1
+
 /* Rotational/reflectional invariance
  *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections give 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
@@ -540,29 +543,39 @@ vec4 hook()
 #endif
 
 /* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
  *
  * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
  * 	- Luma-only (this is a bug)
  * 	- Buggy
  *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
  * T: number of frames used
  * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
  */
 #ifdef LUMA_raw
 #define T 0
 #define ME 1
+#define MEF 2
+#define TRF 0
 #else
 #define T 0
 #define ME 0
+#define MEF 2
+#define TRF 0
 #endif
 
 /* Spatial kernel
@@ -574,69 +587,79 @@ vec4 hook()
  * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
  * appear closer and increase blur between frames.
  *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
+ * The intra-patch variants are supposed to help with larger patch sizes.
  *
- * SS: spatial denoising factor
+ * SST: enables spatial kernel if R>=PST, 0 fully disables
+ * SS: spatial sigma
  * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
+ * PSS: intra-patch spatial sigma
  * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
  * PSD: intra-patch spatial distortion (X, Y)
  */
 #ifdef LUMA_raw
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-// Scaling factor (should match WIDTH/HEIGHT)
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
 #ifdef LUMA_raw
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #else
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #endif
 
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
+// Scaling factor (should match WIDTH/HEIGHT)
 #ifdef LUMA_raw
-#define M 0
+#define SF 1
 #else
-#define M 0
+#define SF 1
 #endif
 
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
+/* Visualization
  *
  * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define DV 0
+#define V 0
 #else
-#define DV 0
+#define V 0
 #endif
 
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
 #ifdef LUMA_raw
 #define BF 1.0
 #else
@@ -657,17 +680,57 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight (for LGC)
+// Duplicate 1st weight (for luma-guided-chroma)
 #ifdef LUMA_raw
 #define D1W 0
 #else
 #define D1W 0
 #endif
 
-/* Shader code */
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
 
 #define EPSILON 0.00000000001
 #define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define gaussian(x) exp(-1 * POW2(x))
+#define lanczos(x) POW2(sinc(x))
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
 
 #if PS == 6
 const int hp = P/2;
@@ -682,39 +745,96 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #endif
 
 // donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
+// much faster than a continue statement
 #define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
 
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
 
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
 #define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
 #define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
 #define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
 #define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
 #define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
 
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+//
+// Z    ..X..
+//
 #define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
 
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+// XXX implement S_PLUS w/ an X overlaid:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
 
 #define T1 (T+1)
 #define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
 
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
 // Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
+#if RF_
 #define RINCR(z,c) (z.c++)
 #else
 #define RINCR DINCR
 #endif
 
-#define R_AREA(a) (a * T1 + RF-1)
+#define R_AREA(a) (a * T1 + RF_-1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
@@ -803,44 +923,44 @@ const int p_area = P_AREA(P*P);
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
 #define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
 #define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
 #define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define load2_(off) load_(off)
 #define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
 #define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
 #endif
 
 #if T
-vec4 load(vec3 off)
+val load(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load_(off);
+	switch (min(int(off.z), frame)) {
+	case 0: return val_swizz(load_(off));
+
 	}
 }
-vec4 load2(vec3 off)
+val load2(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
 }
 #else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
 #endif
 
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -867,22 +987,52 @@ vec2 ref(vec2 p, int d)
 #define ref(p, d) (p)
 #endif
 
-vec4 patch_comparison(vec3 r, vec3 r2)
+#if SST && R >= SST
+float spatial_r(vec3 v)
+{
+	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
+	return SK(length(v*SD)*SS);
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq)
+{
+	const float h = S*0.013;
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2)
 {
 	vec3 p;
-	vec4 min_rot = vec4(p_area);
+	val min_rot = val(p_area);
 
 	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
+		val pdiff_sq = val(0);
 		FOR_PATCH(p) {
 			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
 			diff_sq *= diff_sq;
-#if PST && P >= PST
-			float pdist = length(p.xy*PSD)*PSS;
-			pdist = exp(-(pdist*pdist));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
 			pdiff_sq += diff_sq;
 		}
 		min_rot = min(min_rot, pdiff_sq);
@@ -894,14 +1044,15 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 #define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
 // XXX extend to support 3x3 square
+// XXX support PSS
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
 	vec4 transformer = gather_offs(r, offsets_sf);
@@ -925,13 +1076,12 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	}
 	float center_diff_sq = poi2.x - load2(r).x;
 	center_diff_sq *= center_diff_sq;
-	return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale;
+	return (min_rot + center_diff_sq) * p_scale;
 }
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
 // XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
 	float min_rot = p_area;
@@ -940,40 +1090,17 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	 * w z
 	 * x y
 	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer);
-#if PST && P >= PST
-			// XXX refactor to avoid pow (should probably break off into a function)
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1));
 	}
+	min_rot = min(min_rot, pdiff_sq);
 
-	return vec4(min_rot, 0, 0, 0) * p_scale;
+	return min_rot * p_scale;
 }
 #else
 #define patch_comparison_gather patch_comparison
@@ -981,9 +1108,9 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 
 vec4 hook()
 {
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
+	val total_weight = val(0);
+	val sum = val(0);
+	val result = val(0);
 
 	vec3 r = vec3(0);
 	vec3 p = vec3(0);
@@ -997,41 +1124,38 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
+#if WD == 2 // weight discard
 	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
 #elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
 	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
-		me += me_tmp;
+		me += me_tmp * MEF;
 		me_tmp = vec3(0);
 		maxweight = 0;
 	}
 #elif T && ME == 2 // temporal & motion estimation weighted average
 	if (r.z > 0) {
-		me += round(me_sum / me_weight);
+		me += round(me_sum / me_weight * MEF);
 		me_sum = vec3(0);
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val weight = range(pdiff_sq);
+#endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
 		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
@@ -1042,18 +1166,18 @@ vec4 hook()
 #endif
 
 #if D1W
-		weight = vec4(weight.x);
+		weight = val(weight.x);
 #endif
 
-		weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS)); // spatial kernel
+		weight *= spatial_r(r);
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
 		r_index++;
 #elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
 		discard_sum += load(r+me) * weight * (1 - keeps);
 		discard_total_weight += weight * (1 - keeps);
 		no_weights += keeps;
@@ -1061,45 +1185,25 @@ vec4 hook()
 
 		sum += load(r+me) * weight;
 		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
-	// XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
+	val avg_weight = total_weight * r_scale;
+	val old_avg_weight = avg_weight;
 
 #if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w);
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
 		no_weights += keeps;
 	}
 #elif WD == 1 // moving cumulative average
@@ -1110,29 +1214,23 @@ vec4 hook()
 	avg_weight = total_weight / no_weights;
 #endif
 
-	total_weight += SW;
-	sum += poi * SW;
+	total_weight += SW * spatial_r(vec3(0));
+	sum += poi * SW * spatial_r(vec3(0));
 
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
 
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
+	// store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
 #if ASW == 0 // pre-WD weights
@@ -1142,22 +1240,20 @@ vec4 hook()
 #endif
 
 #if ASK == 0
-	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+	val sharpening_strength = pow(AS_weight, val(ASP));
 #elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                               AS_weight, ASC);
-	// just in case ASC < 0 (will sharpen but it's janky XXX)
-	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
 #elif ASK == 2
-	vec4 sharpening_strength = vec4(ASP);
+	val sharpening_strength = val(ASP);
 #endif
 
-	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
+	val sharpened = result + (poi - result) * ASF;
 #elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
+	val sharpened = poi + (poi - result) * ASF;
 #endif
 
 #if EP // extremes preserve
@@ -1173,20 +1269,20 @@ vec4 hook()
 	result = mix(sharpened, poi, sharpening_strength);
 #endif
 
-#if M == 4 // edge map
+#if V == 4 // edge map
 	result = sharpening_strength;
 #endif
 
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	result = vec4(0.5);
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
 #endif
 
-#if DV == 1
-	result = clamp(abs(poi - result) * S, 0.0, 1.0);
-#elif DV == 2
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
 #endif
 
-	return mix(poi, result, BF);
+	return unval(mix(poi, result, BF));
 }
 
diff --git a/portable_config/shaders/nlmeans_2x.glsl b/portable_config/shaders/nlmeans_2x.glsl
new file mode 100644
index 00000000..737f245e
--- /dev/null
+++ b/portable_config/shaders/nlmeans_2x.glsl
@@ -0,0 +1,1247 @@
+/* vi: ft=c
+ *
+ * Based on vf_nlmeans.c from FFmpeg.
+ *
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
+ *
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans_2x.glsl: Experimental upscaler
+
+/* The recommended usage of this shader and its variant profiles is to add them 
+ * to input.conf and then dispatch the appropriate shader via a keybind during 
+ * media playback. Here is an example input.conf entry:
+ *
+ * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
+ *
+ * These shaders can also be enabled by default in mpv.conf, for example:
+ *
+ * glsl-shaders='~~/shaders/nlmeans.glsl'
+ *
+ * Both of the examples above assume the shaders are located in a subdirectory 
+ * named "shaders" within mpv's config directory. Refer to the mpv 
+ * documentation for more details.
+ *
+ * This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences. Be mindful that certain 
+ * settings may greatly affect speed.
+ *
+ * Denoising is most useful for noisy content. If there is no perceptible 
+ * noise, you probably won't see a positive difference.
+ *
+ * The default settings are generally tuned for low noise and high detail 
+ * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
+ * of noise.
+ *
+ * The denoiser will not work properly if the content has been upscaled 
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
+ *
+ * vf toggle scale=-2:720
+ *
+ * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
+ * command to undo the downscale. It may take some trial-and-error to find the 
+ * proper resolution.
+ */
+
+/* Regarding speed
+ *
+ * Speed may vary wildly for different vo and gpu-api settings. Generally 
+ * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
+ * may be different for your system.
+ *
+ * If your GPU doesn't support textureGather, or if you are on a version of mpv 
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile.
+ *
+ * If you plan on tinkering with NLM's settings, read below:
+ *
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ *   - Default, very fast, rotations and reflections should be free
+ *   - If this is unusually slow then try changing gpu-api and vo
+ *   - If it's still slow, try setting RI/RFI to 0.
+ *
+ * - PS=6:RI={0,1,3}:RFI={0,1,2}
+ *   - Currently the only scalable variant
+ *   - Patch shape is asymmetric on two axes
+ *   - Rotations should have very little speed impact
+ *   - Reflections may have a significant speed impact
+ *
+ * Options which always disable textureGather:
+ * 	- PD
+ * 	- NG
+ */
+
+// The following is shader code injected from guided.glsl
+/* vi: ft=c
+ *
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: guided.glsl: Guided by the downscaled image
+
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
+ * Higher numbers give a bigger radius.
+ *
+ * The E variable can be found in the A stage.
+ *
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
+ * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
+ */
+
+//!HOOK LUMA
+//!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!DESC Guided filter (PREI)
+//!SAVE _INJ_PREI
+
+vec4 hook() // PREI: unmodified HOOKED sample; the WIDTH/HEIGHT directives of this stage (HOOKED / 1.25) do the downscaling
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!BIND _INJ_PREI
+//!WIDTH HOOKED.w
+//!HEIGHT HOOKED.h
+//!DESC Guided filter (I)
+//!SAVE _INJ_I
+
+vec4 hook() // I: _INJ_PREI sampled at full HOOKED size (WIDTH/HEIGHT of this stage); saved as _INJ_I
+{
+return _INJ_PREI_texOff(0);
+}
+
+
+//!HOOK LUMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
+
+vec4 hook() // P: HOOKED sampled at the size of _INJ_I; saved as _INJ_P
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
+
+vec4 hook() // MEANI: _INJ_I sampled into a target 1.5x smaller per axis; NOTE(review): presumably the scaler's averaging stands in for the local mean -- confirm
+{
+return _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
+
+vec4 hook() // MEANP: _INJ_P sampled at the size of _INJ_MEANI
+{
+return _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
+
+vec4 hook() // I_SQ: per-pixel square of _INJ_I, at the size of _INJ_I
+{
+return _INJ_I_texOff(0) * _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
+
+vec4 hook() // IXP: per-pixel product of _INJ_I and _INJ_P, at the size of _INJ_I
+{
+return _INJ_I_texOff(0) * _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
+
+vec4 hook() // CORRI: _INJ_I_SQ sampled at the size of _INJ_MEANI
+{
+return _INJ_I_SQ_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook() // CORRP: _INJ_IXP sampled at the size of _INJ_MEANI
+{
+return _INJ_IXP_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (A)
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_A
+
+#define E 0.0013 // added to the variance in the division below, keeping the denominator away from zero
+
+vec4 hook() // A: per-pixel coefficient a = cov / (var + E), output at the size of _INJ_I
+{
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); // CORRI - MEANI^2
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); // CORRP - MEANI * MEANP
+	 return cov / (var + E); 
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (B)
+//!BIND _INJ_A
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_B
+
+vec4 hook() // B: per-pixel offset b = MEANP - a * MEANI, output at the size of _INJ_I
+{
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (MEANA)
+//!BIND _INJ_A
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANA
+
+vec4 hook() // MEANA: _INJ_A sampled at the size of _INJ_MEANI
+{
+return _INJ_A_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (MEANB)
+//!BIND _INJ_B
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANB
+
+vec4 hook() // MEANB: _INJ_B sampled at the size of _INJ_MEANI
+{
+return _INJ_B_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter
+//!BIND HOOKED
+//!BIND _INJ_MEANA
+//!BIND _INJ_MEANB
+//!SAVE RF_LUMA
+
+vec4 hook() // final guided-filter stage: MEANA * HOOKED + MEANB, saved as RF_LUMA for the NLM pass that binds it
+{
+return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
+}
+
+// End of source code injected from guided.glsl 
+
+//!HOOK LUMA
+//!BIND HOOKED
+//!BIND RF_LUMA
+//!DESC Non-local means (nlmeans_2x.glsl)
+//!WIDTH HOOKED.w 2 *
+//!HEIGHT HOOKED.h 2 *
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 12.8125
+#else
+#define S 12.8125
+#endif
+
+/* Adaptive sharpening
+ *
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
+ *
+ * If you just want to increase/decrease sharpness then you want to change ASF.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 3.0
+#define ASP 1.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
+#else
+#define AS 0
+#define ASF 3.0
+#define ASP 1.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
+#endif
+
+/* Starting weight
+ *
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
+ *
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 0.14876
+#else
+#define SW 0.14876
+#endif
+
+/* Weight discard
+ *
+ * Discard weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, yielding a much more pleasant 
+ * result, especially around edges.
+ * 
+ * WD:
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
+ * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
+ * 	- 0: Disable
+ *
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ */
+#ifdef LUMA_raw
+#define WD 2
+#define WDT 0.63888239592
+#define WDP 6.0
+#else
+#define WD 2
+#define WDT 0.63888239592
+#define WDP 6.0
+#endif
+
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Patch & research sizes
+ *
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
+ *
+ * Research size should be an odd number greater than or equal to 3. Higher values are
+ * generally better, but slower, blurrier, and gives diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 3
+#define R 5
+#else
+#define P 3
+#define R 5
+#endif
+
+/* Patch and research shapes
+ *
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
+ *
+ * PS applies to patches, RS applies to research zones.
+ *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 3
+#else
+#define RS 3
+#define PS 3
+#endif
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader
+ */
+#define RF_LUMA 1
+#define RF 0
+
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections gives 
+ * diminishing returns. The most similar rotation/reflection will be used.
+ *
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
+ *
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
+ */
+#ifdef LUMA_raw
+#define RI 3
+#define RFI 2
+#else
+#define RI 0
+#define RFI 0
+#endif
+
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
+ * 	- Luma-only (this is a bug)
+ * 	- Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.5547703803256947
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.5547703803256947
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
+#ifdef LUMA_raw
+#define SK lanczos
+#define RK gaussian
+#define PSK gaussian
+#else
+#define SK lanczos
+#define RK gaussian
+#define PSK gaussian
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT). NOTE(review): this pass declares WIDTH/HEIGHT as HOOKED * 2 while SF is 1 -- confirm whether that is intended
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 0.00000000001 // tiny positive value to use where an exact zero would divide by zero
+#define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x)) // x is expanded twice: pass side-effect-free expressions only
+#define POW3(x) ((x)*(x)*(x)) // x is expanded three times
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) // (1/6) * ((x+2)^3 - 4(x+1)^3 + 6x^3 - 4 max(x-1,0)^3)
+#define gaussian(x) exp(-1 * POW2(x)) // exp(-x^2)
+#define lanczos(x) POW2(sinc(x)) // sinc(x)^2
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) // 0.75 - x^2 below 0.5, otherwise 0.5 * (x - 1.5)^2
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) // sin(pi x) / (pi x); yields 1 for any x below 1e-8, negative x included
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) // 3 (sin(pi x) - pi x cos(pi x)) / (pi x)^3; yields 1 for any x below 1e-8
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw) // LUMA_raw defined: work on a single component
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0) // widen a val to a vec4; unused components are 0 and alpha is fixed at 1.0
+#define val_packed val // storage type for val; a single float is stored as-is
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw) // CHROMA_raw defined: work on two components
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint // both components are stored in one uint
+#define val_pack(v) packUnorm2x16(v) // NOTE(review): unorm packing clamps to [0,1] and keeps 16 bits per component -- confirm packed values stay in that range
+#define val_unpack(v) unpackUnorm2x16(v)
+#else // neither macro defined: work on three components
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6 // half patch size; the even-square patch uses an integer half size
+const int hp = P/2;
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6 // half research size, chosen the same way as hp
+const int hr = R/2;
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) // step component c; if z is now exactly vec3(0), step once more to hop over it
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) // row widths 1,3,...,2*hz+1: hz*hz + (2*hz+1) samples before any donut skip
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) // as S_TRIANGLE, but the last row stops at x = 0: hz*hz + (hz+1) samples before any donut skip
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) // sample count: hz*hz for the rows above the last one, plus Z samples on the last row
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// XXX implement S_PLUS w/ an X overlaid:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+
+#define T1 (T+1) // number of iterations of FOR_FRAME
+#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) // r.z is the frame index, 0 first
+
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
+// Skip comparing the pixel-of-interest against itself, unless RF is enabled
+#if RF_
+#define RINCR(z,c) (z.c++)
+#else
+#define RINCR DINCR
+#endif
+
+#define R_AREA(a) (a * T1 + RF_-1) // a samples in each of T1 frames, minus the one (0,0,0) sample RINCR skips when RF_ is 0; a is not parenthesized, so pass a product or a bracketed expression
+
+// research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1 // select the research loop (FOR_RESEARCH) and its matching sample count (r_area) from RS
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1);
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
+const int r_area = R_AREA(S_PLUS_A(hr,R));
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R*R);
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr+1)); // rows above the last add up to hr*hr; the halved last row runs x = -hr..0, i.e. hr+1 samples (was hr, one short)
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R));
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R);
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
+const int r_area = R_AREA(R);
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R*R);
+#endif
+
+#define RI1 (RI+1) // number of rotation steps, including the unrotated one
+#define RFI1 (RFI+1) // number of reflection selectors, including selector 0
+
+#if RI
+#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) // ri is an angle in degrees, stepped by 360/RI1 starting at 0
+#else
+#define FOR_ROTATION // expands to nothing: the body runs once and ri is never declared
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) // rfi = 0..RFI
+#else
+#define FOR_REFLECTION // expands to nothing: the body runs once and rfi is never declared
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c) (z.c++)
+#endif
+
+#define P_AREA(a) (a - PD) // patch sample count, reduced by PD (assumes PD is 0 or 1 -- TODO confirm)
+
+// patch shapes: PS picks the FOR_PATCH loop macro and the matching p_area sample count; P of 0 or 1 always uses the 1x1 loop
+#if P == 0 || P == 1
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1);
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
+const int p_area = P_AREA(S_PLUS_A(hp,P));
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P*P);
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P));
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P);
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
+const int p_area = P_AREA(P);
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P*P);
+#endif
+
+const float r_scale = 1.0/r_area; // reciprocal of the research sample count. NOTE(review): R_AREA(1) is 0 when T and RF_ are both 0 -- confirm that combination is never built
+const float p_scale = 1.0/p_area; // reciprocal of the patch sample count. NOTE(review): P_AREA(1) is 0 when PD is 1 -- confirm that combination is never built
+
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) // moves pos to its texel centre, then steps vec2(off) texels (assumes pt == 1/size -- TODO confirm)
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) // read the hooked texture at texel offset off
+
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
+#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
+#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) // this branch defines neither gather nor gather_offs
+#else
+#define load2_(off) load_(off) // no guide texture: load2_ reads the hooked texture as well
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off) // read the value at offset off.xy; the switch selector is min(int(off.z), frame)
+{
+	switch (min(int(off.z), frame)) {
+	case 0: return val_swizz(load_(off));
+	// NOTE(review): only selector 0 returns a value; any other selector leaves the switch without a return -- presumably further cases are generated here, confirm
+	}
+}
+val load2(vec3 off) // off.z == 0 reads through load2_; any other off.z defers to load()
+{
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
+
+#if RI // rotation
+vec2 rot(vec2 p, float d) // rotate p about the origin by d degrees; returns the rotated offset
+{
+	return vec2(
+		p.x * cos(radians(d)) - p.y * sin(radians(d)),
+		p.x * sin(radians(d)) + p.y * cos(radians(d)) // y' = x*sin + y*cos, so d = 0 returns p unchanged
+	);
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d) // mirror offset p according to selector d: 1 negates y, 2 negates x, anything else returns p
+{
+	switch (d) {
+	case 1: return p * vec2(1, -1);
+	case 2: return p * vec2(-1, 1);
+	default: return p; // covers selector 0 and guarantees a return value for every d
+	}
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v) // spatial weight of research offset v: kernel SK applied to length(v*SD)*SS
+{
+	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); // add the distance from HOOKED_pos to the centre of its texel
+	return SK(length(v*SD)*SS);
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS) // intra-patch counterpart: kernel PSK applied to length(v*PSD)*PSS
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq) // turn a squared patch difference into a weight by applying kernel RK to each component
+{
+	const float h = S*0.013;
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale); // despite the name, from here on this holds sqrt(pdiff_sq)/h
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2) // summed squared difference between the patch at r2 and the patch at r, minimised over rotations/reflections, times p_scale
+{
+	vec3 p = vec3(0); // zero-init: shape loops such as S_PLUS assign only p.x/p.y, yet p.z is read below
+	val min_rot = val(p_area);
+
+	FOR_ROTATION FOR_REFLECTION {
+		val pdiff_sq = val(0);
+		FOR_PATCH(p) {
+			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); // only the patch at r is rotated/reflected
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			diff_sq *= diff_sq;
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
+			pdiff_sq += diff_sq;
+		}
+		min_rot = min(min_rot, pdiff_sq); // keep the best-matching orientation
+	}
+
+	return min_rot * p_scale;
+}
+
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
+
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
+// XXX support PSS
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; // gathered components x,y,z,w come from these offsets in this order
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
+vec4 poi_patch = gather_offs(0, offsets); // the four neighbours around the pixel-of-interest
+float patch_comparison_gather(vec3 r, vec3 r2) // gather-based patch difference for the patch at r; r2 is accepted but unused
+{
+	float min_rot = p_area - 1;
+	vec4 transformer = gather_offs(r, offsets_sf);
+	FOR_ROTATION {
+		FOR_REFLECTION {
+			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
+			min_rot = min(diff_sq, min_rot);
+#if RFI
+			switch(rfi) {
+			case 0: transformer = transformer.zyxw; break; // mirror: swap x/z
+			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
+			case 2: transformer = transformer.xwzy; break; // undoes last mirror (the y/w swap), restoring the pre-reflection order
+			}
+#endif
+		}
+#if RI == 3
+		transformer = transformer.wxyz;
+#elif RI == 1
+		transformer = transformer.zwxy;
+#endif
+	}
+	float center_diff_sq = poi2.x - load2(r).x; // the centre texel is not part of the gather, so it is added separately
+	center_diff_sq *= center_diff_sq;
+	return (min_rot + center_diff_sq) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2) // gather-based patch difference between the patches at r and r2, walked in 2x2 tiles
+{
+	vec2 tile;
+	float min_rot = p_area;
+
+	/* gather order:
+	 * w z
+	 * x y
+	 */
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1)); // sum the four per-texel terms of this tile
+	}
+	min_rot = min(min_rot, pdiff_sq); // caps the result at the initial p_area
+
+	return min_rot * p_scale;
+}
+#else
+#define patch_comparison_gather patch_comparison
+#endif
+
+vec4 hook() // shader entry point: weighted average over the research area, then optional discard/sharpen/visualisation steps
+{
+	val total_weight = val(0); // running sum of weights
+	val sum = val(0); // running sum of weight * pixel
+	val result = val(0);
+
+	vec3 r = vec3(0); // research offset (xy) and frame index (z)
+	vec3 p = vec3(0); // NOTE(review): appears unused in the visible body
+	vec3 me = vec3(0); // accumulated motion-estimation offset, added to r for every lookup
+
+#if T && ME == 1 // temporal & motion estimation
+	vec3 me_tmp = vec3(0);
+	float maxweight = 0;
+#elif T && ME == 2 // temporal & motion estimation
+	vec3 me_sum = vec3(0);
+	float me_weight = 0;
+#endif
+
+#if WD == 2 // weight discard
+	int r_index = 0;
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
+#elif WD == 1 // weight discard
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
+#endif
+
+	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	if (r.z > 0) {
+		me += me_tmp * MEF;
+		me_tmp = vec3(0);
+		maxweight = 0;
+	}
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	if (r.z > 0) {
+		me += round(me_sum / me_weight * MEF);
+		me_sum = vec3(0);
+		me_weight = 0;
+	}
+#endif
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); // frame 0 may take the gather path; other frames always use patch_comparison
+		val weight = range(pdiff_sq);
+#endif
+
+#if T && ME == 1 // temporal & motion estimation max weight
+		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); // remember the offset with the highest weight so far
+		maxweight = max(maxweight, weight.x);
+#elif T && ME == 2 // temporal & motion estimation weighted average
+		me_sum += vec3(r.xy,0) * weight.x;
+		me_weight += weight.x;
+#endif
+
+#if D1W
+		weight = val(weight.x); // reuse the first component's weight for every component
+#endif
+
+		weight *= spatial_r(r);
+
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
+		r_index++;
+#elif WD == 1 // weight discard
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		discard_sum += load(r+me) * weight * (1 - keeps);
+		discard_total_weight += weight * (1 - keeps);
+		no_weights += keeps;
+#endif
+
+		sum += load(r+me) * weight;
+		total_weight += weight;
+	} // FOR_RESEARCH
+	} // FOR_FRAME
+
+	val avg_weight = total_weight * r_scale; // mean weight over r_area samples
+	val old_avg_weight = avg_weight; // kept so AS_weight can use the pre-discard average
+
+#if WD == 2 // true average
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
+
+	for (int i = 0; i < r_area; i++) {
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w); // 1 where w >= avg_weight*WDT, else 0
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
+		no_weights += keeps;
+	}
+#elif WD == 1 // moving cumulative average
+	total_weight -= discard_total_weight;
+	sum -= discard_sum;
+#endif
+#if WD // weight discard
+	avg_weight = total_weight / no_weights;
+#endif
+
+	total_weight += SW * spatial_r(vec3(0)); // add the pixel-of-interest itself with starting weight SW
+	sum += poi * SW * spatial_r(vec3(0));
+
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
+
+	// store frames for temporal
+#if T > 1
+	// NOTE(review): this branch is empty -- only PREV1 is written below; confirm no further history buffers are expected for T > 1
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
+#endif
+
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	val sharpening_strength = pow(AS_weight, val(ASP));
+#elif ASK == 1
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
+#elif ASK == 2
+	val sharpening_strength = val(ASP);
+#endif
+
+#if AS == 1 // sharpen+denoise
+	val sharpened = result + (poi - result) * ASF;
+#elif AS == 2 // sharpen only
+	val sharpened = poi + (poi - result) * ASF;
+#endif
+
+#if EP // extremes preserve
+	float luminance = EP_texOff(0).x;
+	// EPSILON is needed since pow(0,0) is undefined
+	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
+	result = mix(poi, result, ep_weight); // ep_weight 0 keeps the original pixel, 1 keeps the result
+#endif
+
+#if AS == 1 // sharpen+denoise
+	result = mix(sharpened, result, sharpening_strength);
+#elif AS == 2 // sharpen only
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if V == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
+#endif
+
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
+	result = (poi - result) * 0.5 + 0.5;
+#endif
+
+	return unval(mix(poi, result, BF)); // BF 0 returns the input pixel, BF 1 returns result
+}
+
diff --git a/portable_config/shaders/nlmeans_hq.glsl b/portable_config/shaders/nlmeans_hq.glsl
deleted file mode 100644
index e030354d..00000000
--- a/portable_config/shaders/nlmeans_hq.glsl
+++ /dev/null
@@ -1,2161 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Profile description: Slow, but higher quality.
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
- *
- * textureGather is LUMA only and limited to the following configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- PD
- */
-
-// The following is shader code injected from nlmeans.glsl
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Profile description: Default profile, general purpose, tuned for low noise
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl";  show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
- *
- * textureGather is LUMA only and limited to the following configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	 - PD
- */
-
-// The following is shader code injected from guided.glsl
-/* vi: ft=c
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;   without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-//desc: Guided filter guided by the downscaled image
-
-/* The radius can be adjusted with the MEANI stage's downscaling factor. 
- * Higher numbers give a bigger radius.
- *
- * The E variable can be found in the A stage.
- *
- * The subsampling (fast guided filter) can be adjusted with the I stage's 
- * downscaling factor. Higher numbers are faster.
- *
- * The guide's subsampling can be adjusted with the PREI stage's downscaling 
- * factor. Higher numbers downscale more.
- */
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (PREI)
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!SAVE _INJ__INJ_PREI
-
-vec4 hook()
-{
-	  return HOOKED_texOff(0);  
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (I)
-//!BIND _INJ__INJ_PREI
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ__INJ_I
-
-vec4 hook()
-{
-return _INJ__INJ_PREI_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (P)
-//!BIND HOOKED
-//!WIDTH _INJ__INJ_I.w
-//!HEIGHT _INJ__INJ_I.h
-//!SAVE _INJ__INJ_P
-
-vec4 hook()
-{
-	  return HOOKED_texOff(0);  
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANI)
-//!BIND _INJ__INJ_I
-//!WIDTH _INJ__INJ_I.w 1.5 /
-//!HEIGHT _INJ__INJ_I.h 1.5 /
-//!SAVE _INJ__INJ_MEANI
-
-vec4 hook()
-{
-return _INJ__INJ_I_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANP)
-//!BIND _INJ__INJ_P
-//!WIDTH _INJ__INJ_MEANI.w
-//!HEIGHT _INJ__INJ_MEANI.h
-//!SAVE _INJ__INJ_MEANP
-
-vec4 hook()
-{
-return _INJ__INJ_P_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ__INJ_I_SQ)
-//!BIND _INJ__INJ_I
-//!WIDTH _INJ__INJ_I.w
-//!HEIGHT _INJ__INJ_I.h
-//!SAVE _INJ__INJ_I_SQ
-
-vec4 hook()
-{
-return _INJ__INJ_I_texOff(0) * _INJ__INJ_I_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ__INJ_IXP)
-//!BIND _INJ__INJ_I
-//!BIND _INJ__INJ_P
-//!WIDTH _INJ__INJ_I.w
-//!HEIGHT _INJ__INJ_I.h
-//!SAVE _INJ__INJ_IXP
-
-vec4 hook()
-{
-return _INJ__INJ_I_texOff(0) * _INJ__INJ_P_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRI)
-//!BIND _INJ__INJ_I_SQ
-//!WIDTH _INJ__INJ_MEANI.w
-//!HEIGHT _INJ__INJ_MEANI.h
-//!SAVE _INJ__INJ_CORRI
-
-vec4 hook()
-{
-return _INJ__INJ_I_SQ_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRP)
-//!BIND _INJ__INJ_IXP
-//!WIDTH _INJ__INJ_MEANI.w
-//!HEIGHT _INJ__INJ_MEANI.h
-//!SAVE _INJ__INJ_CORRP
-
-vec4 hook()
-{
-return _INJ__INJ_IXP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (A)
-//!BIND _INJ__INJ_MEANI
-//!BIND _INJ__INJ_MEANP
-//!BIND _INJ__INJ_CORRI
-//!BIND _INJ__INJ_CORRP
-//!WIDTH _INJ__INJ_I.w
-//!HEIGHT _INJ__INJ_I.h
-//!SAVE _INJ__INJ_A
-
-#define E 0.0013
-
-vec4 hook()
-{
-vec4 var = _INJ__INJ_CORRI_texOff(0) - _INJ__INJ_MEANI_texOff(0) * _INJ__INJ_MEANI_texOff(0);
-vec4 cov = _INJ__INJ_CORRP_texOff(0) - _INJ__INJ_MEANI_texOff(0) * _INJ__INJ_MEANP_texOff(0);
-	  return cov / (var + E);  
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (B)
-//!BIND _INJ__INJ_A
-//!BIND _INJ__INJ_MEANI
-//!BIND _INJ__INJ_MEANP
-//!WIDTH _INJ__INJ_I.w
-//!HEIGHT _INJ__INJ_I.h
-//!SAVE _INJ__INJ_B
-
-vec4 hook()
-{
-return _INJ__INJ_MEANP_texOff(0) - _INJ__INJ_A_texOff(0) * _INJ__INJ_MEANI_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANA)
-//!BIND _INJ__INJ_A
-//!WIDTH _INJ__INJ_MEANI.w
-//!HEIGHT _INJ__INJ_MEANI.h
-//!SAVE _INJ__INJ_MEANA
-
-vec4 hook()
-{
-return _INJ__INJ_A_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANB)
-//!BIND _INJ__INJ_B
-//!WIDTH _INJ__INJ_MEANI.w
-//!HEIGHT _INJ__INJ_MEANI.h
-//!SAVE _INJ__INJ_MEANB
-
-vec4 hook()
-{
-return _INJ__INJ_B_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ__INJ_MEANA
-//!BIND _INJ__INJ_MEANB
-//!SAVE _INJ_RF_LUMA
-
-vec4 hook()
-{
-return _INJ__INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ__INJ_MEANB_texOff(0);
-}
-
-// End of source code injected from guided.glsl
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE _INJ_EP
-
-vec4 hook()
-{
-	 return LUMA_texOff(0); 
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Non-local means (share)
-//!BIND _INJ_RF_LUMA
-//!SAVE _INJ_RF
-
-vec4 hook()
-{
-return _INJ_RF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!BIND _INJ_RF_LUMA
-//!BIND _INJ_EP
-//!BIND _INJ_RF
-//!DESC Non-local means (nlmeans.glsl)
-//!SAVE RF_LUMA
-
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
-
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
-#ifdef LUMA_raw
-#define S 2.0
-#define P 3
-#define R 5
-#else
-#define S 5.0
-#define P 3
-#define R 5
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
- *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
- * ASW:
- * 	 - 0 to use pre-WD weights
- * 	 - 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	 - 0 for power. This is the old method.
- * 	 - 1 for sigmoid. This is generally recommended.
- * 	 - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 2.0
-#define ASP 4.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#else
-#define AS 0
-#define ASF 2.0
-#define ASP 4.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#endif
-
-/* Starting weight
- *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
- */
-#ifdef LUMA_raw
-#define SW 1.0
-#else
-#define SW 0.5
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	 - 2: True average. Very good quality, but slower and uses more memory.
- * 	 - 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	 - 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 2
-#define WDT 0.5
-#define WDP 6.0
-#else
-#define WD 2
-#define WDT 0.75
-#define WDP 6.0
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
- *
- * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
- *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
- */
-#ifdef LUMA_raw
-#define RF 1
-#else
-#define RF 1
-#endif
-
-/* Search shape
- *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 3
-#define RFI 2
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * Caveats:
- * 	 - Slower, each frame needs to be researched
- * 	 - Requires vo=gpu-next and nlmeans_temporal.glsl
- * 	 - Luma-only (this is a bug)
- * 	 - Buggy
- *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#else
-#define T 0
-#define ME 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
- *
- * SS: spatial denoising factor
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SS 0.25
-#define SD vec3(1,1,1.5)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SS 0.25
-#define SD vec3(1,1,1.5)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
-#ifdef LUMA_raw
-#define M 0
-#else
-#define M 0
-#endif
-
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
- *
- * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
- */
-#ifdef LUMA_raw
-#define DV 0
-#else
-#define DV 0
-#endif
-
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-// Force disable textureGather
-#ifdef LUMA_raw
-#define NG 0
-#else
-#define NG 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight (for LGC)
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-/* Shader code */
-
-#define EPSILON 0.00000000001
-#define M_PI 3.14159265358979323846
-
-#if PS == 6
-const int hp = P/2; 
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2));  // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2; 
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2));  // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0);  z.x <= 0;  z.x++)
-
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz);  incr)
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz)*int(z.y!=0);  incr)
-#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
-
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(abs(z.x) - hz);  z.y <= abs(abs(z.x) - hz);  incr)
-#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
-
-#define S_VERTICAL(z,hz,incr) for (z.x = 0;  z.x <= 0;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz;  z.x <= hz;  incr) for (z.y = 0;  z.y <= 0;  z.y++)
-
-#define S_PLUS(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz * int(z.x == 0);  z.y <= hz * int(z.x == 0);  incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-#define S_SQUARE(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz;  z.x < hz;  z.x++) for (z.y = -hz;  z.y < hz;  incr)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0;  r.z < T1;  r.z++)
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF-1)
-
-// research shapes
-// XXX would be nice to have the option of temporally-varying research sizes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1); 
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R)); 
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R); 
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); 
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); 
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R)); 
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R); 
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R); 
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R); 
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0;  ri < 360;  ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0;  rfi < RFI1;  rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1); 
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P)); 
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P); 
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); 
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); 
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P)); 
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P); 
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P); 
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P); 
-#endif
-
-const float r_scale = 1.0/r_area; 
-const float p_scale = 1.0/p_area; 
-
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-
-#if RF && defined(LUMA_raw)
-#define load2_(off) _INJ_RF_LUMA_tex(_INJ_RF_LUMA_pos + _INJ_RF_LUMA_pt * vec2(off))
-#define gather_offs(off, off_arr) (_INJ_RF_LUMA_mul * vec4(textureGatherOffsets(_INJ_RF_LUMA_raw, _INJ_RF_LUMA_pos + vec2(off) * _INJ_RF_LUMA_pt, off_arr)))
-#define gather(off) _INJ_RF_LUMA_gather(_INJ_RF_LUMA_pos + (off) * _INJ_RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) _INJ_RF_tex(_INJ_RF_pos + _INJ_RF_pt * vec2(off))
-#define gather_offs(off, off_arr) (_INJ_RF_mul * vec4(textureGatherOffsets(_INJ_RF_raw, _INJ_RF_pos + vec2(off) * _INJ_RF_pt, off_arr)))
-#define gather(off) _INJ_RF_gather(_INJ_RF_pos + (off) * _INJ_RF_pt, 0)
-#elif RF
-#define load2_(off) _INJ_RF_tex(_INJ_RF_pos + _INJ_RF_pt * vec2(off))
-#else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-vec4 load(vec3 off)
-{
-	 switch (int(off.z)) {
-	 case 0: return load_(off); 
-	 }
-}
-vec4 load2(vec3 off)
-{
-	 switch (int(off.z)) {
-	 case 0: return load2_(off); 
-	 }
-}
-#else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
-#endif
-
-vec4 poi = load(vec3(0));  // pixel-of-interest
-vec4 poi2 = load2(vec3(0));  // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	 return vec2(
-	 	 p.x * cos(radians(d)) - p.y * sin(radians(d)),
-	 	 p.y * sin(radians(d)) + p.x * cos(radians(d))
-	 ); 
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	 switch (d) {
-	 case 0: return p; 
-	 case 1: return p * vec2(1, -1); 
-	 case 2: return p * vec2(-1, 1); 
-	 }
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-vec4 patch_comparison(vec3 r, vec3 r2)
-{
-	 vec3 p; 
-	 vec4 min_rot = vec4(p_area); 
-
-	 FOR_ROTATION FOR_REFLECTION {
-	 	 vec4 pdiff_sq = vec4(0); 
-	 	 FOR_PATCH(p) {
-	 	 	 vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); 
-	 	 	 vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); 
-	 	 	 diff_sq *= diff_sq; 
-#if PST && P >= PST
-	 	 	 float pdist = length(p.xy*PSD)*PSS; 
-	 	 	 pdist = exp(-(pdist*pdist)); 
-	 	 	 diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist)); 
-#endif
-	 	 	 pdiff_sq += diff_sq; 
-	 	 }
-	 	 min_rot = min(min_rot, pdiff_sq); 
-	 }
-
-	 return min_rot * p_scale; 
-}
-
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-// XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; 
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; 
-vec4 poi_patch = gather_offs(0, offsets); 
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	 float min_rot = p_area - 1; 
-	 vec4 transformer = gather_offs(r, offsets_sf); 
-	 FOR_ROTATION {
-	 	 FOR_REFLECTION {
-	 	 	 float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); 
-	 	 	 min_rot = min(diff_sq, min_rot); 
-#if RFI
-	 	 	 switch(rfi) {
-	 	 	 case 0: transformer = transformer.zyxw;  break; 
-	 	 	 case 1: transformer = transformer.zwxy;  break;  // undoes last mirror, performs another mirror
-	 	 	 case 2: transformer = transformer.zyxw;  break;  // undoes last mirror
-	 	 	 }
-#endif
-	 	 }
-#if RI == 3
-	 	 transformer = transformer.wxyz; 
-#elif RI == 1
-	 	 transformer = transformer.zwxy; 
-#endif
-	 }
-	 float center_diff_sq = poi2.x - load2(r).x; 
-	 center_diff_sq *= center_diff_sq; 
-	 return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale; 
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
-// tiled even square patch_comparison_gather
-// XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	 vec2 tile; 
-	 float min_rot = p_area; 
-
-	 /* gather order:
-	  * w z
-	  * x y
-	  */
-	 FOR_ROTATION FOR_REFLECTION {
-	 	 float pdiff_sq = 0; 
-	 	 for (tile.x = -hp;  tile.x < hp;  tile.x+=2) for (tile.y = -hp;  tile.y < hp;  tile.y+=2) {
-	 	 	 vec4 poi_patch = gather(tile + r2.xy); 
-	 	 	 vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy); 
-
-#if RI
-	 	 	 for (float i = 0;  i < ri;  i+=90)
-	 	 	 	 transformer = transformer.wxyz;  // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-	 	 	 switch(rfi) {
-	 	 	 case 1: transformer = transformer.zyxw;  break; 
-	 	 	 case 2: transformer = transformer.xwzy;  break; 
-	 	 	 }
-#endif
-
-	 	 	 vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer); 
-#if PST && P >= PST
-	 	 	 // XXX refactor to avoid pow (should probably break off into a function)
-	 	 	 vec4 pdist = vec4(
-	 	 	 	 exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-	 	 	 	 exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-	 	 	 	 exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-	 	 	 	 exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-	 	 	 ); 
-	 	 	 diff_sq = pow(max(diff_sq, EPSILON), pdist); 
-#endif
-	 	 	 pdiff_sq += dot(diff_sq, vec4(1)); 
-	 	 }
-	 	 min_rot = min(min_rot, pdiff_sq); 
-	 }
-
-	 return vec4(min_rot, 0, 0, 0) * p_scale; 
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	 vec4 total_weight = vec4(0); 
-	 vec4 sum = vec4(0); 
-	 vec4 result = vec4(0); 
-
-	 vec3 r = vec3(0); 
-	 vec3 p = vec3(0); 
-	 vec3 me = vec3(0); 
-
-#if T && ME == 1 // temporal & motion estimation
-	 vec3 me_tmp = vec3(0); 
-	 float maxweight = 0; 
-#elif T && ME == 2 // temporal & motion estimation
-	 vec3 me_sum = vec3(0); 
-	 float me_weight = 0; 
-#endif
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
-	 int r_index = 0; 
-	 vec4 all_weights[r_area]; 
-	 vec4 all_pixels[r_area]; 
-#elif WD == 1 // weight discard
-	 vec4 no_weights = vec4(0); 
-	 vec4 discard_total_weight = vec4(0); 
-	 vec4 discard_sum = vec4(0); 
-#endif
-
-#if M == 1 // Euclidean medians
-	 vec4 minsum = vec4(0); 
-#endif
-
-	 FOR_FRAME(r) {
-	 // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
-#if T && ME == 1 // temporal & motion estimation max weight
-	 if (r.z > 0) {
-	 	 me += me_tmp; 
-	 	 me_tmp = vec3(0); 
-	 	 maxweight = 0; 
-	 }
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	 if (r.z > 0) {
-	 	 me += round(me_sum / me_weight); 
-	 	 me_sum = vec3(0); 
-	 	 me_weight = 0; 
-	 }
-#endif
-	 FOR_RESEARCH(r) {
-	 	 // main NLM logic
-	 	 const float h = S*0.013; 
-	 	 const float pdiff_scale = 1.0/(h*h); 
-	 	 vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0)); 
-	 	 vec4 weight = exp(-pdiff_sq * pdiff_scale); 
-
-#if T && ME == 1 // temporal & motion estimation max weight
-	 	 me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); 
-	 	 maxweight = max(maxweight, weight.x); 
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	 	 me_sum += vec3(r.xy,0) * weight.x; 
-	 	 me_weight += weight.x; 
-#endif
-
-#if D1W
-	 	 weight = vec4(weight.x); 
-#endif
-
-	 	 weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS));  // spatial kernel
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-	 	 all_weights[r_index] = weight; 
-	 	 all_pixels[r_index] = load(r+me); 
-	 	 r_index++; 
-#elif WD == 1 // weight discard
-	 	 vec4 wd_scale = 1.0/max(no_weights, 1); 
-	 	 vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); 
-	 	 discard_sum += load(r+me) * weight * (1 - keeps); 
-	 	 discard_total_weight += weight * (1 - keeps); 
-	 	 no_weights += keeps; 
-#endif
-
-	 	 sum += load(r+me) * weight; 
-	 	 total_weight += weight; 
-
-#if M == 1 // Euclidean median
-	 	 // Based on: https://arxiv.org/abs/1207.3056
-	 	 // XXX might not work with ME
-	 	 vec3 r2; 
-	 	 vec4 wpdist_sum = vec4(0); 
-	 	 FOR_FRAME(r2) FOR_RESEARCH(r2) {
-	 	 	 vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me); 
-	 	 	 wpdist_sum += sqrt(pdist) * (1-weight); 
-	 	 }
-
-	 	 vec4 newmin = step(wpdist_sum, minsum);  // wpdist_sum <= minsum
-	 	 newmin *= 1 - step(wpdist_sum, vec4(0));  // && wpdist_sum > 0
-	 	 newmin += step(minsum, vec4(0));  // || minsum <= 0
-	 	 newmin = min(newmin, 1); 
-
-	 	 minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum); 
-	 	 result = (newmin * load(r+me)) + ((1-newmin) * result); 
-#endif
-	 } // FOR_RESEARCH
-	 } // FOR_FRAME
-
-	 // XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-#endif
-
-	 vec4 avg_weight = total_weight * r_scale; 
-	 vec4 old_avg_weight = avg_weight; 
-
-#if WD == 2 // true average
-	 total_weight = vec4(0); 
-	 sum = vec4(0); 
-	 vec4 no_weights = vec4(0); 
-
-	 for (int i = 0;  i < r_area;  i++) {
-	 	 vec4 keeps = step(avg_weight*WDT, all_weights[i]); 
-	 	 all_weights[i] *= keeps; 
-	 	 sum += all_pixels[i] * all_weights[i]; 
-	 	 total_weight += all_weights[i]; 
-	 	 no_weights += keeps; 
-	 }
-#elif WD == 1 // moving cumulative average
-	 total_weight -= discard_total_weight; 
-	 sum -= discard_sum; 
-#endif
-#if WD // weight discard
-	 avg_weight = total_weight / no_weights; 
-#endif
-
-	 total_weight += SW; 
-	 sum += poi * SW; 
-
-#if M == 3 // weighted median intensity
-	 const float hr_area = r_area/2.0; 
-	 vec4 is_median, gt, lt, gte, lte, neq; 
-
-	 for (int i = 0;  i < r_area;  i++) {
-	 	 gt = lt = vec4(0); 
-	 	 for (int j = 0;  j < r_area;  j++) {
-	 	 	 gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]); 
-	 	 	 lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]); 
-	 	 	 neq = 1 - gte * lte; 
-	 	 	 gt += gte * neq; 
-	 	 	 lt += lte * neq; 
-	 	 }
-	 	 is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area)); 
-	 	 result += step(result, vec4(0)) * is_median * all_pixels[i]; 
-	 }
-#elif M == 2 // weight map
-	 result = avg_weight; 
-#elif M == 0 // mean
-	 result = sum / total_weight; 
-#endif
-
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	 vec4 sharpening_strength = pow(AS_weight, vec4(ASP)); 
-#elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	 vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                                AS_weight, ASC); 
-	 // just in case ASC < 0 (will sharpen but it's janky XXX)
-	 sharpening_strength = clamp(sharpening_strength, 0.0, 1.0); 
-#elif ASK == 2
-	 vec4 sharpening_strength = vec4(ASP); 
-#endif
-
-	 // XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
-#if AS == 1 // sharpen+denoise
-	 vec4 sharpened = result + (poi - result) * ASF; 
-#elif AS == 2 // sharpen only
-	 vec4 sharpened = poi + (poi - result) * ASF; 
-#endif
-
-#if EP // extremes preserve
-float luminance = _INJ_EP_texOff(0).x;
-	 // EPSILON is needed since pow(0,0) is undefined
-	 float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); 
-	 result = mix(poi, result, ep_weight); 
-#endif
-
-#if AS == 1 // sharpen+denoise
-	 result = mix(sharpened, result, sharpening_strength); 
-#elif AS == 2 // sharpen only
-	 result = mix(sharpened, poi, sharpening_strength); 
-#endif
-
-#if M == 4 // edge map
-	 result = sharpening_strength; 
-#endif
-
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	 result = vec4(0.5); 
-#endif
-
-#if DV == 1
-	 result = clamp(abs(poi - result) * S, 0.0, 1.0); 
-#elif DV == 2
-	 result = (poi - result) * 0.5 + 0.5; 
-#endif
-
-	 return mix(poi, result, BF); 
-}
-
-// End of source code injected from nlmeans.glsl
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
-
-vec4 hook()
-{
-	return RF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!BIND RF_LUMA
-//!BIND EP
-//!BIND RF
-//!DESC Non-local means (nlmeans_hq.glsl)
-
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
-
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
-#ifdef LUMA_raw
-#define S 2.25
-#define P 4
-#define R 5
-#else
-#define S 5.0
-#define P 3
-#define R 5
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
- *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 2.0
-#define ASP 1
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#else
-#define AS 0
-#define ASF 2.0
-#define ASP 4.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#endif
-
-/* Starting weight
- *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
- */
-#ifdef LUMA_raw
-#define SW 1.0
-#else
-#define SW 0.5
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 2
-#define WDT 0.5
-#define WDP 6.0
-#else
-#define WD 2
-#define WDT 0.75
-#define WDP 6.0
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
- *
- * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
- *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
- */
-#ifdef LUMA_raw
-#define RF 1
-#else
-#define RF 1
-#endif
-
-/* Search shape
- *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 6
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 0
-#define RFI 0
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#else
-#define T 0
-#define ME 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
- *
- * SS: spatial denoising factor
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SS 0.25
-#define SD vec3(1,1,1.5)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SS 0.25
-#define SD vec3(1,1,1.5)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
-#ifdef LUMA_raw
-#define M 0
-#else
-#define M 0
-#endif
-
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
- *
- * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
- */
-#ifdef LUMA_raw
-#define DV 0
-#else
-#define DV 0
-#endif
-
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-// Force disable textureGather
-#ifdef LUMA_raw
-#define NG 0
-#else
-#define NG 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight (for LGC)
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-/* Shader code */
-
-#define EPSILON 0.00000000001
-#define M_PI 3.14159265358979323846
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
-
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
-
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF-1)
-
-// research shapes
-// XXX would be nice to have the option of temporally-varying research sizes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-vec4 load(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load_(off);
-	}
-}
-vec4 load2(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
-}
-#else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
-#endif
-
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-vec4 patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	vec4 min_rot = vec4(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
-			diff_sq *= diff_sq;
-#if PST && P >= PST
-			float pdist = length(p.xy*PSD)*PSS;
-			pdist = exp(-(pdist*pdist));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-// XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
-// tiled even square patch_comparison_gather
-// XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer);
-#if PST && P >= PST
-			// XXX refactor to avoid pow (should probably break off into a function)
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return vec4(min_rot, 0, 0, 0) * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
-	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
-#elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
-#endif
-
-	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = vec4(weight.x);
-#endif
-
-		weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS)); // spatial kernel
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
-		r_index++;
-#elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-	// XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
-
-	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW;
-	sum += poi * SW;
-
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
-
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
-#endif
-
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
-#elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                               AS_weight, ASC);
-	// just in case ASC < 0 (will sharpen but it's janky XXX)
-	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
-#elif ASK == 2
-	vec4 sharpening_strength = vec4(ASP);
-#endif
-
-	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
-#if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
-#elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if M == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	result = vec4(0.5);
-#endif
-
-#if DV == 1
-	result = clamp(abs(poi - result) * S, 0.0, 1.0);
-#elif DV == 2
-	result = (poi - result) * 0.5 + 0.5;
-#endif
-
-	return mix(poi, result, BF);
-}
-
diff --git a/portable_config/shaders/nlmeans_hqx.glsl b/portable_config/shaders/nlmeans_hqx.glsl
new file mode 100644
index 00000000..d9b0a96e
--- /dev/null
+++ b/portable_config/shaders/nlmeans_hqx.glsl
@@ -0,0 +1,1288 @@
+/* vi: ft=c
+ *
+ * Based on vf_nlmeans.c from FFmpeg.
+ *
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
+ *
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans_hqx.glsl: Very slow, should offer the best quality.
+
+/* The recommended usage of this shader and its variant profiles is to add them 
+ * to input.conf and then dispatch the appropriate shader via a keybind during 
+ * media playback. Here is an example input.conf entry:
+ *
+ * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
+ *
+ * These shaders can also be enabled by default in mpv.conf, for example:
+ *
+ * glsl-shaders='~~/shaders/nlmeans.glsl'
+ *
+ * Both of the examples above assume the shaders are located in a subdirectory 
+ * named "shaders" within mpv's config directory. Refer to the mpv 
+ * documentation for more details.
+ *
+ * This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences. Be mindful that certain 
+ * settings may greatly affect speed.
+ *
+ * Denoising is most useful for noisy content. If there is no perceptible 
+ * noise, you probably won't see a positive difference.
+ *
+ * The default settings are generally tuned for low noise and high detail 
+ * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
+ * of noise.
+ *
+ * The denoiser will not work properly if the content has been upscaled 
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
+ *
+ * vf toggle scale=-2:720
+ *
+ * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
+ * command to undo the downscale. It may take some trial-and-error to find the 
+ * proper resolution.
+ */
+
+/* Regarding speed
+ *
+ * Speed may vary wildly for different vo and gpu-api settings. Generally 
+ * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
+ * may be different for your system.
+ *
+ * If your GPU doesn't support textureGather, or if you are on a version of mpv 
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
+ *
+ * If you plan on tinkering with NLM's settings, read below:
+ *
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ *   - Default, very fast, rotations and reflections should be free
+ *   - If this is unusually slow then try changing gpu-api and vo
+ *   - If it's still slow, try setting RI/RFI to 0.
+ *
+ * - PS=6:RI={0,1,3}:RFI={0,1,2}
+ *   - Currently the only scalable variant
+ *   - Patch shape is asymmetric on two axes
+ *   - Rotations should have very little speed impact
+ *   - Reflections may have a significant speed impact
+ *
+ * Options which always disable textureGather:
+ * 	- PD
+ * 	- NG
+ */
+
+// The following is shader code injected from guided.glsl
+/* vi: ft=c
+ *
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: guided.glsl: Guided by the downscaled image
+
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
+ * Higher numbers give a bigger radius.
+ *
+ * The E variable can be found in the A stage.
+ *
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
+ * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
+ */
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!DESC Guided filter (PREI)
+//!SAVE _INJ_PREI
+
+vec4 hook() // PREI: the hooked plane rendered at 1/1.25 of its size (WIDTH/HEIGHT above); this is the guide's subsampling factor
+{
+	 return HOOKED_texOff(0); // plain fetch: the resize comes only from the pass output size
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND _INJ_PREI
+//!WIDTH HOOKED.w
+//!HEIGHT HOOKED.h
+//!DESC Guided filter (I)
+//!SAVE _INJ_I
+
+vec4 hook() // I: the PREI result rendered at the hooked plane's size (downscaling factor of 1 here)
+{
+return _INJ_PREI_texOff(0); // plain fetch of PREI
+}
+
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
+
+vec4 hook() // P: the hooked plane rendered at stage I's size
+{
+	 return HOOKED_texOff(0); // plain fetch of the hooked plane
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
+
+vec4 hook() // MEANI: I rendered at 1/1.5 of its size; this downscaling factor is what adjusts the filter radius
+{
+return _INJ_I_texOff(0); // plain fetch of I
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
+
+vec4 hook() // MEANP: P rendered at MEANI's size
+{
+return _INJ_P_texOff(0); // plain fetch of P
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
+
+vec4 hook() // I_SQ: per-pixel square of I, at I's size
+{
+return _INJ_I_texOff(0) * _INJ_I_texOff(0); // I * I (component-wise)
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
+
+vec4 hook() // IXP: per-pixel product of I and P, at I's size
+{
+return _INJ_I_texOff(0) * _INJ_P_texOff(0); // I * P (component-wise)
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
+
+vec4 hook() // CORRI: I*I (stage I_SQ) rendered at MEANI's size
+{
+return _INJ_I_SQ_texOff(0); // plain fetch of I_SQ
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook() // CORRP: I*P (stage IXP) rendered at MEANI's size
+{
+return _INJ_IXP_texOff(0); // plain fetch of IXP
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (A)
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_A
+
+#define E 0.0013 // user-tunable term added to the variance in the division below
+
+vec4 hook() // A: per-pixel coefficient a = cov / (var + E), rendered at I's size
+{
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); // CORRI - MEANI^2
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); // CORRP - MEANI * MEANP
+	 return cov / (var + E); // component-wise division
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (B)
+//!BIND _INJ_A
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_B
+
+vec4 hook() // B: per-pixel offset paired with coefficient A, rendered at I's size
+{
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0); // b = MEANP - a * MEANI
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (MEANA)
+//!BIND _INJ_A
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANA
+
+vec4 hook() // MEANA: coefficient A rendered at MEANI's size
+{
+return _INJ_A_texOff(0); // plain fetch of A
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (MEANB)
+//!BIND _INJ_B
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANB
+
+vec4 hook() // MEANB: offset B rendered at MEANI's size
+{
+return _INJ_B_texOff(0); // plain fetch of B
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter
+//!BIND HOOKED
+//!BIND _INJ_MEANA
+//!BIND _INJ_MEANB
+//!SAVE RF_LUMA
+
+vec4 hook() // final guided-filter stage; the result is saved as RF_LUMA for both the LUMA and CHROMA hooks
+{
+return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); // MEANA * HOOKED + MEANB
+}
+
+// End of source code injected from guided.glsl 
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND RF_LUMA
+//!WIDTH RF_LUMA.w
+//!HEIGHT RF_LUMA.h
+//!DESC Non-local means (RF, share)
+//!SAVE RF
+
+vec4 hook() // saves RF_LUMA, at its own size, under the second name RF
+{
+	return RF_LUMA_texOff(0); // plain fetch of RF_LUMA
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND LUMA
+//!WIDTH LUMA.w 3 /
+//!HEIGHT LUMA.h 3 /
+//!DESC Non-local means (EP)
+//!SAVE EP
+
+vec4 hook() // EP: LUMA rendered at 1/3 of its size; bound by the main NLM pass as EP
+{
+	return LUMA_texOff(0); // plain fetch of LUMA
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!BIND RF_LUMA
+//!BIND RF
+//!BIND EP
+//!DESC Non-local means (nlmeans_hqx.glsl)
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 2.25
+#else
+#define S 5.0
+#endif
+
+/* Adaptive sharpening
+ *
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
+ *
+ * If you just want to increase/decrease sharpness then you want to change ASF.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 3.0
+#define ASP 1
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
+#else
+#define AS 0
+#define ASF 3.0
+#define ASP 1.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
+#endif
+
+/* Starting weight
+ *
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
+ *
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 1.0
+#else
+#define SW 0.5
+#endif
+
+/* Weight discard
+ *
+ * Discard weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, yielding a much more pleasant 
+ * result, especially around edges.
+ * 
+ * WD:
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
+ * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
+ * 	- 0: Disable
+ *
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ */
+#ifdef LUMA_raw
+#define WD 2
+#define WDT 0.5
+#define WDP 6.0
+#else
+#define WD 2
+#define WDT 0.75
+#define WDP 6.0
+#endif
+
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Patch & research sizes
+ *
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
+ *
+ * Research size should be an odd number greater than or equal to 3. Higher 
+ * values are generally better, but slower, blurrier, and give diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 5
+#define R 5
+#else
+#define P 5
+#define R 5
+#endif
+
+/* Patch and research shapes
+ *
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
+ *
+ * PS applies to patches, RS applies to research zones.
+ *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 6
+#else
+#define RS 3
+#define PS 3
+#endif
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader
+ */
+#define RF_LUMA 1
+#define RF 1
+
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections gives 
+ * diminishing returns. The most similar rotation/reflection will be used.
+ *
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
+ *
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
+ */
+#ifdef LUMA_raw
+#define RI 0
+#define RFI 0
+#else
+#define RI 0
+#define RFI 0
+#endif
+
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
+ * 	- Luma-only (this is a bug)
+ * 	- Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.25
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.25
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
+#ifdef LUMA_raw
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#else
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 0.00000000001 // tiny positive stand-in for zero (avoids divide-by-zero)
+#define M_PI 3.14159265358979323846 // pi
+#define POW2(x) ((x)*(x)) // square; the argument expression is expanded twice
+#define POW3(x) ((x)*(x)*(x)) // cube; the argument expression is expanded three times
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) // "bicubic" choice for SK/RK/PSK
+#define gaussian(x) exp(-1 * POW2(x)) // exp(-x^2); the default choice for SK/RK/PSK
+#define lanczos(x) POW2(sinc(x)) // defined here as sinc(x) squared
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) // piecewise quadratic, split at x = 0.5
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) // sin(pi*x)/(pi*x); yields 1.0 for every x below 1e-8, negative x included
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) // same x < 1e-8 guard as sinc
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw) // luma: a value is one float, taken from .x
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0) // back to vec4: unused channels are 0, alpha is forced to 1.0
+#define val_packed val
+#define val_pack(v) (v) // no packing needed
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw) // chroma: a value is a vec2, taken from .xy
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint // packed form is a single uint
+#define val_pack(v) packUnorm2x16(v) // two components packed as 16-bit unorm values
+#define val_unpack(v) unpackUnorm2x16(v)
+#else // neither LUMA_raw nor CHROMA_raw: a value is a vec3, taken from .xyz
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v) // no packing needed
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6 // PS 6 is the even sized square, which uses an integer half patch size
+const int hp = P/2;
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6 // same split for the research zone
+const int hr = R/2;
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) // post-increment z.c, then add 1 more if z now equals vec3(0)
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) // incr is the caller-supplied increment of the inner (y) loop
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) // upper bounds are exclusive (< hz), unlike S_SQUARE
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) // rows from y=-hz to y=0; incr steps the inner (x) loop
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) // on the y=0 row the upper x bound collapses to 0, leaving hz+1 samples
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) // sample count: hz*hz above the base row plus Z on it. NOTE(review): S_TRIANGLE_A(hz,hz), as used for shape 5, is one short of hz*hz+hz+1 - confirm
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) // the y extent shrinks as |x| grows; incr steps the inner (y) loop
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) // sample count: hz*hz on each side of the widest line, plus Z on it
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// XXX implement S_PLUS w/ an X overlaid:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+
+#define T1 (T+1)
+#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
+
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
+// Skip comparing the pixel-of-interest against itself, unless RF is enabled
+#if RF_
+#define RINCR(z,c) (z.c++)
+#else
+#define RINCR DINCR
+#endif
+
+#define R_AREA(a) (a * T1 + RF_-1)
+
+// research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1 // each branch pairs a research loop with r_area, its sample count
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1);
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
+const int r_area = R_AREA(S_PLUS_A(hr,R));
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R*R);
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr+1)); // bottom row is hr+1 wide, center included
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R));
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R);
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
+const int r_area = R_AREA(R);
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
+const int r_area = R_AREA(R*R);
+#endif
+
+#define RI1 (RI+1)
+#define RFI1 (RFI+1)
+
+#if RI
+#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
+#else
+#define FOR_ROTATION
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
+#else
+#define FOR_REFLECTION
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c) (z.c++)
+#endif
+
+#define P_AREA(a) (a - PD)
+
+// patch shapes
+#if P == 0 || P == 1 // each branch pairs a patch loop with p_area, its sample count
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1);
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
+const int p_area = P_AREA(S_PLUS_A(hp,P));
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P*P);
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp+1)); // bottom row is hp+1 wide, center included
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P));
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P);
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
+const int p_area = P_AREA(P);
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
+const int p_area = P_AREA(P*P);
+#endif
+
+const float r_scale = 1.0/r_area;
+const float p_scale = 1.0/p_area;
+
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
+
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
+#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
+#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#else
+#define load2_(off) load_(off)
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off) // sample the hooked texture at offset off.xy; off.z selects the frame
+{
+	switch (min(int(off.z), frame)) { // NOTE(review): `frame` is not declared in view and only case 0 exists here; presumably shader_cfg generates the rest - confirm
+	case 0: return val_swizz(load_(off));
+
+	}
+}
+val load2(vec3 off) // like load(), but frame 0 reads through load2_ instead of load_
+{
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
+
+#if RI // rotation
+vec2 rot(vec2 p, float d) // rotate offset p about the origin by d degrees
+{
+	float c = cos(radians(d)), s = sin(radians(d));
+	return vec2(
+		p.x * c - p.y * s,
+		p.x * s + p.y * c); // standard 2D rotation: y' = x*sin + y*cos
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d) // mirror offset p; d picks the reflection (0 = none)
+{
+	switch (d) {
+	case 1: return p * vec2(1, -1); // negate y
+	case 2: return p * vec2(-1, 1); // negate x
+	default: return p; // d == 0; also guarantees a return value on every path
+	}
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v) // spatial kernel weight for research offset v
+{
+	vec2 texel_shift = 0.5 - fract(HOOKED_pos*HOOKED_size); // same sub-texel shift that sample() applies
+	return SK(length(vec3(v.xy + texel_shift, v.z)*SD)*SS);
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq) // range kernel: turns a patch difference into a weight via RK, per component
+{
+	const float h = S*0.013; // S is the user-set denoising factor
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale); // scale by 1/h^2, then take the root before applying RK
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2) // averaged squared difference between the patch at r2 and the patch at r
+{
+	vec3 p;
+	val min_rot = val(p_area); // initial value for the running minimum
+
+	FOR_ROTATION FOR_REFLECTION { // both macros may expand to nothing, leaving a plain block
+		val pdiff_sq = val(0);
+		FOR_PATCH(p) {
+			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			diff_sq *= diff_sq;
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); // pulls the term toward 1 as spatial_p falls toward 0
+			pdiff_sq += diff_sq;
+		}
+		min_rot = min(min_rot, pdiff_sq); // keep the most similar rotation/reflection
+	}
+
+	return min_rot * p_scale; // p_scale = 1/p_area
+}
+
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
+
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
+// XXX support PSS
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; // the four direct neighbours; the center is not included
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; // same offsets scaled by SF
+vec4 poi_patch = gather_offs(0, offsets); // neighbours of the pixel-of-interest
+float patch_comparison_gather(vec3 r, vec3 r2) // r2 is not used by this variant
+{
+	float min_rot = p_area - 1;
+	vec4 transformer = gather_offs(r, offsets_sf); // neighbours of the candidate at offset r
+	FOR_ROTATION {
+		FOR_REFLECTION {
+			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); // sum of squared differences
+			min_rot = min(diff_sq, min_rot);
+#if RFI
+			switch(rfi) {
+			case 0: transformer = transformer.zyxw; break;
+			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
+			case 2: transformer = transformer.zyxw; break; // undoes last mirror
+			}
+#endif
+		}
+#if RI == 3
+		transformer = transformer.wxyz;
+#elif RI == 1
+		transformer = transformer.zwxy;
+#endif
+	}
+	float center_diff_sq = poi2.x - load2(r).x; // center sample is compared separately from the gathered neighbours
+	center_diff_sq *= center_diff_sq;
+	return (min_rot + center_diff_sq) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2) // tiled variant: compares the patches at r and r2 four samples at a time
+{
+	vec2 tile;
+	float min_rot = p_area;
+
+	/* gather order:
+	 * w z
+	 * x y
+	 */
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { // step of 2 in each axis, one gather per tile
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1)); // add the four components to the running sum
+	}
+	min_rot = min(min_rot, pdiff_sq);
+
+	return min_rot * p_scale; // p_scale = 1/p_area
+}
+#else
+#define patch_comparison_gather patch_comparison
+#endif
+
+vec4 hook() // shader entry point: weighted average over the research zone, then optional post-processing
+{
+	val total_weight = val(0);
+	val sum = val(0);
+	val result = val(0);
+
+	vec3 r = vec3(0);
+	vec3 p = vec3(0);
+	vec3 me = vec3(0);
+
+#if T && ME == 1 // temporal & motion estimation
+	vec3 me_tmp = vec3(0);
+	float maxweight = 0;
+#elif T && ME == 2 // temporal & motion estimation
+	vec3 me_sum = vec3(0);
+	float me_weight = 0;
+#endif
+
+#if WD == 2 // weight discard
+	int r_index = 0;
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
+#elif WD == 1 // weight discard
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
+#endif
+
+	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	if (r.z > 0) {
+		me += me_tmp * MEF;
+		me_tmp = vec3(0);
+		maxweight = 0;
+	}
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	if (r.z > 0) {
+		me += round(me_sum / me_weight * MEF);
+		me_sum = vec3(0);
+		me_weight = 0;
+	}
+#endif
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); // gather variant only for frame 0
+		val weight = range(pdiff_sq);
+#endif
+
+#if T && ME == 1 // temporal & motion estimation max weight
+		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
+		maxweight = max(maxweight, weight.x);
+#elif T && ME == 2 // temporal & motion estimation weighted average
+		me_sum += vec3(r.xy,0) * weight.x;
+		me_weight += weight.x;
+#endif
+
+#if D1W
+		weight = val(weight.x);
+#endif
+
+		weight *= spatial_r(r);
+
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
+		r_index++;
+#elif WD == 1 // weight discard
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		discard_sum += load(r+me) * weight * (1 - keeps);
+		discard_total_weight += weight * (1 - keeps);
+		no_weights += keeps;
+#endif
+
+		sum += load(r+me) * weight;
+		total_weight += weight;
+	} // FOR_RESEARCH
+	} // FOR_FRAME
+
+	val avg_weight = total_weight * r_scale; // r_scale = 1/r_area
+	val old_avg_weight = avg_weight;
+
+#if WD == 2 // true average
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
+
+	for (int i = 0; i < r_area; i++) {
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w);
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
+		no_weights += keeps;
+	}
+#elif WD == 1 // moving cumulative average
+	total_weight -= discard_total_weight;
+	sum -= discard_sum;
+#endif
+#if WD // weight discard
+	avg_weight = total_weight / no_weights;
+#endif
+
+	total_weight += SW * spatial_r(vec3(0)); // add the pixel-of-interest itself, weighted by SW
+	sum += poi * SW * spatial_r(vec3(0));
+
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
+
+	// store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
+#endif
+
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	val sharpening_strength = pow(AS_weight, val(ASP));
+#elif ASK == 1
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
+#elif ASK == 2
+	val sharpening_strength = val(ASP);
+#endif
+
+#if AS == 1 // sharpen+denoise
+	val sharpened = result + (poi - result) * ASF;
+#elif AS == 2 // sharpen only
+	val sharpened = poi + (poi - result) * ASF;
+#endif
+
+#if EP // extremes preserve
+	float luminance = EP_texOff(0).x;
+	// EPSILON is needed since pow(0,0) is undefined
+	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
+	result = mix(poi, result, ep_weight);
+#endif
+
+#if AS == 1 // sharpen+denoise
+	result = mix(sharpened, result, sharpening_strength);
+#elif AS == 2 // sharpen only
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if V == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
+#endif
+
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
+	result = (poi - result) * 0.5 + 0.5;
+#endif
+
+	return unval(mix(poi, result, BF)); // BF = 0.0 returns the input pixel, 1.0 the processed one
+}
+
diff --git a/portable_config/shaders/nlmeans_lgc.glsl b/portable_config/shaders/nlmeans_lgc.glsl
index f58842ce..384d3a88 100644
--- a/portable_config/shaders/nlmeans_lgc.glsl
+++ b/portable_config/shaders/nlmeans_lgc.glsl
@@ -19,7 +19,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-// Profile description: Experimental luma-guided chroma denoising, kinda similar to KrigBilateral
+// Description: nlmeans_lgc.glsl: Experimental luma-guided chroma denoising, kinda similar to KrigBilateral
 
 /* The recommended usage of this shader and its variant profiles is to add them 
  * to input.conf and then dispatch the appropriate shader via a keybind during 
@@ -48,8 +48,8 @@
  * of noise.
  *
  * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
  *
  * vf toggle scale=-2:720
  *
@@ -65,12 +65,13 @@
  * may be different for your system.
  *
  * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
  *
- * textureGather is LUMA only and limited to the following configurations:
+ * If you plan on tinkering with NLM's settings, read below:
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
  *   - Default, very fast, rotations and reflections should be free
  *   - If this is unusually slow then try changing gpu-api and vo
  *   - If it's still slow, try setting RI/RFI to 0.
@@ -83,23 +84,14 @@
  *
  * Options which always disable textureGather:
  * 	- PD
+ * 	- NG
  */
 
 //!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
-//!HOOK CHROMA
-//!DESC Non-local means (share)
 //!BIND LUMA
+//!WIDTH LUMA.w
+//!HEIGHT LUMA.h
+//!DESC Non-local means (RF, share)
 //!SAVE RF
 
 vec4 hook()
@@ -109,42 +101,19 @@ vec4 hook()
 
 //!HOOK CHROMA
 //!BIND HOOKED
-//!BIND EP
 //!BIND RF
 //!DESC Non-local means (nlmeans_lgc.glsl)
-//!WIDTH LUMA.w
-//!HEIGHT LUMA.h
 
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
+// User variables
 
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
-#define S 2.0
-#define P 3
-#define R 5
+#define S 11.66
 #else
 #define S 11.66
-#define P 3
-#define R 5
 #endif
 
 /* Adaptive sharpening
@@ -152,11 +121,16 @@ vec4 hook()
  * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
  * weight map to restrict the sharpening to edges.
  *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
+ * If you just want to increase/decrease sharpness then you want to change ASF.
  *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
  * ASW:
  * 	- 0 to use pre-WD weights
  * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
@@ -168,15 +142,15 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
 #else
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
@@ -184,15 +158,13 @@ vec4 hook()
 
 /* Starting weight
  *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
-#define SW 1.0
+#define SW 0.75
 #else
 #define SW 0.75
 #endif
@@ -204,7 +176,7 @@ vec4 hook()
  * result, especially around edges.
  * 
  * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
  * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
  * 	- 0: Disable
  *
@@ -212,7 +184,7 @@ vec4 hook()
  * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
-#define WD 2
+#define WD 0
 #define WDT 0.5
 #define WDP 6.0
 #else
@@ -223,19 +195,21 @@ vec4 hook()
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
  * DP: EP strength on dark patches, 0 to fully denoise
  * BP: EP strength on bright patches, 0 to fully denoise
  */
 #ifdef LUMA_raw
-#define EP 1
+#define EP 0
 #define BP 0.75
 #define DP 0.25
 #else
@@ -250,25 +224,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
-/* Robust filtering
+/* Patch & research sizes
  *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
+ * Research size should be an odd number greater than or equal to 3. Higher 
+ * values are generally better, but slower, blurrier, and give diminishing returns.
  */
 #ifdef LUMA_raw
-#define RF 0
+#define P 3
+#define R 5
 #else
-#define RF 1
+#define P 3
+#define R 5
 #endif
 
-/* Search shape
+/* Patch and research shapes
  *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
  * PS applies applies to patches, RS applies to research zones.
  *
@@ -291,11 +266,22 @@ vec4 hook()
 #define PS 3
 #endif
 
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader
+ */
+#define RF_LUMA 0
+#define RF 1
+
 /* Rotational/reflectional invariance
  *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections give 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
@@ -312,29 +298,39 @@ vec4 hook()
 #endif
 
 /* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
  *
  * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
  * 	- Luma-only (this is a bug)
  * 	- Buggy
  *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
  * T: number of frames used
  * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
  */
 #ifdef LUMA_raw
 #define T 0
 #define ME 1
+#define MEF 2
+#define TRF 0
 #else
 #define T 0
 #define ME 0
+#define MEF 2
+#define TRF 0
 #endif
 
 /* Spatial kernel
@@ -346,69 +342,79 @@ vec4 hook()
  * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
  * appear closer and increase blur between frames.
  *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
+ * The intra-patch variants are supposed to help with larger patch sizes.
  *
- * SS: spatial denoising factor
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
  * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
+ * PSS: intra-patch spatial sigma
  * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
  * PSD: intra-patch spatial distortion (X, Y)
  */
 #ifdef LUMA_raw
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-// Scaling factor (should match WIDTH/HEIGHT)
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
 #ifdef LUMA_raw
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #else
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #endif
 
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
+// Scaling factor (should match WIDTH/HEIGHT)
 #ifdef LUMA_raw
-#define M 0
+#define SF 1
 #else
-#define M 0
+#define SF 1
 #endif
 
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
+/* Visualization
  *
  * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define DV 0
+#define V 0
 #else
-#define DV 0
+#define V 0
 #endif
 
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
 #ifdef LUMA_raw
 #define BF 1.0
 #else
@@ -429,17 +435,57 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight (for LGC)
+// Duplicate 1st weight (for luma-guided-chroma)
 #ifdef LUMA_raw
-#define D1W 0
+#define D1W 1
 #else
 #define D1W 1
 #endif
 
-/* Shader code */
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
 
 #define EPSILON 0.00000000001
 #define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define gaussian(x) exp(-1 * POW2(x))
+#define lanczos(x) POW2(sinc(x))
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
 
 #if PS == 6
 const int hp = P/2;
@@ -454,39 +500,96 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #endif
 
 // donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
+// much faster than a continue statement
 #define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
 
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
 
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
 #define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
 #define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
 #define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
 #define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
 #define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
 
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+//
+// Z    ..X..
+//
 #define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
 
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+// XXX implement S_PLUS w/ an X overlayed:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
 
 #define T1 (T+1)
 #define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
 
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
 // Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
+#if RF_
 #define RINCR(z,c) (z.c++)
 #else
 #define RINCR DINCR
 #endif
 
-#define R_AREA(a) (a * T1 + RF-1)
+#define R_AREA(a) (a * T1 + RF_-1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
@@ -575,44 +678,44 @@ const int p_area = P_AREA(P*P);
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
 #define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
 #define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
 #define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define load2_(off) load_(off)
 #define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
 #define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
 #endif
 
 #if T
-vec4 load(vec3 off)
+val load(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load_(off);
+	switch (min(int(off.z), frame)) {
+	case 0: return val_swizz(load_(off));
+
 	}
 }
-vec4 load2(vec3 off)
+val load2(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
 }
 #else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
 #endif
 
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -639,22 +742,52 @@ vec2 ref(vec2 p, int d)
 #define ref(p, d) (p)
 #endif
 
-vec4 patch_comparison(vec3 r, vec3 r2)
+#if SST && R >= SST
+float spatial_r(vec3 v)
+{
+	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
+	return SK(length(v*SD)*SS);
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq)
+{
+	const float h = S*0.013;
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2)
 {
 	vec3 p;
-	vec4 min_rot = vec4(p_area);
+	val min_rot = val(p_area);
 
 	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
+		val pdiff_sq = val(0);
 		FOR_PATCH(p) {
 			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
 			diff_sq *= diff_sq;
-#if PST && P >= PST
-			float pdist = length(p.xy*PSD)*PSS;
-			pdist = exp(-(pdist*pdist));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
 			pdiff_sq += diff_sq;
 		}
 		min_rot = min(min_rot, pdiff_sq);
@@ -666,14 +799,15 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 #define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
 // XXX extend to support 3x3 square
+// XXX support PSS
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
 	vec4 transformer = gather_offs(r, offsets_sf);
@@ -697,13 +831,12 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	}
 	float center_diff_sq = poi2.x - load2(r).x;
 	center_diff_sq *= center_diff_sq;
-	return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale;
+	return (min_rot + center_diff_sq) * p_scale;
 }
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
 // XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
 	float min_rot = p_area;
@@ -712,40 +845,17 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	 * w z
 	 * x y
 	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer);
-#if PST && P >= PST
-			// XXX refactor to avoid pow (should probably break off into a function)
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1));
 	}
+	min_rot = min(min_rot, pdiff_sq);
 
-	return vec4(min_rot, 0, 0, 0) * p_scale;
+	return min_rot * p_scale;
 }
 #else
 #define patch_comparison_gather patch_comparison
@@ -753,9 +863,9 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 
 vec4 hook()
 {
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
+	val total_weight = val(0);
+	val sum = val(0);
+	val result = val(0);
 
 	vec3 r = vec3(0);
 	vec3 p = vec3(0);
@@ -769,41 +879,38 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
+#if WD == 2 // weight discard
 	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
 #elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
 	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
-		me += me_tmp;
+		me += me_tmp * MEF;
 		me_tmp = vec3(0);
 		maxweight = 0;
 	}
 #elif T && ME == 2 // temporal & motion estimation weighted average
 	if (r.z > 0) {
-		me += round(me_sum / me_weight);
+		me += round(me_sum / me_weight * MEF);
 		me_sum = vec3(0);
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val weight = range(pdiff_sq);
+#endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
 		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
@@ -814,18 +921,18 @@ vec4 hook()
 #endif
 
 #if D1W
-		weight = vec4(weight.x);
+		weight = val(weight.x);
 #endif
 
-		weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS)); // spatial kernel
+		weight *= spatial_r(r);
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
 		r_index++;
 #elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
 		discard_sum += load(r+me) * weight * (1 - keeps);
 		discard_total_weight += weight * (1 - keeps);
 		no_weights += keeps;
@@ -833,45 +940,25 @@ vec4 hook()
 
 		sum += load(r+me) * weight;
 		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
-	// XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
+	val avg_weight = total_weight * r_scale;
+	val old_avg_weight = avg_weight;
 
 #if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w);
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
 		no_weights += keeps;
 	}
 #elif WD == 1 // moving cumulative average
@@ -882,29 +969,23 @@ vec4 hook()
 	avg_weight = total_weight / no_weights;
 #endif
 
-	total_weight += SW;
-	sum += poi * SW;
+	total_weight += SW * spatial_r(vec3(0));
+	sum += poi * SW * spatial_r(vec3(0));
 
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
 
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
+	// store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
 #if ASW == 0 // pre-WD weights
@@ -914,22 +995,20 @@ vec4 hook()
 #endif
 
 #if ASK == 0
-	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+	val sharpening_strength = pow(AS_weight, val(ASP));
 #elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                               AS_weight, ASC);
-	// just in case ASC < 0 (will sharpen but it's janky XXX)
-	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
 #elif ASK == 2
-	vec4 sharpening_strength = vec4(ASP);
+	val sharpening_strength = val(ASP);
 #endif
 
-	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
+	val sharpened = result + (poi - result) * ASF;
 #elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
+	val sharpened = poi + (poi - result) * ASF;
 #endif
 
 #if EP // extremes preserve
@@ -945,20 +1024,20 @@ vec4 hook()
 	result = mix(sharpened, poi, sharpening_strength);
 #endif
 
-#if M == 4 // edge map
+#if V == 4 // edge map
 	result = sharpening_strength;
 #endif
 
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	result = vec4(0.5);
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
 #endif
 
-#if DV == 1
-	result = clamp(abs(poi - result) * S, 0.0, 1.0);
-#elif DV == 2
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
 #endif
 
-	return mix(poi, result, BF);
+	return unval(mix(poi, result, BF));
 }
 
diff --git a/portable_config/shaders/nlmeans_lq.glsl b/portable_config/shaders/nlmeans_lq.glsl
index 210708b5..80eaf745 100644
--- a/portable_config/shaders/nlmeans_lq.glsl
+++ b/portable_config/shaders/nlmeans_lq.glsl
@@ -19,7 +19,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-// Profile description: Faster, but lower quality.
+// Description: nlmeans_lq.glsl: Faster, but lower quality.
 
 /* The recommended usage of this shader and its variant profiles is to add them 
  * to input.conf and then dispatch the appropriate shader via a keybind during 
@@ -48,8 +48,8 @@
  * of noise.
  *
  * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
  *
  * vf toggle scale=-2:720
  *
@@ -65,12 +65,13 @@
  * may be different for your system.
  *
  * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile.
  *
- * textureGather is LUMA only and limited to the following configurations:
+ * If you plan on tinkering with NLM's settings, read below:
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
  *   - Default, very fast, rotations and reflections should be free
  *   - If this is unusually slow then try changing gpu-api and vo
  *   - If it's still slow, try setting RI/RFI to 0.
@@ -83,15 +84,16 @@
  *
  * Options which always disable textureGather:
  * 	- PD
+ * 	- NG
  */
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (downscale)
 //!BIND HOOKED
-//!SAVE PRERF_LUMA
 //!WIDTH HOOKED.w 1.25 /
 //!HEIGHT HOOKED.h 1.25 /
+//!DESC Non-local means (PRERF)
+//!SAVE PRERF_LUMA
 
 vec4 hook()
 {
@@ -100,11 +102,11 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (unscale)
 //!BIND PRERF_LUMA
-//!SAVE RF_LUMA
 //!WIDTH HOOKED.w
 //!HEIGHT HOOKED.h
+//!DESC Non-local means (RF)
+//!SAVE RF_LUMA
 
 vec4 hook()
 {
@@ -113,66 +115,48 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE EP
+//!BIND RF_LUMA
+//!WIDTH RF_LUMA.w
+//!HEIGHT RF_LUMA.h
+//!DESC Non-local means (RF, share)
+//!SAVE RF
 
 vec4 hook()
 {
-	return LUMA_texOff(0);
+	return RF_LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
+//!BIND LUMA
+//!WIDTH LUMA.w 3 /
+//!HEIGHT LUMA.h 3 /
+//!DESC Non-local means (EP)
+//!SAVE EP
 
 vec4 hook()
 {
-	return RF_LUMA_texOff(0);
+	return LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
-//!BIND EP
 //!BIND RF
+//!BIND EP
 //!DESC Non-local means (nlmeans_lq.glsl)
 
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
+// User variables
 
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
 #define S 1.25
-#define P 3
-#define R 3
 #else
 #define S 5.0
-#define P 3
-#define R 5
 #endif
 
 /* Adaptive sharpening
@@ -180,11 +164,16 @@ vec4 hook()
  * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
  * weight map to restrict the sharpening to edges.
  *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
+ * If you just want to increase/decrease sharpness then you want to change ASF.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
  * ASW:
  * 	- 0 to use pre-WD weights
  * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
@@ -196,15 +185,15 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
 #else
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
@@ -212,12 +201,10 @@ vec4 hook()
 
 /* Starting weight
  *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -232,7 +219,7 @@ vec4 hook()
  * result, especially around edges.
  * 
  * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
  * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
  * 	- 0: Disable
  *
@@ -251,12 +238,14 @@ vec4 hook()
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
  * DP: EP strength on dark patches, 0 to fully denoise
@@ -278,25 +267,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
-/* Robust filtering
+/* Patch & research sizes
  *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
+ * Research size should be an odd number greater than or equal to 3. Higher values are 
+ * generally better, but slower, blurrier, and give diminishing returns.
  */
 #ifdef LUMA_raw
-#define RF 1
+#define P 3
+#define R 3
 #else
-#define RF 1
+#define P 3
+#define R 5
 #endif
 
-/* Search shape
+/* Patch and research shapes
  *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
  * PS applies applies to patches, RS applies to research zones.
  *
@@ -319,11 +309,22 @@ vec4 hook()
 #define PS 3
 #endif
 
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader.
+ */
+#define RF_LUMA 1
+#define RF 1
+
 /* Rotational/reflectional invariance
  *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections give 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
@@ -340,29 +341,39 @@ vec4 hook()
 #endif
 
 /* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
  *
  * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
  * 	- Luma-only (this is a bug)
  * 	- Buggy
  *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
  * T: number of frames used
  * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
  */
 #ifdef LUMA_raw
 #define T 0
 #define ME 1
+#define MEF 2
+#define TRF 0
 #else
 #define T 0
 #define ME 0
+#define MEF 2
+#define TRF 0
 #endif
 
 /* Spatial kernel
@@ -374,69 +385,79 @@ vec4 hook()
  * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
  * appear closer and increase blur between frames.
  *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
+ * The intra-patch variants are supposed to help with larger patch sizes.
  *
- * SS: spatial denoising factor
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
  * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
+ * PSS: intra-patch spatial sigma
  * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
  * PSD: intra-patch spatial distortion (X, Y)
  */
 #ifdef LUMA_raw
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-// Scaling factor (should match WIDTH/HEIGHT)
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
 #ifdef LUMA_raw
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #else
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #endif
 
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
+// Scaling factor (should match WIDTH/HEIGHT)
 #ifdef LUMA_raw
-#define M 0
+#define SF 1
 #else
-#define M 0
+#define SF 1
 #endif
 
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
+/* Visualization
  *
  * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define DV 0
+#define V 0
 #else
-#define DV 0
+#define V 0
 #endif
 
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
 #ifdef LUMA_raw
 #define BF 1.0
 #else
@@ -457,17 +478,57 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight (for LGC)
+// Duplicate 1st weight (for luma-guided-chroma)
 #ifdef LUMA_raw
 #define D1W 0
 #else
 #define D1W 0
 #endif
 
-/* Shader code */
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
 
 #define EPSILON 0.00000000001
 #define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define gaussian(x) exp(-1 * POW2(x))
+#define lanczos(x) POW2(sinc(x))
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
 
 #if PS == 6
 const int hp = P/2;
@@ -482,39 +543,96 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #endif
 
 // donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
+// much faster than a continue statement
 #define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
 
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
 #define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
 #define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
 #define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
 #define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
 #define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
 
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+//
+// Z    ..X..
+//
 #define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
 
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+// XXX implement S_PLUS w/ an X overlaid:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
 
 #define T1 (T+1)
 #define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
 
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
 // Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
+#if RF_
 #define RINCR(z,c) (z.c++)
 #else
 #define RINCR DINCR
 #endif
 
-#define R_AREA(a) (a * T1 + RF-1)
+#define R_AREA(a) (a * T1 + RF_-1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
@@ -603,44 +721,44 @@ const int p_area = P_AREA(P*P);
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
 #define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
 #define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
 #define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define load2_(off) load_(off)
 #define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
 #define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
 #endif
 
 #if T
-vec4 load(vec3 off)
+val load(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load_(off);
+	switch (min(int(off.z), frame)) {
+	case 0: return val_swizz(load_(off));
+
 	}
 }
-vec4 load2(vec3 off)
+val load2(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
 }
 #else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
 #endif
 
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -667,22 +785,52 @@ vec2 ref(vec2 p, int d)
 #define ref(p, d) (p)
 #endif
 
-vec4 patch_comparison(vec3 r, vec3 r2)
+#if SST && R >= SST
+float spatial_r(vec3 v)
+{
+	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
+	return SK(length(v*SD)*SS);
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq)
+{
+	const float h = S*0.013;
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2)
 {
 	vec3 p;
-	vec4 min_rot = vec4(p_area);
+	val min_rot = val(p_area);
 
 	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
+		val pdiff_sq = val(0);
 		FOR_PATCH(p) {
 			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
 			diff_sq *= diff_sq;
-#if PST && P >= PST
-			float pdist = length(p.xy*PSD)*PSS;
-			pdist = exp(-(pdist*pdist));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
 			pdiff_sq += diff_sq;
 		}
 		min_rot = min(min_rot, pdiff_sq);
@@ -694,14 +842,15 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 #define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
 // XXX extend to support 3x3 square
+// XXX support PSS
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
 	vec4 transformer = gather_offs(r, offsets_sf);
@@ -725,13 +874,12 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	}
 	float center_diff_sq = poi2.x - load2(r).x;
 	center_diff_sq *= center_diff_sq;
-	return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale;
+	return (min_rot + center_diff_sq) * p_scale;
 }
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
 // XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
 	float min_rot = p_area;
@@ -740,40 +888,17 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	 * w z
 	 * x y
 	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer);
-#if PST && P >= PST
-			// XXX refactor to avoid pow (should probably break off into a function)
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1));
 	}
+	min_rot = min(min_rot, pdiff_sq);
 
-	return vec4(min_rot, 0, 0, 0) * p_scale;
+	return min_rot * p_scale;
 }
 #else
 #define patch_comparison_gather patch_comparison
@@ -781,9 +906,9 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 
 vec4 hook()
 {
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
+	val total_weight = val(0);
+	val sum = val(0);
+	val result = val(0);
 
 	vec3 r = vec3(0);
 	vec3 p = vec3(0);
@@ -797,41 +922,38 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
+#if WD == 2 // weight discard
 	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
 #elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
 	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
-		me += me_tmp;
+		me += me_tmp * MEF;
 		me_tmp = vec3(0);
 		maxweight = 0;
 	}
 #elif T && ME == 2 // temporal & motion estimation weighted average
 	if (r.z > 0) {
-		me += round(me_sum / me_weight);
+		me += round(me_sum / me_weight * MEF);
 		me_sum = vec3(0);
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val weight = range(pdiff_sq);
+#endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
 		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
@@ -842,18 +964,18 @@ vec4 hook()
 #endif
 
 #if D1W
-		weight = vec4(weight.x);
+		weight = val(weight.x);
 #endif
 
-		weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS)); // spatial kernel
+		weight *= spatial_r(r);
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
 		r_index++;
 #elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
 		discard_sum += load(r+me) * weight * (1 - keeps);
 		discard_total_weight += weight * (1 - keeps);
 		no_weights += keeps;
@@ -861,45 +983,25 @@ vec4 hook()
 
 		sum += load(r+me) * weight;
 		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
-	// XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
+	val avg_weight = total_weight * r_scale;
+	val old_avg_weight = avg_weight;
 
 #if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w);
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
 		no_weights += keeps;
 	}
 #elif WD == 1 // moving cumulative average
@@ -910,29 +1012,23 @@ vec4 hook()
 	avg_weight = total_weight / no_weights;
 #endif
 
-	total_weight += SW;
-	sum += poi * SW;
+	total_weight += SW * spatial_r(vec3(0));
+	sum += poi * SW * spatial_r(vec3(0));
 
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
 
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
+	// store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
 #if ASW == 0 // pre-WD weights
@@ -942,22 +1038,20 @@ vec4 hook()
 #endif
 
 #if ASK == 0
-	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+	val sharpening_strength = pow(AS_weight, val(ASP));
 #elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                               AS_weight, ASC);
-	// just in case ASC < 0 (will sharpen but it's janky XXX)
-	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
 #elif ASK == 2
-	vec4 sharpening_strength = vec4(ASP);
+	val sharpening_strength = val(ASP);
 #endif
 
-	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
+	val sharpened = result + (poi - result) * ASF;
 #elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
+	val sharpened = poi + (poi - result) * ASF;
 #endif
 
 #if EP // extremes preserve
@@ -973,20 +1067,20 @@ vec4 hook()
 	result = mix(sharpened, poi, sharpening_strength);
 #endif
 
-#if M == 4 // edge map
+#if V == 4 // edge map
 	result = sharpening_strength;
 #endif
 
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	result = vec4(0.5);
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
 #endif
 
-#if DV == 1
-	result = clamp(abs(poi - result) * S, 0.0, 1.0);
-#elif DV == 2
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
 #endif
 
-	return mix(poi, result, BF);
+	return unval(mix(poi, result, BF));
 }
 
diff --git a/portable_config/shaders/nlmeans_temporal.glsl b/portable_config/shaders/nlmeans_temporal.glsl
index 01dfdd52..a3bf340d 100644
--- a/portable_config/shaders/nlmeans_temporal.glsl
+++ b/portable_config/shaders/nlmeans_temporal.glsl
@@ -19,7 +19,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-// Profile description: Very experimental and buggy, limited to vo=gpu-next.
+// Description: nlmeans_temporal.glsl: Very experimental and buggy, limited to vo=gpu-next.
 
 /* The recommended usage of this shader and its variant profiles is to add them 
  * to input.conf and then dispatch the appropriate shader via a keybind during 
@@ -48,8 +48,8 @@
  * of noise.
  *
  * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
+ * beforehand (whether it was done by you or not). In such cases, consider 
+ * issuing a command to downscale in the mpv console (backtick ` key):
  *
  * vf toggle scale=-2:720
  *
@@ -65,12 +65,13 @@
  * may be different for your system.
  *
  * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
+ * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile.
  *
- * textureGather is LUMA only and limited to the following configurations:
+ * If you plan on tinkering with NLM's settings, read below:
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
+ * textureGather only applies to luma and is limited to these configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
  *   - Default, very fast, rotations and reflections should be free
  *   - If this is unusually slow then try changing gpu-api and vo
  *   - If it's still slow, try setting RI/RFI to 0.
@@ -83,6 +84,7 @@
  *
  * Options which always disable textureGather:
  * 	- PD
+ * 	- NG
  */
 
 // The following is shader code injected from guided.glsl
@@ -104,7 +106,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: Guided filter guided by the downscaled image
+// Description: guided.glsl: Guided by the downscaled image
 
 /* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
@@ -120,10 +122,10 @@
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Guided filter (PREI)
 //!BIND HOOKED
 //!WIDTH HOOKED.w 1.25 /
 //!HEIGHT HOOKED.h 1.25 /
+//!DESC Guided filter (PREI)
 //!SAVE _INJ_PREI
 
 vec4 hook()
@@ -133,10 +135,10 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Guided filter (I)
 //!BIND _INJ_PREI
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
+//!WIDTH HOOKED.w
+//!HEIGHT HOOKED.h
+//!DESC Guided filter (I)
 //!SAVE _INJ_I
 
 vec4 hook()
@@ -144,6 +146,7 @@ vec4 hook()
 return _INJ_PREI_texOff(0);
 }
 
+
 //!HOOK LUMA
 //!HOOK CHROMA
 //!DESC Guided filter (P)
@@ -310,72 +313,54 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided.glsl
+// End of source code injected from guided.glsl 
+
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (downscale)
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!BIND LUMA
-//!SAVE EP
+//!BIND RF_LUMA
+//!WIDTH RF_LUMA.w
+//!HEIGHT RF_LUMA.h
+//!DESC Non-local means (RF, share)
+//!SAVE RF
 
 vec4 hook()
 {
-	return LUMA_texOff(0);
+	return RF_LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
+//!BIND LUMA
+//!WIDTH LUMA.w 3 /
+//!HEIGHT LUMA.h 3 /
+//!DESC Non-local means (EP)
+//!SAVE EP
 
 vec4 hook()
 {
-	return RF_LUMA_texOff(0);
+	return LUMA_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
-//!BIND EP
 //!BIND RF
+//!BIND EP
 //!BIND PREV1
 //!BIND PREV2
-//!BIND PREV3
 //!DESC Non-local means (nlmeans_temporal.glsl)
 
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
+// User variables
 
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so NG=1 to disable them.
- */
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
 #define S 2.0
-#define P 3
-#define R 5
 #else
 #define S 5.0
-#define P 3
-#define R 5
 #endif
 
 /* Adaptive sharpening
@@ -383,11 +368,16 @@ vec4 hook()
  * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
  * weight map to restrict the sharpening to edges.
  *
- * Use M=4 to get a good look at which areas are/aren't sharpened.
+ * If you just want to increase/decrease sharpness then you want to change ASF.
  *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	- 0 to disable
+ * 	- 1 to sharpen+denoise
+ * 	- 2 to sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASP: Higher numbers use more of the sharp image
  * ASW:
  * 	- 0 to use pre-WD weights
  * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
@@ -399,15 +389,15 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
 #else
 #define AS 0
-#define ASF 2.0
-#define ASP 4.0
+#define ASF 3.0
+#define ASP 1.0
 #define ASW 0
 #define ASK 1
 #define ASC 0.0
@@ -415,12 +405,10 @@ vec4 hook()
 
 /* Starting weight
  *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
- * local noise level, e.g., SW=max(avg_weight, EPSILON)
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -435,7 +423,7 @@ vec4 hook()
  * result, especially around edges.
  * 
  * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
+ * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
  * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
  * 	- 0: Disable
  *
@@ -454,12 +442,14 @@ vec4 hook()
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+ * Reduces denoising around very bright/dark areas.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area. The default of 3 should be fine, it's not recommended to 
+ * change this.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
  * DP: EP strength on dark patches, 0 to fully denoise
@@ -481,25 +471,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
-/* Robust filtering
+/* Patch & research sizes
  *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+ * Patch size should be an odd number greater than or equal to 3. Higher values 
+ * are slower and not always better.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader such as guided.glsl
+ * Research size should be an odd number greater than or equal to 3. Higher
+ * values are generally better, but slower, blurrier, and give diminishing returns.
  */
 #ifdef LUMA_raw
-#define RF 1
+#define P 3
+#define R 5
 #else
-#define RF 1
+#define P 3
+#define R 5
 #endif
 
-/* Search shape
+/* Patch and research shapes
  *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
  * PS applies applies to patches, RS applies to research zones.
  *
@@ -522,11 +513,22 @@ vec4 hook()
 #define PS 3
 #endif
 
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader
+ */
+#define RF_LUMA 1
+#define RF 1
+
 /* Rotational/reflectional invariance
  *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections give 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
@@ -543,29 +545,39 @@ vec4 hook()
 #endif
 
 /* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
  *
  * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
+ * 	- Slower:
+ * 		- Each frame needs to be researched (more samples & more math)
+ * 		- Gather optimizations only apply to the current frame
+ * 	- Requires vo=gpu-next
  * 	- Luma-only (this is a bug)
  * 	- Buggy
  *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (e.g., from 
- * compression or duplicate frames), but can work very well on high quality 
- * video.
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
  * T: number of frames used
  * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
  */
 #ifdef LUMA_raw
 #define T 2
 #define ME 1
+#define MEF 2
+#define TRF 0
 #else
 #define T 0
 #define ME 0
+#define MEF 2
+#define TRF 0
 #endif
 
 /* Spatial kernel
@@ -577,69 +589,79 @@ vec4 hook()
  * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
  * appear closer and increase blur between frames.
  *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
+ * The intra-patch variants are supposed to help with larger patch sizes.
  *
- * SS: spatial denoising factor
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
  * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
+ * PSS: intra-patch spatial sigma
  * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
  * PSD: intra-patch spatial distortion (X, Y)
  */
 #ifdef LUMA_raw
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
+#define SST 1
 #define SS 0.25
-#define SD vec3(1,1,1.5)
+#define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-// Scaling factor (should match WIDTH/HEIGHT)
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic
+ * sinc
+ * sphinx
+ */
 #ifdef LUMA_raw
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #else
-#define SF 1
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
 #endif
 
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- * 4: edge map (based on the relevant AS settings)
- */
+// Scaling factor (should match WIDTH/HEIGHT)
 #ifdef LUMA_raw
-#define M 0
+#define SF 1
 #else
-#define M 0
+#define SF 1
 #endif
 
-/* Difference visualization
- *
- * Visualizes the difference between input/output image
+/* Visualization
  *
  * 0: off
- * 1: absolute difference scaled by S
- * 2: difference centered on 0.5
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: avg_weight
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define DV 0
+#define V 0
 #else
-#define DV 0
+#define V 0
 #endif
 
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
 #ifdef LUMA_raw
 #define BF 1.0
 #else
@@ -660,17 +682,57 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight (for LGC)
+// Duplicate 1st weight (for luma-guided-chroma)
 #ifdef LUMA_raw
 #define D1W 0
 #else
 #define D1W 0
 #endif
 
-/* Shader code */
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
 
 #define EPSILON 0.00000000001
 #define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x)) // x squared; note that x is evaluated twice
+#define POW3(x) ((x)*(x)*(x)) // x cubed; note that x is evaluated three times
+#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define gaussian(x) exp(-1 * POW2(x)) // exp(-x^2)
+#define lanczos(x) POW2(sinc(x)) // sinc(x) squared, i.e. the sinc window is applied to itself
+#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) // piecewise, split at x = 0.5; not cut off past 1.5
+#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) // returns 1.0 near zero to avoid 0/0; also returns 1.0 for any negative x
+#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) // same near-zero/negative guard as sinc
+
+// XXX could maybe be better optimized on LGC
+// XXX return original alpha component instead of 1.0
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
 
 #if PS == 6
 const int hp = P/2;
@@ -685,39 +747,96 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #endif
 
 // donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
+// much faster than a continue statement
 #define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
 
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
 #define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
 #define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
 #define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
 
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
 #define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
 #define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
 
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+//
+// Z    ..X..
+//
 #define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
 
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
+// XXX implement S_PLUS w/ an X overlaid:
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+
+// XXX implement an X shape:
+// 2    .   .
+// 2     . .
+// 1      X  
+// 2     . .
+// 2    .   .
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
 
 #define T1 (T+1)
 #define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
 
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
 // Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
+#if RF_
 #define RINCR(z,c) (z.c++)
 #else
 #define RINCR DINCR
 #endif
 
-#define R_AREA(a) (a * T1 + RF-1)
+#define R_AREA(a) (a * T1 + RF_-1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
@@ -806,50 +925,45 @@ const int p_area = P_AREA(P*P);
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
 #define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
 #define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
 #define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
 #else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
+#define load2_(off) load_(off)
 #define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
 #define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
 #endif
 
 #if T
-vec4 load(vec3 off)
+val load(vec3 off) // sample at offset off.xy from frame index off.z (0 = current frame, 1/2 = PREV1/PREV2)
 {
-	switch (int(off.z)) {
-	case 0: return load_(off);
-	case 1: return imageLoad(PREV1, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV1)));
-	case 2: return imageLoad(PREV2, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV2)));
-	case 3: return imageLoad(PREV3, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV3)));
+	switch (min(int(off.z), frame)) { // index is capped by `frame` (presumably the renderer's frame counter -- TODO confirm)
+	case 0: return val_swizz(load_(off));
+	case 1: return val_swizz(imageLoad(PREV1, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV1))));
+	default: return val_swizz(imageLoad(PREV2, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV2)))); // index 2; default so every path returns a value
 	}
 }
-vec4 load2(vec3 off)
+val load2(vec3 off)
 {
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	case 1: return imageLoad(PREV1, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV1)));
-	case 2: return imageLoad(PREV2, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV2)));
-	case 3: return imageLoad(PREV3, ivec2((HOOKED_pos + HOOKED_pt * vec2(off)) * imageSize(PREV3)));
-	}
+	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
 }
 #else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
 #endif
 
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
+val poi = load(vec3(0)); // pixel-of-interest
+val poi2 = load2(vec3(0)); // guide pixel-of-interest
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -876,22 +990,52 @@ vec2 ref(vec2 p, int d)
 #define ref(p, d) (p)
 #endif
 
-vec4 patch_comparison(vec3 r, vec3 r2)
+#if SST && R >= SST // spatial weighting is on and the research size R is at least SST
+float spatial_r(vec3 v) // spatial weight of research offset v
+{
+	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); // same 0.5 - fract(pos*size) term that sample() adds to its offsets
+	return SK(length(v*SD)*SS); // kernel SK of the SD-scaled distance multiplied by SS
+}
+#else
+#define spatial_r(v) (1) // disabled: every research offset gets weight 1
+#endif
+
+#if PST && P >= PST // intra-patch spatial weighting is on and the patch size P is at least PST
+#define spatial_p(v) PSK(length((v)*PSD)*PSS) // (v) is parenthesized so a compound argument such as tile+vec2(0,1) is scaled by PSD as a whole
+#else
+#define spatial_p(v) (1) // disabled: every patch offset gets weight 1
+#endif
+
+val range(val pdiff_sq)
+{
+	const float h = S*0.013;
+	const float pdiff_scale = 1.0/(h*h);
+	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
+#if defined(LUMA_raw)
+	return RK(pdiff_sq);
+#elif defined(CHROMA_raw)
+	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
+#else
+	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
+#endif
+	//return exp(-pdiff_sq * pdiff_scale);
+
+	// weight function from the NLM paper, it's not very good
+	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
+}
+
+val patch_comparison(vec3 r, vec3 r2)
 {
 	vec3 p;
-	vec4 min_rot = vec4(p_area);
+	val min_rot = val(p_area);
 
 	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
+		val pdiff_sq = val(0);
 		FOR_PATCH(p) {
 			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
 			diff_sq *= diff_sq;
-#if PST && P >= PST
-			float pdist = length(p.xy*PSD)*PSS;
-			pdist = exp(-(pdist*pdist));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
+			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
 			pdiff_sq += diff_sq;
 		}
 		min_rot = min(min_rot, pdiff_sq);
@@ -903,14 +1047,15 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 #define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
 // XXX extend to support 3x3 square
+// XXX support PSS
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
 	vec4 transformer = gather_offs(r, offsets_sf);
@@ -934,13 +1079,12 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	}
 	float center_diff_sq = poi2.x - load2(r).x;
 	center_diff_sq *= center_diff_sq;
-	return vec4(min_rot + center_diff_sq, 0, 0, 0) * p_scale;
+	return (min_rot + center_diff_sq) * p_scale;
 }
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
 // XXX extend to support odd square?
-// XXX rotations/reflections appear to be subtly broken
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
 	float min_rot = p_area;
@@ -949,40 +1093,17 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 	 * w z
 	 * x y
 	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = (poi_patch - transformer) * (poi_patch - transformer);
-#if PST && P >= PST
-			// XXX refactor to avoid pow (should probably break off into a function)
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
+	float pdiff_sq = 0;
+	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
+		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
+		diff_sq *= diff_sq;
+		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
+		pdiff_sq += dot(diff_sq, vec4(1));
 	}
+	min_rot = min(min_rot, pdiff_sq);
 
-	return vec4(min_rot, 0, 0, 0) * p_scale;
+	return min_rot * p_scale;
 }
 #else
 #define patch_comparison_gather patch_comparison
@@ -990,9 +1111,9 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 
 vec4 hook()
 {
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
+	val total_weight = val(0);
+	val sum = val(0);
+	val result = val(0);
 
 	vec3 r = vec3(0);
 	vec3 p = vec3(0);
@@ -1006,41 +1127,38 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
+#if WD == 2 // weight discard
 	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
+	val_packed all_weights[r_area];
+	val_packed all_pixels[r_area];
 #elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
+	val no_weights = val(0);
+	val discard_total_weight = val(0);
+	val discard_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
 	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
-		me += me_tmp;
+		me += me_tmp * MEF;
 		me_tmp = vec3(0);
 		maxweight = 0;
 	}
 #elif T && ME == 2 // temporal & motion estimation weighted average
 	if (r.z > 0) {
-		me += round(me_sum / me_weight);
+		me += round(me_sum / me_weight * MEF);
 		me_sum = vec3(0);
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
+	FOR_RESEARCH(r) { // main NLM logic
+#if SKIP_PATCH
+		val weight = val(1);
+#else
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val weight = range(pdiff_sq);
+#endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
 		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
@@ -1051,18 +1169,18 @@ vec4 hook()
 #endif
 
 #if D1W
-		weight = vec4(weight.x);
+		weight = val(weight.x);
 #endif
 
-		weight *= exp(-(length(r*SD)*SS * length(r*SD)*SS)); // spatial kernel
+		weight *= spatial_r(r);
 
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
+#if WD == 2 // weight discard
+		all_weights[r_index] = val_pack(weight);
+		all_pixels[r_index] = val_pack(load(r+me));
 		r_index++;
 #elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
+		val wd_scale = 1.0/max(no_weights, 1);
+		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
 		discard_sum += load(r+me) * weight * (1 - keeps);
 		discard_total_weight += weight * (1 - keeps);
 		no_weights += keeps;
@@ -1070,48 +1188,25 @@ vec4 hook()
 
 		sum += load(r+me) * weight;
 		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
-	// XXX optionally put the denoised pixel into the frame buffer?
-#if T // temporal
-	imageStore(PREV3, ivec2(HOOKED_pos*imageSize(PREV3)), load2(vec3(0,0,2)));
-	imageStore(PREV2, ivec2(HOOKED_pos*imageSize(PREV2)), load2(vec3(0,0,1)));
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), load2(vec3(0,0,0)));
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
+	val avg_weight = total_weight * r_scale;
+	val old_avg_weight = avg_weight;
 
 #if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
+	total_weight = val(0);
+	sum = val(0);
+	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
+		val w = val_unpack(all_weights[i]);
+		val px = val_unpack(all_pixels[i]);
+		val keeps = step(avg_weight*WDT, w);
+
+		w *= keeps;
+		sum += px * w;
+		total_weight += w;
 		no_weights += keeps;
 	}
 #elif WD == 1 // moving cumulative average
@@ -1122,29 +1217,23 @@ vec4 hook()
 	avg_weight = total_weight / no_weights;
 #endif
 
-	total_weight += SW;
-	sum += poi * SW;
+	total_weight += SW * spatial_r(vec3(0));
+	sum += poi * SW * spatial_r(vec3(0));
 
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
+#if V == 3 // weight map
+	result = val(avg_weight);
+#else // mean
+	result = val(sum / total_weight);
+#endif
 
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
+	// store frames for temporal
+#if T > 1
+	imageStore(PREV2, ivec2(HOOKED_pos*imageSize(PREV2)), unval(load2(vec3(0,0,2-1))));
+#endif
+#if T && TRF
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
 #if ASW == 0 // pre-WD weights
@@ -1154,22 +1243,20 @@ vec4 hook()
 #endif
 
 #if ASK == 0
-	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+	val sharpening_strength = pow(AS_weight, val(ASP));
 #elif ASK == 1
-#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
-	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
-	                               AS_weight, ASC);
-	// just in case ASC < 0 (will sharpen but it's janky XXX)
-	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+	val sharpening_strength = mix(
+			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
+			AS_weight, ASC);
+	// XXX normalize the result to account for a negative ASC?
 #elif ASK == 2
-	vec4 sharpening_strength = vec4(ASP);
+	val sharpening_strength = val(ASP);
 #endif
 
-	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
+	val sharpened = result + (poi - result) * ASF;
 #elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
+	val sharpened = poi + (poi - result) * ASF;
 #endif
 
 #if EP // extremes preserve
@@ -1185,35 +1272,29 @@ vec4 hook()
 	result = mix(sharpened, poi, sharpening_strength);
 #endif
 
-#if M == 4 // edge map
+#if V == 4 // edge map
 	result = sharpening_strength;
 #endif
 
-#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
-	result = vec4(0.5);
+#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
+	return vec4(0.5);
 #endif
 
-#if DV == 1
-	result = clamp(abs(poi - result) * S, 0.0, 1.0);
-#elif DV == 2
+#if V == 1
+	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
 #endif
 
-	return mix(poi, result, BF);
+	return unval(mix(poi, result, BF));
 }
 
 //!TEXTURE PREV1
 //!SIZE 1920 1080
-//!FORMAT r32f
+//!FORMAT r16f
 //!STORAGE
 
 //!TEXTURE PREV2
 //!SIZE 1920 1080
-//!FORMAT r32f
+//!FORMAT r16f
 //!STORAGE
-
-//!TEXTURE PREV3
-//!SIZE 1920 1080
-//!FORMAT r32f
-//!STORAGE
-