From 599adf63cc761453d67a1894b280b5c613eb8214 Mon Sep 17 00:00:00 2001
From: Ilya Zlobintsev
Date: Fri, 17 Jan 2025 21:30:19 +0200
Subject: [PATCH 1/3] feat: gpu_controllers behind an RwLock

---
 lact-daemon/src/lib.rs                   |   2 +-
 lact-daemon/src/server.rs                |  16 +-
 lact-daemon/src/server/gpu_controller.rs |   1 +
 lact-daemon/src/server/handler.rs        | 187 ++++++++++++-----------
 lact-daemon/src/server/profiles.rs       |   4 +-
 lact-daemon/src/tests/mod.rs             |   1 +
 6 files changed, 110 insertions(+), 101 deletions(-)

diff --git a/lact-daemon/src/lib.rs b/lact-daemon/src/lib.rs
index 87ae9b9b..e79eb3c0 100644
--- a/lact-daemon/src/lib.rs
+++ b/lact-daemon/src/lib.rs
@@ -124,7 +124,7 @@ async fn listen_config_changes(handler: Handler) {
     let mut rx = config::start_watcher(handler.config_last_saved.clone());
     while let Some(new_config) = rx.recv().await {
         info!("config file was changed, reloading");
-        handler.config.replace(new_config);
+        *handler.config.write().await = new_config;
         match handler.apply_current_config().await {
             Ok(()) => {
                 info!("configuration reloaded");
diff --git a/lact-daemon/src/server.rs b/lact-daemon/src/server.rs
index 0ab91325..22d097bc 100644
--- a/lact-daemon/src/server.rs
+++ b/lact-daemon/src/server.rs
@@ -131,12 +131,12 @@ async fn handle_request<'a>(request: Request<'a>, handler: &'a Handler) -> anyho
     match request {
         Request::Ping => ok_response(ping()),
         Request::SystemInfo => ok_response(system::info().await?),
-        Request::ListDevices => ok_response(handler.list_devices()),
-        Request::DeviceInfo { id } => ok_response(handler.get_device_info(id)?),
-        Request::DeviceStats { id } => ok_response(handler.get_gpu_stats(id)?),
-        Request::DeviceClocksInfo { id } => ok_response(handler.get_clocks_info(id)?),
+        Request::ListDevices => ok_response(handler.list_devices().await),
+        Request::DeviceInfo { id } => ok_response(handler.get_device_info(id).await?),
+        Request::DeviceStats { id } => ok_response(handler.get_gpu_stats(id).await?),
+        Request::DeviceClocksInfo { id } => ok_response(handler.get_clocks_info(id).await?),
         Request::DevicePowerProfileModes { id } => {
-            ok_response(handler.get_power_profile_modes(id)?)
+            ok_response(handler.get_power_profile_modes(id).await?)
         }
         Request::SetFanControl(opts) => ok_response(handler.set_fan_control(opts).await?),
         Request::ResetPmfw { id } => ok_response(handler.reset_pmfw(id).await?),
@@ -160,13 +160,13 @@ async fn handle_request<'a>(request: Request<'a>, handler: &'a Handler) -> anyho
                 .set_power_profile_mode(id, index, custom_heuristics)
                 .await?,
         ),
-        Request::GetPowerStates { id } => ok_response(handler.get_power_states(id)?),
+        Request::GetPowerStates { id } => ok_response(handler.get_power_states(id).await?),
         Request::SetEnabledPowerStates { id, kind, states } => {
             ok_response(handler.set_enabled_power_states(id, kind, states).await?)
         }
-        Request::VbiosDump { id } => ok_response(handler.vbios_dump(id)?),
+        Request::VbiosDump { id } => ok_response(handler.vbios_dump(id).await?),
         Request::ListProfiles { include_state } => {
-            ok_response(handler.list_profiles(include_state))
+            ok_response(handler.list_profiles(include_state).await)
         }
         Request::SetProfile { name, auto_switch } => ok_response(
             handler
diff --git a/lact-daemon/src/server/gpu_controller.rs b/lact-daemon/src/server/gpu_controller.rs
index f0e81420..ca28f773 100644
--- a/lact-daemon/src/server/gpu_controller.rs
+++ b/lact-daemon/src/server/gpu_controller.rs
@@ -25,6 +25,7 @@ use std::{cell::LazyCell, collections::HashMap, fs, path::PathBuf, rc::Rc};
 use tokio::{sync::Notify, task::JoinHandle};
 use tracing::{error, warn};
 
+pub type DynGpuController = Box<dyn GpuController>;
 type FanControlHandle = (Rc<Notify>, JoinHandle<()>);
 
 pub trait GpuController {
diff --git a/lact-daemon/src/server/handler.rs b/lact-daemon/src/server/handler.rs
index dade1cd8..ee12752c 100644
--- a/lact-daemon/src/server/handler.rs
+++ b/lact-daemon/src/server/handler.rs
@@ -1,5 +1,5 @@
 use super::{
-    gpu_controller::{fan_control::FanCurve, GpuController},
+    gpu_controller::{fan_control::FanCurve, DynGpuController, GpuController},
     profiles::ProfileWatcherCommand,
     system::{self, detect_initramfs_type, PP_FEATURE_MASK_PATH},
 };
@@ -38,7 +38,7 @@ use std::{
 };
 use tokio::{
     process::Command,
-    sync::{mpsc, oneshot},
+    sync::{mpsc, oneshot, RwLock, RwLockReadGuard},
     time::sleep,
 };
 use tracing::{debug, error, info, trace, warn};
@@ -86,8 +86,8 @@ const SNAPSHOT_FAN_CTRL_FILES: &[&str] = &[
 
 #[derive(Clone)]
 pub struct Handler {
-    pub config: Rc<RefCell<Config>>,
-    pub gpu_controllers: Rc<BTreeMap<String, Box<dyn GpuController>>>,
+    pub config: Rc<RwLock<Config>>,
+    pub gpu_controllers: Rc<RwLock<BTreeMap<String, DynGpuController>>>,
     confirm_config_tx: Rc<RefCell<Option<oneshot::Sender<ConfirmCommand>>>>,
     pub config_last_saved: Rc<Cell<Instant>>,
     pub profile_watcher_tx: Rc<RefCell<Option<mpsc::Sender<ProfileWatcherCommand>>>>,
@@ -151,8 +151,8 @@
         info!("initialized {} GPUs", controllers.len());
 
         let handler = Self {
-            gpu_controllers: Rc::new(controllers),
-            config: Rc::new(RefCell::new(config)),
+            gpu_controllers: Rc::new(RwLock::new(controllers)),
+            config: Rc::new(RwLock::new(config)),
             confirm_config_tx: Rc::new(RefCell::new(None)),
             config_last_saved: Rc::new(Cell::new(Instant::now())),
             profile_watcher_tx: Rc::new(RefCell::new(None)),
@@ -162,11 +162,11 @@
             error!("could not apply config: {err:#}");
         }
 
-        if let Some(profile_name) = &handler.config.borrow().current_profile {
+        if let Some(profile_name) = &handler.config.read().await.current_profile {
             info!("using profile '{profile_name}'");
         }
 
-        if handler.config.borrow().auto_switch_profiles {
+        if handler.config.read().await.auto_switch_profiles {
             handler.start_profile_watcher().await;
         }
 
@@ -182,11 +182,12 @@
     }
 
     pub async fn apply_current_config(&self) -> anyhow::Result<()> {
-        let config = self.config.borrow().clone(); // Clone to avoid locking the RwLock on an await point
+        let config = self.config.read().await;
         let gpus = config.gpus()?;
 
+        let controllers = self.gpu_controllers.read().await;
         for (id, gpu_config) in gpus {
-            if let Some(controller) = self.gpu_controllers.get(id) {
+            if let Some(controller) = controllers.get(id) {
                 if let Err(err) = controller.apply_config(gpu_config).await {
                     error!("could not apply existing config for gpu {id}: {err}");
                 }
@@ -231,7 +232,7 @@
         }
 
         let (gpu_config, apply_timer) = {
-            let config = self.config.try_borrow().map_err(|err| anyhow!("{err}"))?;
+            let config = self.config.read().await;
             let apply_timer = config.apply_settings_timer;
             let gpu_config =
config.gpus()?.get(&id).cloned().unwrap_or_default(); (gpu_config, apply_timer) @@ -240,7 +241,7 @@ impl<'a> Handler { let mut new_config = gpu_config.clone(); f(&mut new_config); - let controller = self.controller_by_id(&id)?; + let controller = self.controller_by_id(&id).await?; match controller.apply_config(&new_config).await { Ok(()) => { @@ -278,6 +279,7 @@ impl<'a> Handler { tokio::task::spawn_local(async move { let controller = handler .controller_by_id(&id) + .await .expect("GPU controller disappeared"); tokio::select! { @@ -293,7 +295,7 @@ impl<'a> Handler { Ok(ConfirmCommand::Confirm) => { info!("saving updated config"); - let mut config_guard = handler.config.borrow_mut(); + let mut config_guard = handler.config.write().await; match config_guard.gpus_mut() { Ok(gpus) => { gpus.insert(id, new_config); @@ -323,16 +325,19 @@ impl<'a> Handler { Ok(()) } - fn controller_by_id(&self, id: &str) -> anyhow::Result<&dyn GpuController> { - Ok(self - .gpu_controllers - .get(id) - .context("No controller with such id")? - .as_ref()) + async fn controller_by_id( + &self, + id: &str, + ) -> anyhow::Result> { + let guard = self.gpu_controllers.read().await; + RwLockReadGuard::try_map(guard, |controllers| controllers.get(id).map(Box::as_ref)) + .map_err(|_| anyhow!("Controller '{id}' not found")) } - pub fn list_devices(&'a self) -> Vec { + pub async fn list_devices(&'a self) -> Vec { self.gpu_controllers + .read() + .await .iter() .map(|(id, controller)| { let name = controller @@ -349,29 +354,23 @@ impl<'a> Handler { .collect() } - pub fn get_device_info(&'a self, id: &str) -> anyhow::Result { - Ok(self.controller_by_id(id)?.get_info()) + pub async fn get_device_info(&'a self, id: &str) -> anyhow::Result { + Ok(self.controller_by_id(id).await?.get_info()) } - pub fn get_gpu_stats(&'a self, id: &str) -> anyhow::Result { - let config = self - .config - .try_borrow() - .map_err(|err| anyhow!("Could not read config: {err:?}"))?; + pub async fn get_gpu_stats(&'a self, id: &str) -> anyhow::Result { + let config = self.config.read().await; let gpu_config = config.gpus()?.get(id); - Ok(self.controller_by_id(id)?.get_stats(gpu_config)) + Ok(self.controller_by_id(id).await?.get_stats(gpu_config)) } - pub fn get_clocks_info(&'a self, id: &str) -> anyhow::Result { - self.controller_by_id(id)?.get_clocks_info() + pub async fn get_clocks_info(&'a self, id: &str) -> anyhow::Result { + self.controller_by_id(id).await?.get_clocks_info() } pub async fn set_fan_control(&'a self, opts: FanOptions<'_>) -> anyhow::Result { let settings = { - let mut config_guard = self - .config - .try_borrow_mut() - .map_err(|err| anyhow!("{err}"))?; + let mut config_guard = self.config.write().await; let gpu_config = config_guard .gpus_mut()? 
.entry(opts.id.to_owned()) @@ -449,7 +448,7 @@ impl<'a> Handler { pub async fn reset_pmfw(&self, id: &str) -> anyhow::Result { info!("Resetting PMFW settings"); - self.controller_by_id(id)?.reset_pmfw_settings(); + self.controller_by_id(id).await?.reset_pmfw_settings(); self.edit_gpu_config(id.to_owned(), |config| { config.pmfw_options = PmfwOptions::default(); @@ -466,14 +465,14 @@ impl<'a> Handler { .context("Failed to edit GPU config and set power cap") } - pub fn get_power_states(&self, id: &str) -> anyhow::Result { - let config = self - .config - .try_borrow() - .map_err(|err| anyhow!("Could not read config: {err:?}"))?; + pub async fn get_power_states(&self, id: &str) -> anyhow::Result { + let config = self.config.read().await; let gpu_config = config.gpus()?.get(id); - let states = self.controller_by_id(id)?.get_power_states(gpu_config); + let states = self + .controller_by_id(id) + .await? + .get_power_states(gpu_config); Ok(states) } @@ -499,7 +498,7 @@ impl<'a> Handler { command: SetClocksCommand, ) -> anyhow::Result { if let SetClocksCommand::Reset = command { - self.controller_by_id(id)?.cleanup_clocks()?; + self.controller_by_id(id).await?.cleanup_clocks()?; } self.edit_gpu_config(id.to_owned(), |gpu_config| { @@ -523,8 +522,11 @@ impl<'a> Handler { .context("Failed to edit GPU config and batch set clocks") } - pub fn get_power_profile_modes(&self, id: &str) -> anyhow::Result { - let modes_table = self.controller_by_id(id)?.get_power_profile_modes()?; + pub async fn get_power_profile_modes( + &self, + id: &str, + ) -> anyhow::Result { + let modes_table = self.controller_by_id(id).await?.get_power_profile_modes()?; Ok(modes_table) } @@ -555,8 +557,8 @@ impl<'a> Handler { .context("Failed to edit GPU config and set enabled power states") } - pub fn vbios_dump(&self, id: &str) -> anyhow::Result> { - self.controller_by_id(id)?.vbios_dump() + pub async fn vbios_dump(&self, id: &str) -> anyhow::Result> { + self.controller_by_id(id).await?.vbios_dump() } pub async fn generate_snapshot(&self) -> anyhow::Result { @@ -575,7 +577,8 @@ impl<'a> Handler { add_path_to_archive(&mut archive, path)?; } - for controller in self.gpu_controllers.values() { + let controllers = self.gpu_controllers.read().await; + for controller in controllers.values() { let controller_path = &controller.controller_info().sysfs_path; for device_file in SNAPSHOT_DEVICE_FILES { @@ -664,7 +667,7 @@ impl<'a> Handler { let info = json!({ "system_info": system_info, "initramfs_type": initramfs_type, - "devices": self.generate_snapshot_device_info(), + "devices": self.generate_snapshot_device_info().await, }); let info_data = serde_json::to_vec_pretty(&info).unwrap(); @@ -689,37 +692,40 @@ impl<'a> Handler { Ok(out_path) } - pub(crate) fn generate_snapshot_device_info(&self) -> BTreeMap { - self.gpu_controllers - .iter() - .map(|(id, controller)| { - let config = self.config.try_borrow(); - let gpu_config = config - .as_ref() - .ok() - .and_then(|config| config.gpus().ok()?.get(id)); - - let data = json!({ - "pci_info": controller.controller_info().pci_info.clone(), - "info": controller.get_info(), - "stats": controller.get_stats(gpu_config), - "clocks_info": controller.get_clocks_info().ok(), - "power_profile_modes": controller.get_power_profile_modes().ok(), - "power_states": controller.get_power_states(gpu_config), - }); - (id.clone(), data) - }) - .collect() + pub(crate) async fn generate_snapshot_device_info( + &self, + ) -> BTreeMap { + let controllers = self.gpu_controllers.read().await; + let config = 
self.config.read().await; + + let mut map = BTreeMap::new(); + + for (id, controller) in controllers.iter() { + let gpu_config = config.gpus().ok().and_then(|gpus| gpus.get(id)); + + let data = json!({ + "pci_info": controller.controller_info().pci_info.clone(), + "info": controller.get_info(), + "stats": controller.get_stats(gpu_config), + "clocks_info": controller.get_clocks_info().ok(), + "power_profile_modes": controller.get_power_profile_modes().ok(), + "power_states": controller.get_power_states(gpu_config), + }); + + map.insert(id.clone(), data); + } + + map } - pub fn list_profiles(&self, include_state: bool) -> ProfilesInfo { + pub async fn list_profiles(&self, include_state: bool) -> ProfilesInfo { let watcher_state = if include_state { self.profile_watcher_state.borrow().as_ref().cloned() } else { None }; - let config = self.config.borrow(); + let config = self.config.read().await; ProfilesInfo { profiles: config .profiles @@ -743,19 +749,21 @@ impl<'a> Handler { self.stop_profile_watcher().await; self.set_current_profile(name).await?; } - self.config.borrow_mut().auto_switch_profiles = auto_switch; - self.config.borrow_mut().save(&self.config_last_saved)?; + + let mut config = self.config.write().await; + config.auto_switch_profiles = auto_switch; + config.save(&self.config_last_saved)?; Ok(()) } pub(super) async fn set_current_profile(&self, name: Option>) -> anyhow::Result<()> { if let Some(name) = &name { - self.config.borrow().profile(name)?; + self.config.read().await.profile(name)?; } self.cleanup().await; - self.config.borrow_mut().current_profile = name; + self.config.write().await.current_profile = name; self.apply_current_config().await?; @@ -764,7 +772,7 @@ impl<'a> Handler { pub async fn create_profile(&self, name: String, base: ProfileBase) -> anyhow::Result<()> { { - let mut config = self.config.borrow_mut(); + let mut config = self.config.write().await; if config.profiles.contains_key(name.as_str()) { bail!("Profile {name} already exists"); } @@ -787,15 +795,16 @@ impl<'a> Handler { } pub async fn delete_profile(&self, name: String) -> anyhow::Result<()> { - if self.config.borrow().current_profile.as_deref() == Some(&name) { + if self.config.read().await.current_profile.as_deref() == Some(&name) { self.set_current_profile(None).await?; } self.config - .borrow_mut() + .write() + .await .profiles .shift_remove(name.as_str()); - self.config.borrow().save(&self.config_last_saved)?; + self.config.write().await.save(&self.config_last_saved)?; let tx = self.profile_watcher_tx.borrow().clone(); if let Some(tx) = tx { @@ -807,7 +816,7 @@ impl<'a> Handler { pub async fn move_profile(&self, name: &str, new_position: usize) -> anyhow::Result<()> { { - let mut config = self.config.borrow_mut(); + let mut config = self.config.write().await; let current_index = config .profiles @@ -836,13 +845,14 @@ impl<'a> Handler { rule: Option, ) -> anyhow::Result<()> { self.config - .borrow_mut() + .write() + .await .profiles .get_mut(name) .with_context(|| format!("Profile {name} not found"))? 
.rule = rule; - self.config.borrow().save(&self.config_last_saved)?; + self.config.read().await.save(&self.config_last_saved)?; let tx = self.profile_watcher_tx.borrow().clone(); if let Some(tx) = tx { @@ -879,7 +889,7 @@ impl<'a> Handler { pub async fn reset_config(&self) { self.cleanup().await; - let mut config = self.config.borrow_mut(); + let mut config = self.config.write().await; config.clear(); if let Err(err) = config.save(&self.config_last_saved) { @@ -888,13 +898,10 @@ impl<'a> Handler { } pub async fn cleanup(&self) { - let disable_clocks_cleanup = self - .config - .try_borrow() - .map(|config| config.daemon.disable_clocks_cleanup) - .unwrap_or(false); + let disable_clocks_cleanup = self.config.read().await.daemon.disable_clocks_cleanup; - for (id, controller) in &*self.gpu_controllers { + let controllers = self.gpu_controllers.read().await; + for (id, controller) in controllers.iter() { if !disable_clocks_cleanup { debug!("resetting clocks table"); if let Err(err) = controller.cleanup_clocks() { @@ -912,7 +919,7 @@ impl<'a> Handler { } /// `sysfs_only` disables initialization of any external data sources, such as libdrm and nvml -fn load_controllers(base_path: &Path) -> anyhow::Result>> { +fn load_controllers(base_path: &Path) -> anyhow::Result> { let mut controllers = BTreeMap::new(); let pci_db = Database::read().unwrap_or_else(|err| { diff --git a/lact-daemon/src/server/profiles.rs b/lact-daemon/src/server/profiles.rs index e36631e0..a6cc1829 100644 --- a/lact-daemon/src/server/profiles.rs +++ b/lact-daemon/src/server/profiles.rs @@ -220,7 +220,7 @@ async fn handle_profile_event(event: &ProfileWatcherEvent, handler: &Handler) { async fn update_profile(handler: &Handler) { let new_profile = { - let config = handler.config.borrow(); + let config = handler.config.read().await; let profile_rules = config .profiles .iter() @@ -237,7 +237,7 @@ async fn update_profile(handler: &Handler) { } }; - if handler.config.borrow().current_profile != new_profile { + if handler.config.read().await.current_profile != new_profile { if let Some(name) = &new_profile { info!("setting current profile to '{name}'"); } else { diff --git a/lact-daemon/src/tests/mod.rs b/lact-daemon/src/tests/mod.rs index 3b3919fd..84552e2c 100644 --- a/lact-daemon/src/tests/mod.rs +++ b/lact-daemon/src/tests/mod.rs @@ -22,6 +22,7 @@ async fn snapshot_everything() { .unwrap(); let device_info = handler .generate_snapshot_device_info() + .await .into_values() .next() .unwrap(); From e2257a83d2dd9eae6a895573e9aedefcba41d79e Mon Sep 17 00:00:00 2001 From: Ilya Zlobintsev Date: Fri, 17 Jan 2025 22:24:47 +0200 Subject: [PATCH 2/3] feat: listen to netlink kernel events --- lact-daemon/src/lib.rs | 20 +++++++++++ lact-daemon/src/server/handler.rs | 29 +++++++++++++--- lact-daemon/src/server/system.rs | 58 +++++++++++++++++++++++++++++-- 3 files changed, 100 insertions(+), 7 deletions(-) diff --git a/lact-daemon/src/lib.rs b/lact-daemon/src/lib.rs index e79eb3c0..339b4c60 100644 --- a/lact-daemon/src/lib.rs +++ b/lact-daemon/src/lib.rs @@ -12,11 +12,14 @@ mod tests; use anyhow::Context; use config::Config; use futures::future::select_all; +use server::system; use server::{handle_stream, handler::Handler, Server}; use std::cell::Cell; +use std::sync::Arc; use std::time::Instant; use std::{os::unix::net::UnixStream as StdUnixStream, time::Duration}; use tokio::net::UnixStream; +use tokio::sync::Notify; use tokio::{ runtime, signal::unix::{signal, SignalKind}, @@ -72,6 +75,7 @@ pub fn run() -> anyhow::Result<()> { 
tokio::task::spawn_local(listen_config_changes(handler.clone())); tokio::task::spawn_local(listen_exit_signals(handler.clone())); + tokio::task::spawn_local(listen_device_events(handler.clone())); tokio::task::spawn_local(suspend::listen_events(handler)); server.run().await; @@ -136,6 +140,22 @@ async fn listen_config_changes(handler: Handler) { } } +async fn listen_device_events(handler: Handler) { + let notify = Arc::new(Notify::new()); + let task_notify = notify.clone(); + tokio::task::spawn_blocking(move || { + if let Err(err) = system::listen_netlink_kernel_event(&task_notify) { + error!("kernel event listener error: {err:#}"); + } + }); + + loop { + notify.notified().await; + info!("got kernel drm subsystem event, reloading GPUs"); + handler.reload_gpus().await; + } +} + async fn ensure_sufficient_uptime() { match get_uptime() { Ok(current_uptime) => { diff --git a/lact-daemon/src/server/handler.rs b/lact-daemon/src/server/handler.rs index ee12752c..818ccaf2 100644 --- a/lact-daemon/src/server/handler.rs +++ b/lact-daemon/src/server/handler.rs @@ -96,10 +96,7 @@ pub struct Handler { impl<'a> Handler { pub async fn new(config: Config) -> anyhow::Result { - let base_path = match env::var("_LACT_DRM_SYSFS_PATH") { - Ok(custom_path) => PathBuf::from(custom_path), - Err(_) => PathBuf::from("/sys/class/drm"), - }; + let base_path = drm_base_path(); Self::with_base_path(&base_path, config).await } @@ -199,6 +196,23 @@ impl<'a> Handler { Ok(()) } + pub async fn reload_gpus(&self) { + let base_path = drm_base_path(); + match load_controllers(&base_path) { + Ok(new_controllers) => { + info!("GPU list reloaded with {} devices", new_controllers.len()); + *self.gpu_controllers.write().await = new_controllers; + + if let Err(err) = self.apply_current_config().await { + error!("could not reapply config: {err:#}"); + } + } + Err(err) => { + error!("could not load GPU controllers: {err:#}"); + } + } + } + async fn stop_profile_watcher(&self) { let tx = self.profile_watcher_tx.borrow_mut().take(); if let Some(existing_stop_notify) = tx { @@ -1074,3 +1088,10 @@ fn add_path_to_archive( } Ok(()) } + +fn drm_base_path() -> PathBuf { + match env::var("_LACT_DRM_SYSFS_PATH") { + Ok(custom_path) => PathBuf::from(custom_path), + Err(_) => PathBuf::from("/sys/class/drm"), + } +} diff --git a/lact-daemon/src/server/system.rs b/lact-daemon/src/server/system.rs index 38489a0b..2ea5fd78 100644 --- a/lact-daemon/src/server/system.rs +++ b/lact-daemon/src/server/system.rs @@ -1,15 +1,19 @@ use anyhow::{anyhow, ensure, Context}; use lact_schema::{InitramfsType, SystemInfo, GIT_COMMIT}; +use nix::sys::socket::{ + bind, recv, socket, AddressFamily, MsgFlags, NetlinkAddr, SockFlag, SockProtocol, SockType, +}; use os_release::{OsRelease, OS_RELEASE}; use std::{ fs::{self, File, Permissions}, io::Write, - os::unix::prelude::PermissionsExt, + os::{fd::AsRawFd, unix::prelude::PermissionsExt}, path::Path, + process, sync::atomic::{AtomicBool, Ordering}, }; -use tokio::process::Command; -use tracing::{info, warn}; +use tokio::{process::Command, sync::Notify}; +use tracing::{error, info, warn}; static OC_TOGGLED: AtomicBool = AtomicBool::new(false); @@ -191,6 +195,54 @@ async fn run_command(exec: &str, args: &[&str]) -> anyhow::Result<()> { } } +pub(crate) fn listen_netlink_kernel_event(notify: &Notify) -> anyhow::Result<()> { + let socket = socket( + AddressFamily::Netlink, + SockType::Raw, + SockFlag::empty(), + SockProtocol::NetlinkKObjectUEvent, + ) + .context("Could not setup netlink socket")?; + + let sa = 
NetlinkAddr::new(process::id(), 1);
+    bind(socket.as_raw_fd(), &sa).context("Could not bind netlink socket")?;
+
+    let mut buf = Vec::new();
+    loop {
+        // Read only the size using the peek and truncate flags first
+        let msg_size = recv(
+            socket.as_raw_fd(),
+            &mut [],
+            MsgFlags::MSG_PEEK | MsgFlags::MSG_TRUNC,
+        )
+        .context("Could not read netlink message")?;
+        buf.clear();
+        buf.resize(msg_size, 0);
+
+        // Read the actual message into the buffer
+        recv(socket.as_raw_fd(), &mut buf, MsgFlags::empty())
+            .context("Could not read netlink message")?;
+
+        for raw_line in buf.split(|c| *c == b'\0') {
+            match std::str::from_utf8(raw_line) {
+                Ok(line) => {
+                    if let Some(subsystem) = line.strip_prefix("SUBSYSTEM=") {
+                        if subsystem == "drm" {
+                            notify.notify_one();
+                        }
+                    }
+                }
+                Err(_) => {
+                    error!(
+                        "Got invalid unicode in uevent line {}",
+                        String::from_utf8_lossy(raw_line)
+                    );
+                }
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::server::system::detect_initramfs_type;

From 69461424c49f3f8432962cf48930b6abc2fedc18 Mon Sep 17 00:00:00 2001
From: Ilya Zlobintsev
Date: Fri, 17 Jan 2025 22:48:17 +0200
Subject: [PATCH 3/3] feat: wait for a period until the last drm event before reload

---
 lact-daemon/src/lib.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lact-daemon/src/lib.rs b/lact-daemon/src/lib.rs
index 339b4c60..bdb3a435 100644
--- a/lact-daemon/src/lib.rs
+++ b/lact-daemon/src/lib.rs
@@ -20,6 +20,7 @@ use std::time::Instant;
 use std::{os::unix::net::UnixStream as StdUnixStream, time::Duration};
 use tokio::net::UnixStream;
 use tokio::sync::Notify;
+use tokio::time::timeout;
 use tokio::{
     runtime,
     signal::unix::{signal, SignalKind},
@@ -35,6 +36,7 @@ pub const AMDGPU_FAMILY_GC_11_0_0: u32 = 145;
 pub use server::system::MODULE_CONF_PATH;
 
 const MIN_SYSTEM_UPTIME_SECS: f32 = 15.0;
+const DRM_EVENT_TIMEOUT_PERIOD_MS: u64 = 100;
 const SHUTDOWN_SIGNALS: [SignalKind; 4] = [
     SignalKind::terminate(),
     SignalKind::interrupt(),
@@ -151,6 +153,16 @@ async fn listen_device_events(handler: Handler) {
     loop {
         notify.notified().await;
+
+        // Wait until the timeout has passed with no new events coming in
+        while timeout(
+            Duration::from_millis(DRM_EVENT_TIMEOUT_PERIOD_MS),
+            notify.notified(),
+        )
+        .await
+        .is_ok()
+        {}
+
         info!("got kernel drm subsystem event, reloading GPUs");
         handler.reload_gpus().await;
     }
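
A note on the locking pattern introduced in patch 1: controller_by_id now returns a read guard mapped down to a single entry of the RwLock-protected controller map, so a caller keeps the map read-locked only for as long as it holds a reference to the one controller it asked for, while reload_gpus in patch 2 can take the write half to swap the whole map out. Below is a minimal, self-contained sketch of that mapped-guard technique built on tokio's RwLockReadGuard::try_map; the Registry and Controller names, the Dummy impl, and the example id are illustrative only and not taken from the LACT codebase.

use std::collections::BTreeMap;

use anyhow::anyhow;
use tokio::sync::{RwLock, RwLockReadGuard};

trait Controller {
    fn name(&self) -> &str;
}

struct Registry {
    controllers: RwLock<BTreeMap<String, Box<dyn Controller>>>,
}

impl Registry {
    // Map the whole-map read guard down to one entry; the map stays
    // read-locked until the caller drops the returned guard.
    async fn by_id(&self, id: &str) -> anyhow::Result<RwLockReadGuard<'_, dyn Controller>> {
        let guard = self.controllers.read().await;
        RwLockReadGuard::try_map(guard, |map| map.get(id).map(Box::as_ref))
            .map_err(|_| anyhow!("Controller '{id}' not found"))
    }
}

struct Dummy;

impl Controller for Dummy {
    fn name(&self) -> &str {
        "dummy"
    }
}

#[tokio::main(flavor = "current_thread")]
async fn main() -> anyhow::Result<()> {
    let registry = Registry {
        controllers: RwLock::new(BTreeMap::from([(
            "10DE:2684".to_owned(),
            Box::new(Dummy) as Box<dyn Controller>,
        )])),
    };

    // The guard derefs straight to the trait object behind the lock.
    let controller = registry.by_id("10DE:2684").await?;
    println!("found controller: {}", controller.name());
    Ok(())
}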
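
Patch 3's debounce loop also deserves a standalone illustration: after the first drm uevent wakes the task, it keeps absorbing further notifications until a full quiet period passes with none arriving, so a burst of events (as produced by a GPU reset or device re-enumeration) results in a single reload rather than one per uevent. The sketch below shows the same Notify-plus-timeout idea in isolation; the 100 ms period mirrors DRM_EVENT_TIMEOUT_PERIOD_MS from the patch, while the function names, the simulated producer, and the println placeholder are illustrative.

use std::{sync::Arc, time::Duration};

use tokio::{sync::Notify, time::timeout};

const DEBOUNCE_PERIOD: Duration = Duration::from_millis(100);

// Wait for one notification, then swallow any follow-up notifications that
// arrive within the debounce period, and only act once things go quiet.
async fn debounced_reload(notify: Arc<Notify>) {
    loop {
        notify.notified().await;

        // Ok(()) means another event arrived before the period elapsed;
        // keep looping until the timeout finally fires with no new event.
        while timeout(DEBOUNCE_PERIOD, notify.notified()).await.is_ok() {}

        println!("event burst settled, reloading once");
    }
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let notify = Arc::new(Notify::new());

    // Simulate a burst of kernel uevents followed by silence.
    let producer = notify.clone();
    tokio::spawn(async move {
        for _ in 0..5 {
            producer.notify_one();
            tokio::time::sleep(Duration::from_millis(10)).await;
        }
    });

    // Run the consumer long enough to observe exactly one reload.
    let _ = timeout(Duration::from_millis(500), debounced_reload(notify)).await;
}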