From bed65646857fd75f0e5648a0d5edbe21d264cd1e Mon Sep 17 00:00:00 2001 From: Vratislav Podzimek Date: Mon, 6 Nov 2023 17:44:04 +0100 Subject: [PATCH 1/5] Do not add our services to cfengine3.service.wants on enable The `cfengine3.service` has `Wants` on all our services which ensures they are started when the `cfengine3.service` starts. If an individual service is enabled with `systemctl enable`, it should only be added to the respective systemd target in which it should start. (cherry picked from commit 4e661a70080b1c3786a58bdfaf7d0149d3e581f5) --- misc/systemd/cf-apache.service.in | 1 - misc/systemd/cf-execd.service.in | 1 - misc/systemd/cf-hub.service.in | 1 - misc/systemd/cf-monitord.service.in | 1 - misc/systemd/cf-postgres.service.in | 1 - misc/systemd/cf-reactor.service.in | 1 - misc/systemd/cf-runalerts.service.in | 1 - misc/systemd/cf-serverd.service.in | 1 - 8 files changed, 8 deletions(-) diff --git a/misc/systemd/cf-apache.service.in b/misc/systemd/cf-apache.service.in index 5b4373ec24..dea7c2356f 100644 --- a/misc/systemd/cf-apache.service.in +++ b/misc/systemd/cf-apache.service.in @@ -17,4 +17,3 @@ UMask=0177 [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-execd.service.in b/misc/systemd/cf-execd.service.in index 95567bcfe0..66f1e235e3 100644 --- a/misc/systemd/cf-execd.service.in +++ b/misc/systemd/cf-execd.service.in @@ -14,4 +14,3 @@ KillMode=process [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-hub.service.in b/misc/systemd/cf-hub.service.in index 1c8c62aa46..a6027ce206 100644 --- a/misc/systemd/cf-hub.service.in +++ b/misc/systemd/cf-hub.service.in @@ -17,4 +17,3 @@ RestartSec=10 [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-monitord.service.in b/misc/systemd/cf-monitord.service.in index 7ceb1e78d6..351090d6e6 100644 --- a/misc/systemd/cf-monitord.service.in +++ b/misc/systemd/cf-monitord.service.in @@ -13,4 +13,3 @@ RestartSec=10 [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-postgres.service.in b/misc/systemd/cf-postgres.service.in index 609c90c4fa..393efad1ee 100644 --- a/misc/systemd/cf-postgres.service.in +++ b/misc/systemd/cf-postgres.service.in @@ -33,4 +33,3 @@ ExecReload=@bindir@/pg_ctl -w -D ${PGDATA} -l /var/log/postgresql.log reload -m [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-reactor.service.in b/misc/systemd/cf-reactor.service.in index e50248258f..9e751abec5 100644 --- a/misc/systemd/cf-reactor.service.in +++ b/misc/systemd/cf-reactor.service.in @@ -17,4 +17,3 @@ RestartSec=10 [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service diff --git a/misc/systemd/cf-runalerts.service.in b/misc/systemd/cf-runalerts.service.in index 9a79480e02..e87f199347 100644 --- a/misc/systemd/cf-runalerts.service.in +++ b/misc/systemd/cf-runalerts.service.in @@ -20,5 +20,4 @@ RestartSec=10 [Install] WantedBy=multi-user.target -WantedBy=cfengine3.service WantedBy=cf-postgres.service diff --git a/misc/systemd/cf-serverd.service.in b/misc/systemd/cf-serverd.service.in index 50fb00b15f..03945f8fea 100644 --- a/misc/systemd/cf-serverd.service.in +++ b/misc/systemd/cf-serverd.service.in @@ -15,4 +15,3 @@ RestartSec=10 [Install] WantedBy=network-online.target -WantedBy=cfengine3.service From b16ef5fca5d94365056323859bb4b6ad233ba1c1 Mon Sep 17 00:00:00 2001 From: Vratislav Podzimek Date: Tue, 7 Nov 2023 09:54:59 +0100 Subject: [PATCH 2/5] Relax the condition for matching LMDB file in `cf-check dump` Instead of requiring that the file name ends with e.g. "cf_lock.lmdb", just check if the file name contains the string. This ensures that files like `cf_lock.lmdb.backup` match as well. And if someone renames their `cf_lastseen.lmdb` to `cf_lock.lmdb_cf_lastseen.lmdb` or something similar, it's not our fault they get wrong output. (cherry picked from commit 3e98366c36e4e7e991148108699e8bdd0494722b) --- cf-check/dump.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cf-check/dump.c b/cf-check/dump.c index d059612b46..2a87c57f3b 100644 --- a/cf-check/dump.c +++ b/cf-check/dump.c @@ -277,16 +277,16 @@ static void print_struct_or_string( { if (structs) { - if (StringEndsWith(file, "cf_lastseen.lmdb") + if (StringContains(file, "cf_lastseen.lmdb") && StringStartsWith(key.mv_data, "q")) { print_struct_lastseen_quality(value, strip_strings); } - else if (StringEndsWith(file, "cf_lock.lmdb")) + else if (StringContains(file, "cf_lock.lmdb")) { print_struct_lock_data(value, strip_strings); } - else if (StringEndsWith(file, "cf_observations.lmdb")) + else if (StringContains(file, "cf_observations.lmdb")) { if (StringEqual(key.mv_data, "DATABASE_AGE")) { @@ -299,15 +299,18 @@ static void print_struct_or_string( print_struct_averages(value, strip_strings, tskey_filename); } } - else if (StringEndsWith(file, "history.lmdb")) + else if (StringEqual(file, "history.lmdb") || + StringEndsWith(file, FILE_SEPARATOR_STR"history.lmdb") || + StringEqual(file, "history.lmdb.backup") || + StringEndsWith(file, FILE_SEPARATOR_STR"history.lmdb.backup")) { print_struct_averages(value, strip_strings, tskey_filename); } - else if (StringEndsWith(file, "cf_state.lmdb")) + else if (StringContains(file, "cf_state.lmdb")) { print_struct_persistent_class(value, strip_strings); } - else if (StringEndsWith(file, "nova_agent_execution.lmdb")) + else if (StringContains(file, "nova_agent_execution.lmdb")) { if (StringEqual(key.mv_data, "delta_gavr")) { From 6e78b0f4ae4c7486dbb13565cfdeb1c8f19b463b Mon Sep 17 00:00:00 2001 From: Vratislav Podzimek Date: Tue, 7 Nov 2023 09:59:48 +0100 Subject: [PATCH 3/5] Drop VerifyThatDatabaseIsNotCorrupt() on locks DB This function totally doesn't do what its name says. It only checks if the DB was modified in the current boot and if not, it restores it from the latest backup. Which is done and the end of every agent run. So this function effectively reverts the locks DB to the state of the last agent run on every boot dropping significant data like daemon locks. Ticket: CFE-3982 Changelog: cf_lock.lmdb is no longer restored from backup on every boot (cherry picked from commit db4eaf808db82462b4b47f250877f97e13dcb7ef) --- libpromises/locks.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/libpromises/locks.c b/libpromises/locks.c index e2eaeba12c..969fbab1ae 100644 --- a/libpromises/locks.c +++ b/libpromises/locks.c @@ -95,46 +95,10 @@ static void CopyLockDatabaseAtomically(const char *from, const char *to, const char *from_pretty_name, const char *to_pretty_name); -static void RestoreLockDatabase(void); - -static void VerifyThatDatabaseIsNotCorrupt_once(void) -{ - int uptime = GetUptimeSeconds(time(NULL)); - if (uptime <= 0) - { - Log(LOG_LEVEL_VERBOSE, - "Not able to determine uptime when verifying lock database. " - "Will assume the database is in order."); - return; - } - - char *db_path = DBIdToPath(dbid_locks); - struct stat statbuf; - if (stat(db_path, &statbuf) == 0) - { - if (statbuf.st_mtime < time(NULL) - uptime) - { - // We have rebooted since the database was last updated. - // Restore it from our backup. - RestoreLockDatabase(); - } - } - - free(db_path); -} - -static void VerifyThatDatabaseIsNotCorrupt(void) -{ - static pthread_once_t uptime_verified = PTHREAD_ONCE_INIT; - pthread_once(&uptime_verified, &VerifyThatDatabaseIsNotCorrupt_once); -} - CF_DB *OpenLock() { CF_DB *dbp; - VerifyThatDatabaseIsNotCorrupt(); - if (!OpenDB(&dbp, dbid_locks)) { return NULL; From 094e758d448d5e4b7013b905587b1151b5516003 Mon Sep 17 00:00:00 2001 From: Vratislav Podzimek Date: Tue, 7 Nov 2023 10:08:41 +0100 Subject: [PATCH 4/5] Make RestoreLockDatabase() a non-static function It's no longer being used in locks.c, but it can potentially be useful for being called explicitly. After all, it's a complementary function to BackupLockDatabase() which is also non-static. Ticket: CFE-3982 Changelog: None (cherry picked from commit d824b882cdc3f721a81cb6317e3230430767a9fc) --- libpromises/locks.c | 5 ++--- libpromises/locks.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libpromises/locks.c b/libpromises/locks.c index 969fbab1ae..0e56d425a3 100644 --- a/libpromises/locks.c +++ b/libpromises/locks.c @@ -1038,11 +1038,10 @@ void GetLockName(char *lockname, const char *locktype, } } -static void RestoreLockDatabase(void) +void RestoreLockDatabase(void) { // We don't do any locking here (since we can't trust the database), but - // this should be right after bootup, so we should be the only one. - // Worst case someone else will just copy the same file to the same + // worst case someone else will just copy the same file to the same // location. char *db_path = DBIdToPath(dbid_locks); char *db_path_backup; diff --git a/libpromises/locks.h b/libpromises/locks.h index 10c26fdbf1..6379c24f1f 100644 --- a/libpromises/locks.h +++ b/libpromises/locks.h @@ -38,6 +38,7 @@ void GetLockName(char *lockname, const char *locktype, const char *base, const Rlist *params); void PurgeLocks(void); void BackupLockDatabase(void); +void RestoreLockDatabase(void); // Used in enterprise/nova code: CF_DB *OpenLock(); From 995d03578b549acac2987a3080324f101ae349d6 Mon Sep 17 00:00:00 2001 From: Vratislav Podzimek Date: Tue, 7 Nov 2023 11:10:08 +0100 Subject: [PATCH 5/5] Only kill potential CFEngine lock holders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a non CFEngine process has a matching PID and start time, we shouldn't try to kill it because it's practically impossible that it is a real holder of one of our locks in cf_lock.lmdb. Most likely it's an unfortunate process that ended up with the matching PID and start time after a reboot. Ticket: CFE-3982 Changelog: Only CFEngine processes are now killed as expired lock owners Co-authored-by: BenoƮt Peccatte (cherry picked from commit 6234c8cf64c6e20632d2cd99344b0f6730c2e251) --- libpromises/locks.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/libpromises/locks.c b/libpromises/locks.c index 0e56d425a3..e295445816 100644 --- a/libpromises/locks.c +++ b/libpromises/locks.c @@ -518,6 +518,63 @@ static void PromiseTypeString(char *dst, size_t dst_size, const Promise *pp) } } +/** + * A helper best-effort function to prevent us from killing non CFEngine + * processes with matching PID-start_time combinations **when/where it's easy to + * check**. + */ +static bool IsCfengineLockHolder(pid_t pid) +{ + char procfile[PATH_MAX]; + snprintf(procfile, PATH_MAX, "/proc/%ju/comm", (uintmax_t) pid); + int f = open(procfile, O_RDONLY); + /* On platforms where /proc doesn't exist, we would have to do a more + complicated check probably not worth it in this helper best-effort + function. */ + if (f == -1) + { + /* assume true where we cannot check */ + return true; + } + + /* more than any possible CFEngine lock holder's name */ + char command[32] = {0}; + ssize_t n_read = FullRead(f, command, sizeof(command)); + close(f); + if ((n_read <= 1) || (n_read == sizeof(command))) + { + Log(LOG_LEVEL_VERBOSE, "Failed to get command for process %ju", (uintmax_t) pid); + /* assume true where we cannot check */ + return true; + } + if (command[n_read - 1] == '\n') + { + command[n_read - 1] = '\0'; + } + + /* potential CFEngine lock holders (others like cf-net, cf-key,... are not + * supposed/expected to be lock holders) */ + const char *const cfengine_procs[] = { + "cf-promises", + "lt-cf-agent", /* when running from a build with 'libtool --mode=execute' */ + "cf-agent", + "cf-execd", + "cf-serverd", + "cf-monitord", + "cf-hub", + NULL + }; + for (size_t i = 0; cfengine_procs[i] != NULL; i++) + { + if (StringEqual(cfengine_procs[i], command)) + { + return true; + } + } + Log(LOG_LEVEL_DEBUG, "'%s' not considered a CFEngine process", command); + return false; +} + static bool KillLockHolder(const char *lock) { bool ret; @@ -551,6 +608,13 @@ static bool KillLockHolder(const char *lock) CloseLock(dbp); + if (!IsCfengineLockHolder(lock_data.pid)) { + Log(LOG_LEVEL_VERBOSE, + "Lock holder with PID %ju was replaced by a non CFEngine process, ignoring request to kill it", + (uintmax_t) lock_data.pid); + return true; + } + if (GracefulTerminate(lock_data.pid, lock_data.process_start_time)) { Log(LOG_LEVEL_INFO,