Skip to content

Commit

Permalink
Merge pull request #5368 from vpodzime/3.18.x-locks_killing
Browse files Browse the repository at this point in the history
[3.18.x] CFE-3982: Random process killing
  • Loading branch information
vpodzime authored Nov 8, 2023
2 parents 0cc7d1a + 995d035 commit d229fb8
Show file tree
Hide file tree
Showing 11 changed files with 76 additions and 53 deletions.
15 changes: 9 additions & 6 deletions cf-check/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,16 +277,16 @@ static void print_struct_or_string(
{
if (structs)
{
if (StringEndsWith(file, "cf_lastseen.lmdb")
if (StringContains(file, "cf_lastseen.lmdb")
&& StringStartsWith(key.mv_data, "q"))
{
print_struct_lastseen_quality(value, strip_strings);
}
else if (StringEndsWith(file, "cf_lock.lmdb"))
else if (StringContains(file, "cf_lock.lmdb"))
{
print_struct_lock_data(value, strip_strings);
}
else if (StringEndsWith(file, "cf_observations.lmdb"))
else if (StringContains(file, "cf_observations.lmdb"))
{
if (StringEqual(key.mv_data, "DATABASE_AGE"))
{
Expand All @@ -299,15 +299,18 @@ static void print_struct_or_string(
print_struct_averages(value, strip_strings, tskey_filename);
}
}
else if (StringEndsWith(file, "history.lmdb"))
else if (StringEqual(file, "history.lmdb") ||
StringEndsWith(file, FILE_SEPARATOR_STR"history.lmdb") ||
StringEqual(file, "history.lmdb.backup") ||
StringEndsWith(file, FILE_SEPARATOR_STR"history.lmdb.backup"))
{
print_struct_averages(value, strip_strings, tskey_filename);
}
else if (StringEndsWith(file, "cf_state.lmdb"))
else if (StringContains(file, "cf_state.lmdb"))
{
print_struct_persistent_class(value, strip_strings);
}
else if (StringEndsWith(file, "nova_agent_execution.lmdb"))
else if (StringContains(file, "nova_agent_execution.lmdb"))
{
if (StringEqual(key.mv_data, "delta_gavr"))
{
Expand Down
105 changes: 66 additions & 39 deletions libpromises/locks.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,46 +95,10 @@ static void CopyLockDatabaseAtomically(const char *from, const char *to,
const char *from_pretty_name,
const char *to_pretty_name);

static void RestoreLockDatabase(void);

static void VerifyThatDatabaseIsNotCorrupt_once(void)
{
int uptime = GetUptimeSeconds(time(NULL));
if (uptime <= 0)
{
Log(LOG_LEVEL_VERBOSE,
"Not able to determine uptime when verifying lock database. "
"Will assume the database is in order.");
return;
}

char *db_path = DBIdToPath(dbid_locks);
struct stat statbuf;
if (stat(db_path, &statbuf) == 0)
{
if (statbuf.st_mtime < time(NULL) - uptime)
{
// We have rebooted since the database was last updated.
// Restore it from our backup.
RestoreLockDatabase();
}
}

free(db_path);
}

static void VerifyThatDatabaseIsNotCorrupt(void)
{
static pthread_once_t uptime_verified = PTHREAD_ONCE_INIT;
pthread_once(&uptime_verified, &VerifyThatDatabaseIsNotCorrupt_once);
}

CF_DB *OpenLock()
{
CF_DB *dbp;

VerifyThatDatabaseIsNotCorrupt();

if (!OpenDB(&dbp, dbid_locks))
{
return NULL;
Expand Down Expand Up @@ -554,6 +518,63 @@ static void PromiseTypeString(char *dst, size_t dst_size, const Promise *pp)
}
}

/**
* A helper best-effort function to prevent us from killing non CFEngine
* processes with matching PID-start_time combinations **when/where it's easy to
* check**.
*/
static bool IsCfengineLockHolder(pid_t pid)
{
char procfile[PATH_MAX];
snprintf(procfile, PATH_MAX, "/proc/%ju/comm", (uintmax_t) pid);
int f = open(procfile, O_RDONLY);
/* On platforms where /proc doesn't exist, we would have to do a more
complicated check probably not worth it in this helper best-effort
function. */
if (f == -1)
{
/* assume true where we cannot check */
return true;
}

/* more than any possible CFEngine lock holder's name */
char command[32] = {0};
ssize_t n_read = FullRead(f, command, sizeof(command));
close(f);
if ((n_read <= 1) || (n_read == sizeof(command)))
{
Log(LOG_LEVEL_VERBOSE, "Failed to get command for process %ju", (uintmax_t) pid);
/* assume true where we cannot check */
return true;
}
if (command[n_read - 1] == '\n')
{
command[n_read - 1] = '\0';
}

/* potential CFEngine lock holders (others like cf-net, cf-key,... are not
* supposed/expected to be lock holders) */
const char *const cfengine_procs[] = {
"cf-promises",
"lt-cf-agent", /* when running from a build with 'libtool --mode=execute' */
"cf-agent",
"cf-execd",
"cf-serverd",
"cf-monitord",
"cf-hub",
NULL
};
for (size_t i = 0; cfengine_procs[i] != NULL; i++)
{
if (StringEqual(cfengine_procs[i], command))
{
return true;
}
}
Log(LOG_LEVEL_DEBUG, "'%s' not considered a CFEngine process", command);
return false;
}

static bool KillLockHolder(const char *lock)
{
bool ret;
Expand Down Expand Up @@ -587,6 +608,13 @@ static bool KillLockHolder(const char *lock)

CloseLock(dbp);

if (!IsCfengineLockHolder(lock_data.pid)) {
Log(LOG_LEVEL_VERBOSE,
"Lock holder with PID %ju was replaced by a non CFEngine process, ignoring request to kill it",
(uintmax_t) lock_data.pid);
return true;
}

if (GracefulTerminate(lock_data.pid, lock_data.process_start_time))
{
Log(LOG_LEVEL_INFO,
Expand Down Expand Up @@ -1074,11 +1102,10 @@ void GetLockName(char *lockname, const char *locktype,
}
}

static void RestoreLockDatabase(void)
void RestoreLockDatabase(void)
{
// We don't do any locking here (since we can't trust the database), but
// this should be right after bootup, so we should be the only one.
// Worst case someone else will just copy the same file to the same
// worst case someone else will just copy the same file to the same
// location.
char *db_path = DBIdToPath(dbid_locks);
char *db_path_backup;
Expand Down
1 change: 1 addition & 0 deletions libpromises/locks.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ void GetLockName(char *lockname, const char *locktype,
const char *base, const Rlist *params);
void PurgeLocks(void);
void BackupLockDatabase(void);
void RestoreLockDatabase(void);

// Used in enterprise/nova code:
CF_DB *OpenLock();
Expand Down
1 change: 0 additions & 1 deletion misc/systemd/cf-apache.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ UMask=0177

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-execd.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,3 @@ KillMode=process

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-hub.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ RestartSec=10

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-monitord.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ RestartSec=10

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-postgres.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,3 @@ ExecReload=@bindir@/pg_ctl -w -D ${PGDATA} -l /var/log/postgresql.log reload -m

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-reactor.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ RestartSec=10

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
1 change: 0 additions & 1 deletion misc/systemd/cf-runalerts.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,4 @@ RestartSec=10

[Install]
WantedBy=multi-user.target
WantedBy=cfengine3.service
WantedBy=cf-postgres.service
1 change: 0 additions & 1 deletion misc/systemd/cf-serverd.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,3 @@ RestartSec=10

[Install]
WantedBy=network-online.target
WantedBy=cfengine3.service

0 comments on commit d229fb8

Please sign in to comment.