From 31adfcf3f673f5ce91501802f14bbf8161c7897f Mon Sep 17 00:00:00 2001 From: Herbert Wartens Date: Tue, 15 Sep 2020 22:55:58 -0700 Subject: [PATCH 1/2] Add --nodown Option to Exclude Nodes That Are Down Add in an option that is similar to the -v option in pdsh. The idea is to skip all hosts that are currently down for one reason or another. The groups.conf file would have something like this in it: [genders] map: nodeattr -n $GROUP all: nodeattr -n -A list: nodeattr -l down: whatsup -n -d || /bin/true Example usage without -D: > clush -a -b /path/to/command.sh host8: mcmd: connect failed: No route to host host12: mcmd: connect failed: No route to host clush: host[8,12] (2): exited with exit code 1 --------------- host[1-7,9-11] (10) --------------- Hello World Example usage with -D: > clush -D -a -b /path/to/command.sh --------------- host[1-7,9-11] (10) --------------- Hello World --- conf/groups.conf.d/genders.conf.example | 1 + doc/man/man1/clush.1 | 3 +++ doc/man/man5/groups.conf.5 | 7 +++++- doc/sphinx/config.rst | 1 + doc/txt/clush.txt | 1 + lib/ClusterShell/CLI/Clush.py | 5 ++++ lib/ClusterShell/CLI/OptionParser.py | 2 ++ lib/ClusterShell/NodeSet.py | 18 ++++++++++++++ lib/ClusterShell/NodeUtils.py | 32 ++++++++++++++++++++----- 9 files changed, 63 insertions(+), 7 deletions(-) diff --git a/conf/groups.conf.d/genders.conf.example b/conf/groups.conf.d/genders.conf.example index 02069127..692fca2d 100644 --- a/conf/groups.conf.d/genders.conf.example +++ b/conf/groups.conf.d/genders.conf.example @@ -9,4 +9,5 @@ map: nodeattr -n $GROUP all: nodeattr -n ALL list: nodeattr -l +down: whatsup -n -d || /bin/true diff --git a/doc/man/man1/clush.1 b/doc/man/man1/clush.1 index 8fd94f56..588b3363 100644 --- a/doc/man/man1/clush.1 +++ b/doc/man/man1/clush.1 @@ -185,6 +185,9 @@ exclude nodes from the node list .B \-a\fP,\fB \-\-all run command on all nodes .TP +.B \-D\fP,\fB \-\-nodown +exclude nodes that are down +.TP .BI \-g \ GROUP\fP,\fB \ \-\-group\fB= GROUP run 
command on a group of nodes .TP diff --git a/doc/man/man5/groups.conf.5 b/doc/man/man5/groups.conf.5 index 43d24c6d..832cacfc 100644 --- a/doc/man/man5/groups.conf.5 +++ b/doc/man/man5/groups.conf.5 @@ -64,7 +64,7 @@ Global configuration options. There should be only one Main section. .B \fIGroup_source\fP The \fIGroup_source\fP section(s) define the configuration for each node group source (or namespace). This configuration consists in external commands -definition (map, all, list and reverse). +definition (map, all, list, down, and reverse). .UNINDENT .sp Only \fIGroup_source\fP section(s) are allowed in additional configuration files. @@ -123,6 +123,11 @@ Optional external shell command that should return the list of all groups for this group source (separated by space characters or by carriage returns). .TP +.B down +Optional external shell command that should return the list of all hosts +that are down for this group source (separated by space characters or by +carriage returns). +.TP .B reverse Optional external shell command used to find the group(s) of a single node. The variable $NODE is previously replaced. 
If this upcall is not diff --git a/doc/sphinx/config.rst b/doc/sphinx/config.rst index 135021ba..bac567be 100644 --- a/doc/sphinx/config.rst +++ b/doc/sphinx/config.rst @@ -148,6 +148,7 @@ groups are bound to the source named *genders* by default:: map: nodeattr -n $GROUP all: nodeattr -n ALL list: nodeattr -l + down: whatsup -n -d || /bin/true [slurm] map: sinfo -h -o "%N" -p $GROUP diff --git a/doc/txt/clush.txt b/doc/txt/clush.txt index 7c070cee..dae21145 100644 --- a/doc/txt/clush.txt +++ b/doc/txt/clush.txt @@ -140,6 +140,7 @@ Selecting target nodes: -w NODES nodes where to run the command -x NODES exclude nodes from the node list -a, --all run command on all nodes + -D, --nodown exclude nodes that are down -g GROUP, --group=GROUP run command on a group of nodes -X GROUP exclude nodes from this group diff --git a/lib/ClusterShell/CLI/Clush.py b/lib/ClusterShell/CLI/Clush.py index 8218c97f..04b71dc8 100755 --- a/lib/ClusterShell/CLI/Clush.py +++ b/lib/ClusterShell/CLI/Clush.py @@ -922,6 +922,11 @@ def main(): msg = "Picked random nodes: %s" % nodeset_base print(Display.COLOR_RESULT_FMT % msg) + # If we need to remove nodes that are down do it here + if options.nodown: + down = NodeSet.fromdown() + nodeset_base.difference_update(down) + # Set open files limit. 
set_fdlimit(config.fd_max, display) diff --git a/lib/ClusterShell/CLI/OptionParser.py b/lib/ClusterShell/CLI/OptionParser.py index 449d6264..46c09613 100644 --- a/lib/ClusterShell/CLI/OptionParser.py +++ b/lib/ClusterShell/CLI/OptionParser.py @@ -106,6 +106,8 @@ def install_nodes_options(self): optgrp.add_option("-x", action="append", type="safestring", dest="exclude", metavar="NODES", help="exclude nodes from the node list") + optgrp.add_option("-D", "--nodown", action="store_true", dest="nodown", + help="exclude down nodes from the node list") optgrp.add_option("-a", "--all", action="store_true", dest="nodes_all", help="run command on all nodes") optgrp.add_option("-g", "--group", action="append", type="safestring", diff --git a/lib/ClusterShell/NodeSet.py b/lib/ClusterShell/NodeSet.py index 8faf1590..591b12e5 100644 --- a/lib/ClusterShell/NodeSet.py +++ b/lib/ClusterShell/NodeSet.py @@ -1285,6 +1285,24 @@ def fromall(cls, groupsource=None, autostep=None, resolver=None): raise NodeSetExternalError(errmsg) return inst + @classmethod + def fromdown(cls, groupsource=None, autostep=None, resolver=None): + """Class method that returns a new NodeSet of down nodes from optional + groupsource.""" + inst = NodeSet(autostep=autostep, resolver=resolver) + try: + if not inst._resolver: + raise NodeSetExternalError("Group resolver is not defined") + else: + # build a nodeset from the down nodes found by resolver + down_nodes = inst._parser.group_resolver.down_nodes(groupsource) + inst = NodeSet.fromlist(down_nodes) + except NodeUtils.GroupResolverError as exc: + errmsg = "Group source error (%s: %s)" % (exc.__class__.__name__, + exc) + raise NodeSetExternalError(errmsg) + return inst + def __getstate__(self): """Called when pickling: remove references to group resolver.""" odict = self.__dict__.copy() diff --git a/lib/ClusterShell/NodeUtils.py b/lib/ClusterShell/NodeUtils.py index 82cfa011..9bb490b0 100644 --- a/lib/ClusterShell/NodeUtils.py +++ b/lib/ClusterShell/NodeUtils.py 
@@ -160,8 +160,8 @@ class UpcallGroupSource(GroupSource): """ def __init__(self, name, map_upcall, all_upcall=None, - list_upcall=None, reverse_upcall=None, cfgdir=None, - cache_time=None): + list_upcall=None, down_upcall=None, reverse_upcall=None, + cfgdir=None, cache_time=None): GroupSource.__init__(self, name) self.verbosity = 0 # deprecated self.cfgdir = cfgdir @@ -174,6 +174,8 @@ def __init__(self, name, map_upcall, all_upcall=None, self.upcalls['all'] = all_upcall if list_upcall: self.upcalls['list'] = list_upcall + if down_upcall: + self.upcalls['down'] = down_upcall if reverse_upcall: self.upcalls['reverse'] = reverse_upcall self.has_reverse = True @@ -192,6 +194,7 @@ def clear_cache(self): """ self._cache = { 'map': {}, + 'down': {}, 'reverse': {} } @@ -224,12 +227,12 @@ def _upcall_cache(self, upcall, cache, key, **args): raise GroupSourceNoUpcall(upcall, self) # Purge expired data from cache - if key in cache and cache[key][1] < time.time(): + if key in cache and cache[key]: self.logger.debug("PURGE EXPIRED (%d)'%s'", cache[key][1], key) del cache[key] # Fetch the data if unknown of just purged - if key not in cache: + if key not in cache or not cache[key]: cache_expiry = time.time() + self.cache_time # $CFGDIR and $SOURCE always replaced args['CFGDIR'] = self.cfgdir @@ -252,6 +255,12 @@ def resolv_list(self): """ return self._upcall_cache('list', self._cache, 'list') + def resolv_down(self): + """ + Return a list of all nodes that are down in this group source. + """ + return self._upcall_cache('down', self._cache, 'down') + def resolv_all(self): """ Return the content of special group ALL, using the cached value @@ -496,6 +505,13 @@ def all_nodes(self, namespace=None): source = self._source(namespace) return self._list_nodes(source, 'all') + def down_nodes(self, namespace=None): + """ + Find all down nodes. You may specify an optional namespace. 
+ """ + source = self._source(namespace) + return self._list_nodes(source, 'down') + def grouplist(self, namespace=None): """ Get full group list. You may specify an optional @@ -653,11 +669,13 @@ def _sources_from_cfg(self, cfg, cfgdir): if srcname != self.SECTION_MAIN: # only map is a mandatory upcall map_upcall = cfg.get(section, 'map', raw=True) - all_upcall = list_upcall = reverse_upcall = ctime = None + all_upcall = list_upcall = down_upcall = reverse_upcall = ctime = None if cfg.has_option(section, 'all'): all_upcall = cfg.get(section, 'all', raw=True) if cfg.has_option(section, 'list'): list_upcall = cfg.get(section, 'list', raw=True) + if cfg.has_option(section, 'down'): + down_upcall = cfg.get(section, 'down', raw=True) if cfg.has_option(section, 'reverse'): reverse_upcall = cfg.get(section, 'reverse', raw=True) @@ -665,9 +683,11 @@ def _sources_from_cfg(self, cfg, cfgdir): ctime = float(cfg.get(section, 'cache_time', raw=True)) # add new group source - self.add_source(UpcallGroupSource(srcname, map_upcall, + self.add_source(UpcallGroupSource(srcname, + map_upcall, all_upcall, list_upcall, + down_upcall, reverse_upcall, cfgdir, ctime)) except (NoSectionError, NoOptionError, ValueError) as exc: From 65485e2dd23ee67db322f48a42ad1916c1d7af02 Mon Sep 17 00:00:00 2001 From: Herbert Wartens Date: Wed, 16 Sep 2020 07:52:06 -0700 Subject: [PATCH 2/2] Harden UpcallGroupSource Initialization Use named parameters instead of relying on position when initializing UpcallGroupSource class. 
--- lib/ClusterShell/NodeUtils.py | 11 ++++++----- tests/NodeSetGroupTest.py | 6 ++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/ClusterShell/NodeUtils.py b/lib/ClusterShell/NodeUtils.py index 9bb490b0..6cda1304 100644 --- a/lib/ClusterShell/NodeUtils.py +++ b/lib/ClusterShell/NodeUtils.py @@ -685,11 +685,12 @@ def _sources_from_cfg(self, cfg, cfgdir): # add new group source self.add_source(UpcallGroupSource(srcname, map_upcall, - all_upcall, - list_upcall, - down_upcall, - reverse_upcall, - cfgdir, ctime)) + all_upcall=all_upcall, + list_upcall=list_upcall, + down_upcall=down_upcall, + reverse_upcall=reverse_upcall, + cfgdir=cfgdir, + cache_time=ctime)) except (NoSectionError, NoOptionError, ValueError) as exc: raise GroupResolverConfigError(str(exc)) diff --git a/tests/NodeSetGroupTest.py b/tests/NodeSetGroupTest.py index e740c535..f0df801e 100644 --- a/tests/NodeSetGroupTest.py +++ b/tests/NodeSetGroupTest.py @@ -1346,8 +1346,10 @@ def __init__(self, name, data): reverse_upcall = None if 'reverse' in data: reverse_upcall = 'fake_reverse' - UpcallGroupSource.__init__(self, name, "fake_map", all_upcall, - list_upcall, reverse_upcall) + UpcallGroupSource.__init__(self, name, "fake_map", + all_upcall=all_upcall, + list_upcall=list_upcall, + reverse_upcall=reverse_upcall) self._data = data def _upcall_read(self, cmdtpl, args=dict()):