diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ab70f9450..d334e556f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+exclude: monitor/dcgm/
repos:
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
diff --git a/Dockerfile b/Dockerfile
index 93fff3753..802cb93b5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ ARG BASE_IMAGE
ARG TRITONSDK_BASE_IMAGE
# DCGM version to install for Model Analyzer
-ENV DCGM_VERSION=2.4.7
+ENV DCGM_VERSION=3.2.6
# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
diff --git a/model_analyzer/device/gpu_device_factory.py b/model_analyzer/device/gpu_device_factory.py
index f28e36b3e..03f76115f 100755
--- a/model_analyzer/device/gpu_device_factory.py
+++ b/model_analyzer/device/gpu_device_factory.py
@@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None):
device_atrributes = dcgm_agent.dcgmGetDeviceAttributes(
dcgm_handle, device_id
).identifiers
- pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper()
- device_uuid = str(device_atrributes.uuid, encoding="utf-8")
- device_name = str(device_atrributes.deviceName, encoding="utf-8")
+ pci_bus_id = device_atrributes.pciBusId
+ device_uuid = device_atrributes.uuid
+ device_name = device_atrributes.deviceName
+
gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid)
self._devices.append(gpu_device)
diff --git a/model_analyzer/monitor/dcgm/DcgmDiag.py b/model_analyzer/monitor/dcgm/DcgmDiag.py
new file mode 100644
index 000000000..e9178895c
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmDiag.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+
+
+class DcgmDiag:
+
+ # Maps version codes to simple version values for range comparisons
+ _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}
+
+ def __init__(self,
+ gpuIds=None,
+ testNamesStr='',
+ paramsStr='',
+ verbose=True,
+ version=dcgm_structs.dcgmRunDiag_version):
+ # Make sure version is valid
+ if version not in DcgmDiag._versionMap:
+ raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
+ version)
+ self.version = version
+
+ if self.version == dcgm_structs.dcgmRunDiag_version7:
+ self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+ else:
+ self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()
+
+ self.numTests = 0
+ self.numParams = 0
+ self.SetVerbose(verbose)
+ if testNamesStr == '':
+ # default to a level 1 test
+ self.runDiagInfo.validate = 1
+ elif testNamesStr == '1':
+ self.runDiagInfo.validate = 1
+ elif testNamesStr == '2':
+ self.runDiagInfo.validate = 2
+ elif testNamesStr == '3':
+ self.runDiagInfo.validate = 3
+ elif testNamesStr == '4':
+ self.runDiagInfo.validate = 4
+ else:
+            # Make sure no number other than 1-4 was submitted
+ if testNamesStr.isdigit():
+ raise ValueError("'%s' is not a valid test name." %
+ testNamesStr)
+
+ # Copy to the testNames portion of the object
+ names = testNamesStr.split(',')
+ if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
+                err = 'DcgmDiag cannot initialize: %d test names were specified, exceeding the limit of %d.' %\
+ (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
+ raise ValueError(err)
+
+ for testName in names:
+ self.AddTest(testName)
+
+ if paramsStr != '':
+ params = paramsStr.split(';')
+ if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
+ err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
+ (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
+ raise ValueError(err)
+
+ for param in params:
+ self.AddParameter(param)
+
+ if gpuIds:
+ first = True
+ for gpu in gpuIds:
+ if first:
+ self.runDiagInfo.gpuList = str(gpu)
+ first = False
+ else:
+ self.runDiagInfo.gpuList = "%s,%s" % (
+ self.runDiagInfo.gpuList, str(gpu))
+
+ def SetVerbose(self, val):
+ if val == True:
+ self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
+ else:
+ self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
+
+ def UseFakeGpus(self):
+ self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList
+
+ def GetStruct(self):
+ return self.runDiagInfo
+
+ def AddParameter(self, parameterStr):
+ if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
+ err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
+ (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
+ raise ValueError(err)
+
+ index = 0
+ for c in parameterStr:
+ self.runDiagInfo.testParms[self.numParams][index] = ord(c)
+ index += 1
+
+ self.numParams += 1
+
+ def AddTest(self, testNameStr):
+ if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
+ err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
+ (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
+ raise ValueError(err)
+
+ index = 0
+ for c in testNameStr:
+ self.runDiagInfo.testNames[self.numTests][index] = ord(c)
+ index += 1
+
+ self.numTests += 1
+
+ def SetStatsOnFail(self, val):
+ if val == True:
+ self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL
+
+ def SetThrottleMask(self, value):
+ if DcgmDiag._versionMap[self.version] < 3:
+ raise ValueError(
+ "Throttle mask requires minimum version 3 for dcgmRunDiag.")
+ if isinstance(
+ value,
+ str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
+ raise ValueError("Throttle mask value '%s' exceeds max length %d." %
+ (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))
+
+ self.runDiagInfo.throttleMask = str(value)
+
+ def SetFailEarly(self, enable=True, checkInterval=5):
+ if DcgmDiag._versionMap[self.version] < 5:
+ raise ValueError(
+ "Fail early requires minimum version 5 for dcgmRunDiag.")
+ if not isinstance(checkInterval, int):
+ raise ValueError("Invalid checkInterval value: %s" % checkInterval)
+
+ if enable:
+ self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
+ self.runDiagInfo.failCheckInterval = checkInterval
+ else:
+ self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
+
+ def Execute(self, handle):
+ return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
+ self.version)
+
+ def SetStatsPath(self, statsPath):
+ if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
+ err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
+ (statsPath, dcgm_structs.DCGM_PATH_LEN)
+ raise ValueError(err)
+
+ self.runDiagInfo.statsPath = statsPath
+
+ def SetConfigFileContents(self, configFileContents):
+ if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
+ err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
+ % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
+ raise ValueError(err)
+
+ self.runDiagInfo.configFileContents = configFileContents
+
+ def SetDebugLogFile(self, logFileName):
+ if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
+ raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
+ % (logFileName, dcgm_structs.DCGM_FILE_LEN))
+
+ self.runDiagInfo.debugLogFile = logFileName
+
+ def SetDebugLevel(self, debugLevel):
+ if debugLevel < 0 or debugLevel > 5:
+            raise ValueError(
+                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
+                % debugLevel)
+
+ self.runDiagInfo.debugLevel = debugLevel
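+
+
+# Minimal usage sketch (an illustrative addition alongside this diff, not part
+# of the upstream DCGM bindings). It assumes nv-hostengine is reachable at
+# 127.0.0.1 and that GPU 0 exists; adjust both for a real system.
+if __name__ == "__main__":
+    from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle
+
+    handle = DcgmHandle(ipAddress="127.0.0.1")
+    diag = DcgmDiag(gpuIds=[0], testNamesStr="1")  # level-1 ("quick") validation
+    diag.SetFailEarly(enable=True, checkInterval=5)
+    response = diag.Execute(handle.handle)  # returns a c_dcgmDiagResponse_* struct
+    handle.Shutdown()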
diff --git a/model_analyzer/monitor/dcgm/DcgmFieldGroup.py b/model_analyzer/monitor/dcgm/DcgmFieldGroup.py
new file mode 100644
index 000000000..bcbe37035
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmFieldGroup.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+'''
+Class for managing a group of field IDs in the host engine.
+'''
+
+
+class DcgmFieldGroup:
+ '''
+ Constructor
+
+ dcgmHandle - DcgmHandle() instance to use for communicating with the host engine
+ name - Name of the field group to use within DCGM. This must be unique
+ fieldIds - Fields that are part of this group
+ fieldGroupId - If provided, this is used to initialize the object from an existing field group ID
+ '''
+
+ def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
+ fieldIds = fieldIds or []
+ self.name = name
+ self.fieldIds = fieldIds
+ self._dcgmHandle = dcgmHandle
+ self.wasCreated = False
+
+ #If the user passed in an ID, the field group already exists. Fetch live info
+ if fieldGroupId is not None:
+ self.fieldGroupId = fieldGroupId
+ fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(
+ self._dcgmHandle.handle, self.fieldGroupId)
+ self.name = fieldGroupInfo.fieldGroupName
+ self.fieldIds = fieldGroupInfo.fieldIds
+ else:
+ self.fieldGroupId = None #Assign here so the destructor doesn't fail if the call below fails
+ self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
+ self._dcgmHandle.handle, fieldIds, name)
+ self.wasCreated = True
+
+ '''
+ Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.
+ '''
+
+ def Delete(self):
+ if self.wasCreated and self.fieldGroupId is not None:
+ try:
+ try:
+ dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
+ self.fieldGroupId)
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_NO_DATA):
+ # someone may have deleted the group under us. That's ok.
+ pass
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+ # We lost our connection, but we're destructing this object anyway.
+ pass
+ except AttributeError as ae:
+ # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll
+                # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'". Ignore this.
+ pass
+ except TypeError as te:
+ # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we might
+                # get a TypeError: "'NoneType' object is not callable". Ignore this.
+ pass
+ self.fieldGroupId = None
+ self._dcgmHandle = None
+
+ #Destructor
+ def __del__(self):
+ self.Delete()
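+
+
+# Minimal usage sketch (an illustrative addition alongside this diff, not part
+# of the upstream DCGM bindings). Assumes an embedded host engine can be
+# started in this process; the field IDs are examples from dcgm_fields.
+if __name__ == "__main__":
+    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+    from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle
+
+    handle = DcgmHandle()  # ipAddress=None starts an embedded host engine
+    fieldGroup = DcgmFieldGroup(
+        handle, "example_fields",
+        [dcgm_fields.DCGM_FI_DEV_GPU_TEMP, dcgm_fields.DCGM_FI_DEV_POWER_USAGE])
+    print("Created field group id: %s" % fieldGroup.fieldGroupId)
+    fieldGroup.Delete()
+    handle.Shutdown()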
diff --git a/model_analyzer/monitor/dcgm/DcgmGroup.py b/model_analyzer/monitor/dcgm/DcgmGroup.py
new file mode 100644
index 000000000..834e102db
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmGroup.py
@@ -0,0 +1,815 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers
+from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle
+
+
+class DcgmGroupConfig:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Set configuration for this group
+
+ config should be an instance of dcgm_structs.c_dcgmDeviceConfig_v1
+
+ Will throw an exception on error
+ '''
+
+ def Set(self, config):
+ status = pydcgm.DcgmStatus()
+ ret = dcgm_structs.DCGM_ST_OK
+
+ try:
+ ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle,
+ self._groupId, config, status.handle)
+ except dcgm_structs.DCGMError as e:
+ pass
+
+ #Throw specific errors before return error
+ status.ThrowExceptionOnErrors()
+ #Throw an appropriate exception on error
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get configuration for this group
+
+ configType is a DCGM_CONFIG_? constant
+
+ Returns an array of dcgm_structs.c_dcgmDeviceConfig_v1 objects
+ Throws an exception on error
+ '''
+
+ def Get(self, configType):
+ status = pydcgm.DcgmStatus()
+
+ gpuIds = self._dcgmGroup.GetGpuIds()
+ configList = dcgm_agent.dcgmConfigGet(self._dcgmHandle.handle,
+ self._groupId, configType,
+ len(gpuIds), status.handle)
+ #Throw specific errors before return error
+ status.ThrowExceptionOnErrors()
+ return configList
+
+ '''
+ Enforce the configuration that has been set with Set()
+
+ Throws an exception on error
+ '''
+
+ def Enforce(self):
+ status = pydcgm.DcgmStatus()
+ ret = dcgm_structs.DCGM_ST_OK
+ try:
+ ret = dcgm_agent.dcgmConfigEnforce(self._dcgmHandle.handle,
+ self._groupId, status.handle)
+ except dcgm_structs.DCGMError as e:
+ pass
+
+ #Throw specific errors before return error
+ status.ThrowExceptionOnErrors()
+ #Throw an appropriate exception on error
+ dcgm_structs._dcgmCheckReturn(ret)
+
+
+class DcgmGroupSamples:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Tell DCGM to start recording samples for the given field group
+
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+ updateFreq: How often to update these fields in usec
+ maxKeepAge: How long to keep data for these fields in seconds
+ maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
+
+ Once the field collection is watched, it will update whenever the next update
+ loop occurs. If you want to query these values immediately, use
+ handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
+ '''
+
+ def WatchFields(self, fieldGroup, updateFreq, maxKeepAge, maxKeepSamples):
+ ret = dcgm_agent.dcgmWatchFields(self._dcgmHandle.handle, self._groupId,
+ fieldGroup.fieldGroupId, updateFreq,
+ maxKeepAge, maxKeepSamples)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ tell DCGM to stop recording samples for a given field group
+
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to unwatch.
+ '''
+
+ def UnwatchFields(self, fieldGroup):
+ ret = dcgm_agent.dcgmUnwatchFields(self._dcgmHandle.handle,
+ self._groupId,
+ fieldGroup.fieldGroupId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get the most recent values for each field in a field collection
+
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+ Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][0].value to access values
+ '''
+
+ def GetLatest(self, fieldGroup):
+ dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
+ self._dcgmHandle.handle, self._groupId)
+ dfvc.GetLatestValues(fieldGroup)
+ return dfvc
+
+ '''
+ Get the most recent values for each field in a field collection
+
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+ Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][0].value to access values
+ '''
+
+ def GetLatest_v2(self, fieldGroup):
+ dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
+ self._dcgmHandle.handle, self._groupId)
+ dfvec.GetLatestValues(fieldGroup)
+ return dfvec
+
+ '''
+ Get the new values for each field in a field collection since the last
+ collection.
+
+ dfvc: DcgmFieldValueCollection() instance. Will return a
+ DcgmFieldValueCollection with values since the one passed in.
+ Pass None for the first call to get one for subsequent calls.
+ On subsequent calls, pass what was returned.
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+ Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][*].value to access values
+ '''
+
+ def GetAllSinceLastCall(self, dfvc, fieldGroup):
+ if dfvc == None:
+ dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
+ self._dcgmHandle.handle, self._groupId)
+ dfvc.GetLatestValues(fieldGroup)
+ else:
+ # We used to expect at least one value (GetLatestValues), so this
+ # ensures we provide one at the risk of repetition. This should not
+ # happen if we call this function infrequently enough (slower than
+ # the sampling rate).
+ dfvc.GetAllSinceLastCall(fieldGroup)
+ if len(dfvc.values) == 0:
+ dfvc.GetLatestValues(fieldGroup)
+ return dfvc
+
+ '''
+ Gets more values for each field in a field entity collection
+
+ dfvec: DcgmFieldValueEntityCollection() instance. Will return a
+ DcgmFieldValueEntityCollection with values since the one passed
+ in. Pass None for the first call to get one for subsequent
+ calls. On subsequent calls, pass what was returned.
+
+ fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+ Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][*].value to access values
+ '''
+
+    def GetAllSinceLastCall_v2(self, dfvec, fieldGroup):
+        if dfvec == None:
+            dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
+                self._dcgmHandle.handle, self._groupId)
+            dfvec.GetLatestValues_v2(fieldGroup)
+ else:
+ dfvec.GetAllSinceLastCall_v2(fieldGroup)
+ # We used to expect at least one value (GetLatestValues), so this
+ # ensures we provide one at the risk of repetition. This should not
+ # happen if we call this function infrequently enough (slower than
+ # the sampling rate).
+ if len(dfvec.values) == 0:
+ dfvec.GetLatestValues_v2(fieldGroup)
+
+ return dfvec
+
+ '''
+ Convenience alias for DcgmHandle.UpdateAllFields(). All fields on the system will be updated, not
+ just this group's.
+ '''
+
+ def UpdateAllFields(self, waitForUpdate):
+ self._dcgmHandle.UpdateAllFields(waitForUpdate)
+
+
+class DcgmGroupHealth:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Enable health checks for this group
+
+ systems: A bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks to enable
+ updateInterval: How often DCGM should request new health data from the driver in usec
+ maxKeepAge: How long DCGM should keep health data around once it has been retrieved from the driver in seconds
+ '''
+
+ def Set(self, systems, updateInterval=None, maxKeepAge=None):
+ if updateInterval is None or maxKeepAge is None:
+ ret = dcgm_agent.dcgmHealthSet(self._dcgmHandle.handle,
+ self._groupId, systems)
+ else:
+ ret = dcgm_agent.dcgmHealthSet_v2(self._dcgmHandle.handle,
+ self._groupId, systems,
+ updateInterval, maxKeepAge)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Retrieve the current state of the DCGM health check system
+
+ Returns a bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks are currently enabled
+ '''
+
+ def Get(self):
+ systems = dcgm_agent.dcgmHealthGet(self._dcgmHandle.handle,
+ self._groupId)
+ return systems
+
+ '''
+ Check the configured watches for any errors/failures/warnings that have occurred
+ since the last time this check was invoked. On the first call, stateful information
+ about all of the enabled watches within a group is created but no error results are
+ provided. On subsequent calls, any error information will be returned.
+
+ @param version IN: Allows the caller to use an older version of this request. Should be
+ dcgm_structs.dcgmHealthResponse_version4
+
+ Returns a dcgm_structs.c_dcgmHealthResponse_* object that contains results for each GPU/entity
+ '''
+
+ def Check(self, version=dcgm_structs.dcgmHealthResponse_version4):
+ resp = dcgm_agent.dcgmHealthCheck(self._dcgmHandle.handle,
+ self._groupId, version)
+ return resp
+
+
+class DcgmGroupPolicy:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Get the current violation policy inside the policy manager. Given a groupId, a number of
+ policy structures are retrieved.
+
+ @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status of the operation. Pass it as None
+ if the detailed error information for the operation is not needed (default).
+
+ Returns a list of dcgm_structs.c_dcgmPolicy_v1 with the same length as the number of GPUs in the group.
+ The index of an entry corresponds to a given GPU ID in the group. Throws an exception on error.
+ '''
+
+ def Get(self, statusHandle=None):
+ if statusHandle:
+ statusHandle = statusHandle.handle
+ count = len(self._dcgmGroup.GetGpuIds())
+ if count <= 0:
+ raise pydcgm.DcgmException(
+ "This group has no GPUs, cannot retrieve policies")
+ return dcgm_agent.dcgmPolicyGet(self._dcgmHandle.handle, self._groupId,
+ count, statusHandle)
+
+ '''
+ Set the current violation policy inside the policy manager. Given the conditions within "policy",
+ if a violation has occurred, subsequent action(s) may be performed to either
+ report or contain the failure.
+
+ This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.
+
+ @param policy IN: dcgm_structs.c_dcgmPolicy_v1 that will be applied to all GPUs in the group
+
+ @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status for the operation. Pass it as
+ None if the detailed error information for the operation is not needed (default).
+
+ Returns Nothing. Throws an exception on error
+ '''
+
+ def Set(self, policy, statusHandle=None):
+ if statusHandle:
+ statusHandle = statusHandle.handle
+ dcgm_agent.dcgmPolicySet(self._dcgmHandle.handle, self._groupId, policy,
+ statusHandle)
+
+ '''
+ Register a function to be called when a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition)
+ has been violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after
+ DcgmPolicy.Trigger when in DCGM_OPERATION_MODE_MANUAL mode.
+ All callbacks are made within a separate thread.
+
+ This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.
+
+ @param condition IN: The set of conditions specified as an OR'd list
+ (see dcgm_structs.DCGM_POLICY_COND_*)
+ for which to register a callback function
+
+ @param beginCallback IN: A function that should be called should a violation occur. This
+ function will be called prior to any actions specified by the policy are taken.
+
+ @param finishCallback IN: A reference to a function that should be called should a violation occur.
+ This function will be called after any action specified by the policy are completed.
+
+ At least one callback must be provided that is not None.
+
+ Returns Nothing. Throws an exception on error.
+ '''
+
+ def Register(self, condition, beginCallback=None, finishCallback=None):
+ if beginCallback is None and finishCallback is None:
+ raise pydcgm.DcgmException(
+ "At least 1 callback must be provided to register that is not None"
+ )
+ dcgm_agent.dcgmPolicyRegister(self._dcgmHandle.handle, self._groupId,
+ condition, beginCallback, finishCallback)
+
+ '''
+ Unregister a function to be called for a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition) .
+ This function will unregister all callbacks for a given condition.
+
+ @param condition IN: The set of conditions specified as an OR'd list
+ (see dcgm_structs.DCGM_POLICY_COND_*)
+ for which to unregister a callback function
+
+ Returns Nothing. Throws an exception on error.
+ '''
+
+ def Unregister(self, condition):
+ dcgm_agent.dcgmPolicyUnregister(self._dcgmHandle.handle, self._groupId,
+ condition)
+
+ '''
+ Inform the policy manager loop to perform an iteration and trigger the callbacks of any
+ registered functions. Callback functions will be called from a separate thread as the calling function.
+
+ Note: The GPU monitoring and management agent must call this method periodically if the operation
+ mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization
+ (\ref DcgmHandle.__init__).
+
+ Returns Nothing. Throws an exception if there is a generic error that the
+ policy manager was unable to perform another iteration.
+ '''
+
+ def Trigger(self):
+ dcgm_agent.dcgmPolicyTrigger(self._dcgmHandle.handle)
+
+
+class DcgmGroupDiscovery:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Get the topology for this group
+
+ Returns a c_dcgmGroupTopology_v1 object representing the topology for this group
+ '''
+
+ def GetTopology(self):
+ return dcgm_agent.dcgmGetGroupTopology(self._dcgmHandle.handle,
+ self._groupId)
+
+
+class DcgmGroupStats:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Tell DCGM to start recording samples for fields returned from GetPidInfo()
+
+ updateFreq: How often to update these fields in usec
+ maxKeepAge: How long to keep data for these fields in seconds
+ maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
+
+ Once the field collection is watched, it will update whenever the next update
+ loop occurs. If you want to query these values immediately, use
+ handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
+ '''
+
+ def WatchPidFields(self, updateFreq, maxKeepAge, maxKeepSamples):
+ ret = dcgm_agent.dcgmWatchPidFields(self._dcgmHandle.handle,
+ self._groupId, updateFreq,
+ maxKeepAge, maxKeepSamples)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get process stats for a given PID on this GPU group
+
+ You must call WatchPidFields() before this query for this method to return any results
+
+ Returns a dcgm_structs.c_dcgmPidInfo_v2 structure
+ '''
+
+ def GetPidInfo(self, pid):
+ return dcgm_agent.dcgmGetPidInfo(self._dcgmHandle.handle, self._groupId,
+ pid)
+
+ '''
+ Tell DCGM to start recording samples for fields returned from GetJobStats()
+
+ updateFreq: How often to update these fields in usec
+ maxKeepAge: How long to keep data for these fields in seconds
+ maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
+
+ Once the fields are watched, they will update whenever the next update
+ loop occurs. If you want to query these values immediately, use
+ handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
+ '''
+
+ def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples):
+ ret = dcgm_agent.dcgmWatchJobFields(self._dcgmHandle.handle,
+ self._groupId, updateFreq,
+ maxKeepAge, maxKeepSamples)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Start collecting stats for a named job for this GPU group
+
+ Calling this will tell DCGM to start tracking stats for the given jobId. Stats tracking
+ will end when StopJobStats() is called
+
+ You must call WatchJobFields() before this call to tell DCGM to start sampling the fields
+ that are returned from GetJobStats().
+
+ jobId is a unique string identifier for this job. An exception will be thrown if this is not unique
+
+ Returns Nothing (Will throw exception on error)
+ '''
+
+ def StartJobStats(self, jobId):
+ ret = dcgm_agent.dcgmJobStartStats(self._dcgmHandle.handle,
+ self._groupId, jobId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Stop collecting stats for a named job
+
+ Calling this will tell DCGM to stop collecting stats for a job that was previously started
+ with StartJobStats().
+
+ jobId is the unique string that was passed as jobId to StartJobStats.
+
+ Returns Nothing (Will throw exception on error)
+ '''
+
+ def StopJobStats(self, jobId):
+ ret = dcgm_agent.dcgmJobStopStats(self._dcgmHandle.handle, jobId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get stats for a job that was started with StartJobStats. If StopJobStats has not been called yet,
+    this will get stats from when the job started until now. If StopJobStats was called prior to
+ this, the returned Stats will go from when StartJobStats was called to when StopJobStats was called.
+
+ jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
+
+ Returns a dcgm_structs.c_dcgmJobInfo_v3 structure. Throws an exception on error
+ '''
+
+ def GetJobStats(self, jobId):
+ ret = dcgm_agent.dcgmJobGetStats(self._dcgmHandle.handle, jobId)
+ return ret
+
+ '''
+ This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer
+ be able to call GetJobStats() on this jobId. However, you will be able to reuse jobId after
+ this call.
+
+ jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
+
+ Returns Nothing (Will throw exception on error)
+ '''
+
+ def RemoveJob(self, jobId):
+ ret = dcgm_agent.dcgmJobRemove(self._dcgmHandle.handle, jobId)
+ return ret
+
+ '''
+ This API tells DCGM to stop tracking all jobs. After this call, you will no longer
+    be able to call dcgmJobGetStats() for any jobs until you call StartJobStats() again.
+ You will be able to reuse any previously-used jobIds after this call.
+
+ Returns Nothing (Will throw exception on error)
+ '''
+
+ def RemoveAllJobs(self):
+ ret = dcgm_agent.dcgmJobRemoveAll(self._dcgmHandle.handle)
+ return ret
+
+
+class DcgmGroupAction:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ '''
+ Inform the action manager to perform a manual validation of a group of GPUs on the system
+
+ validate is what sort of validation to do. See dcgm_structs.DCGM_POLICY_VALID_* defines.
+
+ Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
+ '''
+
+ def Validate(self, validate):
+ runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+ runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
+ runDiagInfo.validate = validate
+ runDiagInfo.groupId = self._groupId
+
+ ret = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
+ runDiagInfo)
+ return ret
+
+ '''
+ Run a diagnostic on this group of GPUs.
+
+ diagLevel is the level of diagnostic desired. See dcgm_structs.DCGM_DIAG_LVL_* constants.
+
+ Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
+ '''
+
+ def RunDiagnostic(self, diagLevel):
+ ret = dcgm_agent.dcgmRunDiagnostic(self._dcgmHandle.handle,
+ self._groupId, diagLevel)
+ return ret
+
+ '''
+ Run a specific diagnostic test on this group of GPUs.
+ testName is the name of the specific test that should be invoked.
+ Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
+ '''
+
+ def RunSpecificTest(self, testName):
+ runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+ runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
+ for i in range(len(testName)):
+ runDiagInfo.testNames[0][i] = testName[i]
+ runDiagInfo.groupId = self._groupId
+ runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_NONE
+ response = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
+ runDiagInfo)
+ return response
+
+
+class DcgmGroupProfiling:
+
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
+ """
+
+ Parameters
+ ----------
+ dcgmHandle : DcgmHandle
+ groupId : int
+ dcgmGroup : DcgmGroup
+ """
+ self._dcgmHandle = dcgmHandle
+ self._groupId = groupId
+ self._dcgmGroup = dcgmGroup
+
+ def GetSupportedMetricGroups(self):
+ """
+ Get a list of the profiling metric groups available for this group of entities
+
+ :return: dcgm_structs.c_dcgmProfGetMetricGroups_v3
+ :throws: dcgm_structs.DCGMError on error
+ """
+ gpuIds = self._dcgmGroup.GetGpuIds()
+ if len(gpuIds) < 1:
+ raise dcgm_structs.DCGMError_ProfilingNotSupported
+
+ ret = dcgm_agent.dcgmProfGetSupportedMetricGroups(
+ self._dcgmHandle.handle, gpuIds[0])
+ return ret
+
+
+class DcgmGroup:
+ '''
+ Constructor.
+
+ Either groupId OR groupName must be provided as a parameter.
+ This will set which GPU group this object is bound to
+
+ groupId=DCGM_GROUP_ALL_GPUS creates a group with all GPUs. Passing an existing groupId will
+ not create an additional group.
+ If groupName is provided, an empty group (No GPUs) of name groupName will be created. This group
+ will be destroyed when this object goes out of scope or is deleted with del().
+ groupType is the type of group to create. See dcgm_structs.DCGM_GROUP_? constants.
+ '''
+
+ def __init__(self,
+ dcgmHandle,
+ groupId=None,
+ groupName=None,
+ groupType=dcgm_structs.DCGM_GROUP_EMPTY):
+ self._dcgmHandle = dcgmHandle
+
+ if groupId is None and groupName is None:
+ raise pydcgm.DcgmException(
+ "Either groupId or groupName is required")
+
+ if groupId is not None:
+ self._groupId = groupId
+ else:
+ self._groupId = dcgm_agent.dcgmGroupCreate(self._dcgmHandle.handle,
+ groupType, groupName)
+
+ #Create namespace classes
+ self.config = DcgmGroupConfig(self._dcgmHandle, self._groupId, self)
+ self.samples = DcgmGroupSamples(self._dcgmHandle, self._groupId, self)
+ self.health = DcgmGroupHealth(self._dcgmHandle, self._groupId, self)
+ self.policy = DcgmGroupPolicy(self._dcgmHandle, self._groupId, self)
+ self.discovery = DcgmGroupDiscovery(self._dcgmHandle, self._groupId,
+ self)
+ self.stats = DcgmGroupStats(self._dcgmHandle, self._groupId, self)
+ self.action = DcgmGroupAction(self._dcgmHandle, self._groupId, self)
+ self.profiling = DcgmGroupProfiling(self._dcgmHandle, self._groupId,
+ self)
+
+ '''
+ Remove this group from DCGM. This object will no longer be valid after this call.
+ '''
+
+ def Delete(self):
+ del self.config
+ self.config = None
+ del self.samples
+ self.samples = None
+ del self.health
+ self.health = None
+ del self.policy
+ self.policy = None
+ del self.discovery
+ self.discovery = None
+ del self.stats
+ self.stats = None
+ del self.action
+ self.action = None
+ del self.profiling
+ self.profiling = None
+
+ #Delete the group we created if we're not using the special all-GPU group
+ if self._groupId is not None and not self._IsGroupIdStatic():
+ ret = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
+ self._groupId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ self._groupId = None
+
+ '''
+ Private method to determine if our groupId is a predefined one
+ '''
+
+ def _IsGroupIdStatic(self):
+ if self._groupId == dcgm_structs.DCGM_GROUP_ALL_GPUS or \
+ self._groupId == dcgm_structs.DCGM_GROUP_ALL_NVSWITCHES:
+ return True
+ return False
+
+ '''
+ Add a GPU to this group
+
+ gpuId is the GPU ID to add to our group
+
+ Returns Nothing. Throws an exception on error
+ '''
+
+ def AddGpu(self, gpuId):
+ if self._IsGroupIdStatic():
+ raise pydcgm.DcgmException("Can't add a GPU to a static group")
+
+ ret = dcgm_agent.dcgmGroupAddDevice(self._dcgmHandle.handle,
+ self._groupId, gpuId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Add an entity to this group
+
+ entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
+ entityId is the entity to add to this group
+
+ Returns Nothing. Throws an exception on error
+ '''
+
+ def AddEntity(self, entityGroupId, entityId):
+ if self._IsGroupIdStatic():
+ raise pydcgm.DcgmException("Can't add an entity to a static group")
+
+ ret = dcgm_agent.dcgmGroupAddEntity(self._dcgmHandle.handle,
+ self._groupId, entityGroupId,
+ entityId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Remove a GPU from this group
+
+ gpuId is the GPU ID to remove from our group
+
+ Returns Nothing. Throws an exception on error
+ '''
+
+ def RemoveGpu(self, gpuId):
+ if self._IsGroupIdStatic():
+ raise pydcgm.DcgmException("Can't remove a GPU from a static group")
+
+ ret = dcgm_agent.dcgmGroupRemoveDevice(self._dcgmHandle.handle,
+ self._groupId, gpuId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Remove an entity from this group
+
+ entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
+ entityId is the entity to remove from this group
+
+ Returns Nothing. Throws an exception on error
+ '''
+
+ def RemoveEntity(self, entityGroupId, entityId):
+ if self._IsGroupIdStatic():
+ raise pydcgm.DcgmException(
+ "Can't remove an entity from a static group")
+
+ ret = dcgm_agent.dcgmGroupRemoveEntity(self._dcgmHandle.handle,
+ self._groupId, entityGroupId,
+ entityId)
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get an array of GPU ids that are part of this group
+
+ Note: this ignores non-GPU members of the group
+
+ Returns a list of GPU ids. Throws an exception on error
+ '''
+
+ def GetGpuIds(self):
+ groupInfo = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
+ self._groupId)
+ groupGpuIds = []
+ for i in range(groupInfo.count):
+ if groupInfo.entityList[i].entityGroupId != dcgm_fields.DCGM_FE_GPU:
+ continue
+ groupGpuIds.append(groupInfo.entityList[i].entityId)
+ return groupGpuIds
+
+ '''
+ Get an array of entities that are part of this group
+
+ Returns a list of c_dcgmGroupEntityPair_t structs. Throws an exception on error
+ '''
+
+ def GetEntities(self):
+ groupInfo = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
+ self._groupId)
+ entities = groupInfo.entityList[0:groupInfo.count]
+ return entities
+
+ '''
+ Get the groupId of this object
+
+ Returns our groupId
+ '''
+
+ def GetId(self):
+ return self._groupId
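+
+
+# Minimal usage sketch (an illustrative addition alongside this diff, not part
+# of the upstream DCGM bindings). Assumes an embedded host engine and at least
+# one GPU; the update frequency and keep-age values are examples.
+if __name__ == "__main__":
+    from model_analyzer.monitor.dcgm.DcgmFieldGroup import DcgmFieldGroup
+
+    handle = DcgmHandle()  # embedded host engine
+    group = DcgmGroup(handle, groupId=dcgm_structs.DCGM_GROUP_ALL_GPUS)
+    fieldGroup = DcgmFieldGroup(handle, "example",
+                                [dcgm_fields.DCGM_FI_DEV_GPU_UTIL])
+
+    # Watch at a 1-second interval, keep 1 hour of data, no per-field sample cap
+    group.samples.WatchFields(fieldGroup, 1000000, 3600.0, 0)
+    handle.GetSystem().UpdateAllFields(1)
+
+    latest = group.samples.GetLatest(fieldGroup)
+    for gpuId in group.GetGpuIds():
+        print(gpuId,
+              latest.values[gpuId][dcgm_fields.DCGM_FI_DEV_GPU_UTIL][0].value)
+    handle.Shutdown()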
diff --git a/model_analyzer/monitor/dcgm/DcgmHandle.py b/model_analyzer/monitor/dcgm/DcgmHandle.py
new file mode 100644
index 000000000..0234318ed
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmHandle.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+
+
+class DcgmHandle:
+ '''
+ Class to encapsulate a handle to DCGM and global methods to control + query the host engine
+ '''
+
+ def __init__(self,
+ handle=None,
+ ipAddress=None,
+ opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO,
+ persistAfterDisconnect=False,
+ unixSocketPath=None,
+ timeoutMs=0):
+ '''
+ Constructor
+
+ handle is an existing handle from dcgmInit(). Pass None if you want this object to handle DCGM initialization for you
+ ipAddress is the host to connect to. None = start embedded host engine
+ opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the host engine should run (embedded mode only)
+        persistAfterDisconnect (TCP/IP connections only) is whether the host engine should persist all of our watches
+ after we disconnect. 1=persist our watches. 0=clean up after our connection
+        unixSocketPath is a path on the local filesystem to a Unix domain socket that the host engine is listening on.
+ This option is mutually exclusive with ipAddress
+ timeoutMs is how long to wait for TCP/IP or Unix domain connections to establish in ms. 0=Default timeout (5000ms)
+ '''
+ self._handleCreated = False
+ self._persistAfterDisconnect = persistAfterDisconnect
+
+ if handle is not None:
+ self.handle = handle
+ return
+
+ self._ipAddress = ipAddress
+
+ #Can't provide both unix socket and ip address
+ if ipAddress is not None and unixSocketPath is not None:
+ raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)
+
+ #Initialize the DCGM client library
+ dcgm_structs._dcgmInit()
+ dcgm_agent.dcgmInit(
+ ) #Not harmful to call this multiple times in a process
+
+ #If neither ipAddress nor unixSocketPath are present, start an embedded host engine
+ if ipAddress is None and unixSocketPath is None:
+ self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
+ self.isEmbedded = True
+ self._handleCreated = True
+ return
+
+ #Set up connection parameters. We're connecting to something
+ connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
+ connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
+ connectParams.timeoutMs = timeoutMs
+ if self._persistAfterDisconnect:
+ connectParams.persistAfterDisconnect = 1
+ else:
+ connectParams.persistAfterDisconnect = 0
+
+ if ipAddress is not None:
+ connectToAddress = ipAddress
+ connectParams.addressIsUnixSocket = 0
+ else:
+ connectToAddress = unixSocketPath
+ connectParams.addressIsUnixSocket = 1
+
+ self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
+ self.isEmbedded = False
+ self._handleCreated = True
+
+ def __del__(self):
+ '''
+ Destructor
+ '''
+ if self._handleCreated:
+ self.Shutdown()
+
+ def GetSystem(self):
+ '''
+ Get a DcgmSystem instance for this handle
+ '''
+ return pydcgm.DcgmSystem(self)
+
+ def __StopDcgm__(self):
+ '''
+ Shuts down either the hostengine or the embedded server
+ '''
+ if self.isEmbedded:
+ dcgm_agent.dcgmStopEmbedded(self.handle)
+ else:
+ dcgm_agent.dcgmDisconnect(self.handle)
+
+ def Shutdown(self):
+ '''
+ Shutdown DCGM hostengine
+ '''
+ if not self._handleCreated:
+ return
+
+ try:
+ self.__StopDcgm__()
+ except AttributeError as e:
+ # Due to multi-threading, sometimes this is called after the modules have been unloaded, making
+ # dcgm_agent effectively NoneType and resulting in this error being thrown.
+ pass
+
+ self._handleCreated = False
+ self.handle = None
+
+ @staticmethod
+ def Unload():
+ '''
+ Unload DCGM, removing any memory it is pointing at. Use this if you really
+ want DCGM gone from your process. Shutdown() only closes the connection/embedded host engine
+        that was created in __init__().
+ '''
+ dcgm_agent.dcgmShutdown()
+
+ def GetIpAddress(self):
+ '''
+ Returns the IP address associated with this handle. None=embedded connection
+ '''
+ return self._ipAddress
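+
+
+# Minimal usage sketch (an illustrative addition alongside this diff, not part
+# of the upstream DCGM bindings). Shows the two connection modes described in
+# the constructor docstring; the TCP address is an example.
+if __name__ == "__main__":
+    # Connect to a remote (or local) nv-hostengine over TCP/IP
+    remoteHandle = DcgmHandle(ipAddress="127.0.0.1")
+    print("Connected to %s" % remoteHandle.GetIpAddress())
+    remoteHandle.Shutdown()
+
+    # Start an embedded host engine inside this process
+    embeddedHandle = DcgmHandle(opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO)
+    print("Embedded engine running: %s" % embeddedHandle.isEmbedded)
+    embeddedHandle.Shutdown()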
diff --git a/model_analyzer/monitor/dcgm/DcgmJsonReader.py b/model_analyzer/monitor/dcgm/DcgmJsonReader.py
new file mode 100644
index 000000000..9c2ce187e
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmJsonReader.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
+from json import dumps as toJson
+from os import environ
+from socket import socket, AF_INET, SOCK_DGRAM
+from time import sleep
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import logging
+
+
+class DcgmJsonReader(DcgmReader):
+
+ ###########################################################################
+ def ConvertFieldIdToTag(self, fieldId):
+ return self.m_fieldIdToInfo[fieldId].tag
+
+ ###########################################################################
+ def PrepareJson(self, gpuId, obj):
+ '''
+        Receive an object with measurements and turn it into equivalent JSON. We
+ add the GPU UUID first.
+ '''
+ uuid = self.m_gpuIdToUUId[gpuId]
+ # This mutates the original object, but it shouldn't be a problem here
+ obj['gpu_uuid'] = uuid
+ return toJson(obj)
+
+ ###########################################################################
+ def CustomDataHandler(self, fvs):
+ for gpuId in list(fvs.keys()):
+ # We don't need the keys because each value has a `fieldId`
+ # So just get the values
+ gpuData = list(fvs[gpuId].values())
+
+ # Get the values from FV (which is a list of values)
+ valuesListOfLists = [datum.values for datum in gpuData]
+
+ # We only want the last measurement
+ lastValueList = [l[-1] for l in valuesListOfLists]
+
+ # Turn FV into a conventional Python Object which can be converted to JSON
+ outObject = {
+ self.ConvertFieldIdToTag(i.fieldId): i.value
+ for i in lastValueList
+ }
+ outJson = self.PrepareJson(gpuId, outObject)
+
+ self.CustomJsonHandler(outJson)
+
+ ###########################################################################
+ def CustomJsonHandler(self, outJson):
+ '''
+        This method should be overridden by subclasses to handle the JSON objects
+ received.
+ '''
+        logging.warning('CustomJsonHandler has not been overridden')
+ logging.info(outJson)
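+
+
+# Minimal usage sketch (an illustrative addition alongside this diff, not part
+# of the upstream DCGM bindings). Subclassing and overriding CustomJsonHandler
+# is the intended extension point. This assumes DcgmReader's Process() polling
+# method from the upstream bindings (not shown in this diff) and an
+# nv-hostengine reachable on localhost; the field list and 5-second poll are
+# examples.
+if __name__ == "__main__":
+
+    class PrintingJsonReader(DcgmJsonReader):
+
+        def CustomJsonHandler(self, outJson):
+            print(outJson)
+
+    reader = PrintingJsonReader(fieldIds=[
+        dcgm_fields.DCGM_FI_DEV_GPU_TEMP, dcgm_fields.DCGM_FI_DEV_POWER_USAGE
+    ])
+    reader.Init()
+    try:
+        for _ in range(3):
+            reader.Process()  # queries DCGM and invokes CustomJsonHandler
+            sleep(5)
+    finally:
+        reader.Shutdown()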
diff --git a/model_analyzer/monitor/dcgm/DcgmReader.py b/model_analyzer/monitor/dcgm/DcgmReader.py
new file mode 100644
index 000000000..2c32a1f91
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmReader.py
@@ -0,0 +1,623 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+import signal, os
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import threading
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import sys
+import logging
+
+defaultFieldIds = [
+ dcgm_fields.DCGM_FI_DEV_POWER_USAGE, dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
+ dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_FB_FREE, dcgm_fields.DCGM_FI_DEV_FB_USED,
+ dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
+ dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_MEM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
+ dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
+ dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
+ dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
+]
+
+
+def entity_group_id_to_string(entityGroupId):
+ if entityGroupId == dcgm_fields.DCGM_FE_GPU:
+ return 'GPU'
+ elif entityGroupId == dcgm_fields.DCGM_FE_VGPU:
+ return 'VGPU'
+ elif entityGroupId == dcgm_fields.DCGM_FE_SWITCH:
+ return 'NVSWITCH'
+ elif entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
+ return 'GPU INSTANCE'
+ elif entityGroupId == dcgm_fields.DCGM_FE_GPU_CI:
+ return 'COMPUTE INSTANCE'
+ elif entityGroupId == dcgm_fields.DCGM_FE_LINK:
+ return 'LINK'
+ else:
+ return ''
+
+
+class DcgmReader(object):
+ ###########################################################################
+ '''
+ This function can be implemented as a callback in the class that inherits from DcgmReader
+ to handle each field individually.
+    By default, it prints a string with the gpu, field tag, and value.
+ @params:
+ gpuId : the id of the GPU this field is reporting on
+ fieldId : the id of the field (ignored by default, may be useful for children)
+ fieldTag : the string representation of the field id
+ val : the value class that comes from DCGM (v.value is the value for the field)
+ '''
+
+ def CustomFieldHandler(self, gpuId, fieldId, fieldTag, val):
+ print("GPU %s field %s=%s" % (str(gpuId), fieldTag, str(val.value)))
+
+ ###########################################################################
+ '''
+ This function can be implemented as a callback in the class that inherits from DcgmReader
+ to handle each field individually.
+    By default, it prints a string with the entity, field tag, and value.
+ @params:
+ entityGroupId : the type of entity this field is reporting on
+ entityId : the id of the entity this field is reporting on
+ fieldId : the id of the field (ignored by default, may be useful for children)
+ fieldTag : the string representation of the field id
+ val : the value class that comes from DCGM (v.value is the value for the field)
+ '''
+
+ def CustomFieldHandler_v2(self, entityGroupId, entityId, fieldId, fieldTag,
+ val):
+ print("%s %s field %s=%s" % (entity_group_id_to_string(entityGroupId),
+ str(entityId), fieldTag, str(val.value)))
+
+ ###########################################################################
+ '''
+ This function can be implemented as a callback in the class that inherits from DcgmReader
+ to handle all of the data queried from DCGM.
+ By default, it will simply print the field tags and values for each GPU
+ @params:
+ fvs : Data in the format entityGroupId -> entityId -> values (dictionary of dictionaries)
+ '''
+
+ def CustomDataHandler_v2(self, fvs):
+ for entityGroupId in list(fvs.keys()):
+ entityGroup = fvs[entityGroupId]
+
+ for entityId in list(entityGroup.keys()):
+ entityFv = entityGroup[entityId]
+ for fieldId in list(entityFv.keys()):
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ val = entityFv[fieldId][-1]
+
+ if val.isBlank:
+ continue
+
+ fieldTag = self.m_fieldIdToInfo[fieldId].tag
+
+ self.CustomFieldHandler_v2(entityGroupId, entityId, fieldId,
+ fieldTag, val)
+
+ ###########################################################################
+ '''
+ This function can be implemented as a callback in the class that inherits from DcgmReader
+ to handle all of the data queried from DCGM.
+ By default, it will simply print the field tags and values for each GPU
+ @params:
+ fvs : Dictionary with gpuID as key and values as Value
+ '''
+
+ def CustomDataHandler(self, fvs):
+ for gpuId in list(fvs.keys()):
+ gpuFv = fvs[gpuId]
+
+ for fieldId in list(gpuFv.keys()):
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ val = gpuFv[fieldId][-1]
+
+ if val.isBlank:
+ continue
+
+ fieldTag = self.m_fieldIdToInfo[fieldId].tag
+
+ self.CustomFieldHandler(gpuId, fieldId, fieldTag, val)
+
+ ###########################################################################
+ def SetupGpuIdUUIdMappings(self):
+ '''
+ Populate the m_gpuIdToUUId map
+ '''
+
+ gpuIds = self.m_dcgmGroup.GetGpuIds()
+ for gpuId in gpuIds:
+ gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
+ self.m_gpuIdToUUId[gpuId] = gpuInfo.identifiers.uuid
+
+ ###########################################################################
+ '''
+ Constructor
+ @params:
+    hostname : Address:port of the host to connect to. Defaults to localhost
+    fieldIds : List of the field ids to publish. If it isn't specified, our default list is used.
+    updateFrequency : Frequency of update in microseconds. Defaults to 10 seconds or 10000000 microseconds
+ maxKeepAge : Max time to keep data from NVML, in seconds. Default is 3600.0 (1 hour)
+ ignoreList : List of the field ids we want to query but not publish.
+ gpuIds : List of GPU IDs to monitor. If not provided, DcgmReader will monitor all GPUs on the system
+ fieldIntervalMap: Map of intervals to list of field numbers to monitor. Takes precedence over fieldIds and updateFrequency if not None.
+ '''
+
+ def __init__(self,
+ hostname='localhost',
+ fieldIds=None,
+ updateFrequency=10000000,
+ maxKeepAge=3600.0,
+ ignoreList=None,
+ fieldGroupName='dcgm_fieldgroupData',
+ gpuIds=None,
+ entities=None,
+ fieldIntervalMap=None):
+ fieldIds = fieldIds or defaultFieldIds
+ ignoreList = ignoreList or []
+ self.m_dcgmHostName = hostname
+ self.m_updateFreq = updateFrequency # default / redundant
+
+ self.m_fieldGroupName = fieldGroupName
+ self.m_publishFields = {}
+
+ if fieldIntervalMap is not None:
+ self.m_publishFields = fieldIntervalMap
+ else:
+ self.m_publishFields[self.m_updateFreq] = fieldIds
+
+ self.m_requestedGpuIds = gpuIds
+ self.m_requestedEntities = entities
+
+ self.m_dcgmIgnoreFields = ignoreList #Fields not to publish
+ self.m_maxKeepAge = maxKeepAge
+ self.m_dcgmHandle = None
+ self.m_dcgmSystem = None
+ self.m_dcgmGroup = None
+ self.m_closeHandle = False
+
+ self.m_gpuIdToBusId = {} #GpuID => PCI-E busId string
+        self.m_gpuIdToUUId = {} #GpuId => UUID string
+ self.m_fieldIdToInfo = {} #FieldId => dcgm_fields.dcgm_field_meta_t
+ self.m_lock = threading.Lock(
+ ) #DCGM connection start-up/shutdown is not thread safe. Just lock pessimistically
+ self.m_debug = False
+
+ # For GetAllSinceLastCall* calls. We cache the value for these objects
+ # after first retrieval, so initializing them to None lets us know if
+ # we've made a first retrieval. The first retrieval is based on a
+ # "since" timestamp of 0, so it gets data in which we are not
+        # interested. The second retrieval gets data since the first one, in
+ # which we ARE interested. The practical upshot of this is that actual
+ # reporting of data is delayed one collectd sampling interval -- as if
+ # the sampling was actually started one collectd sampling interval
+ # later. We expect this is not an issue.
+ self.fvs = None
+ self.dfvc = None
+ self.dfvec = None
+
+ ###########################################################################
+ '''
+ Define what should happen to this object at the beginning of a with
+ block. In this case, nothing more is needed since the constructor should've
+ been called.
+ '''
+
+ def __enter__(self):
+ return self
+
+ ###########################################################################
+ '''
+ Define the cleanup
+ '''
+
+ def __exit__(self, type, value, traceback):
+ self.Shutdown()
+
+ ###########################################################################
+ '''
+    This function initializes DCGM from the specified directory and connects to
+ the host engine.
+ '''
+
+ def InitWrapped(self, path=None):
+ dcgm_structs._dcgmInit(libDcgmPath=path)
+ self.Reconnect()
+
+ ###########################################################################
+ '''
+    This function tries to connect to the hostengine and calls InitWrapped to
+    initialize DCGM.
+ '''
+
+ def Init(self, libpath=None):
+ with self.m_lock:
+ try:
+ self.InitWrapped(path=libpath)
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+ self.LogError("Can't connect to nv-hostengine. Is it down?")
+ self.SetDisconnected()
+
+ ###########################################################################
+ '''
+ Delete the DCGM group, DCGM system and DCGM handle and clear the attributes
+ on shutdown.
+ '''
+
+ def SetDisconnected(self):
+ #Force destructors since DCGM currently doesn't support more than one client connection per process
+ if self.m_dcgmGroup is not None:
+ del (self.m_dcgmGroup)
+ self.m_dcgmGroup = None
+ if self.m_dcgmSystem is not None:
+ del (self.m_dcgmSystem)
+ self.m_dcgmSystem = None
+ if self.m_dcgmHandle is not None:
+ del (self.m_dcgmHandle)
+ self.m_dcgmHandle = None
+
+ ##########################################################################
+ '''
+ This function calls the SetDisconnected function which disconnects from
+ DCGM and clears DCGM handle and DCGM group.
+ '''
+
+ def Shutdown(self):
+ with self.m_lock:
+ if self.m_closeHandle == True:
+ self.SetDisconnected()
+
+ ############################################################################
+ '''
+ Turns debugging output on
+ '''
+
+ def AddDebugOutput(self):
+ self.m_debug = True
+
+ ############################################################################
+    '''
+    Initialize the DCGM system and group objects from the existing handle, then
+    set up GPU mappings, field metadata, and field watches.
+    '''
+
+ def InitializeFromHandle(self):
+ self.m_dcgmSystem = self.m_dcgmHandle.GetSystem()
+
+ if not self.m_requestedGpuIds and not self.m_requestedEntities:
+ self.m_dcgmGroup = self.m_dcgmSystem.GetDefaultGroup()
+ else:
+ groupName = "dcgmreader_%d" % os.getpid()
+
+ if self.m_requestedGpuIds:
+ self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithGpuIds(
+ groupName, self.m_requestedGpuIds)
+ if self.m_requestedEntities:
+ for entity in self.m_requestedEntities:
+ self.m_dcgmGroup.AddEntity(entity.entityGroupId,
+ entity.entityId)
+ else:
+ self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithEntities(
+ groupName, self.m_requestedEntities)
+
+ self.SetupGpuIdBusMappings()
+ self.SetupGpuIdUUIdMappings()
+ self.GetFieldMetadata()
+ self.AddFieldWatches()
+
+ ############################################################################
+ '''
+    Have DcgmReader use, but not own, an existing handle. Currently used by the unit tests.
+ '''
+
+ def SetHandle(self, handle):
+ self.m_dcgmHandle = pydcgm.DcgmHandle(handle)
+ self.InitializeFromHandle()
+
+ ############################################################################
+ '''
+    Reconnect checks whether a connection handle is present. If the handle is
+    None, it creates the handle and gets the default DCGM group. It then maps
+    gpuIds to bus IDs, fetches the metadata of the requested field IDs and adds
+    watches for those field IDs.
+ '''
+
+ def Reconnect(self):
+ if self.m_dcgmHandle is not None:
+ return
+
+ self.LogDebug("Connection handle is None. Trying to reconnect")
+
+ self.m_dcgmHandle = pydcgm.DcgmHandle(
+ None, self.m_dcgmHostName, dcgm_structs.DCGM_OPERATION_MODE_AUTO)
+ self.m_closeHandle = True
+
+ self.LogDebug("Connected to nv-hostengine")
+
+ self.InitializeFromHandle()
+
+ ###########################################################################
+ '''
+    Populate the m_gpuIdToBusId map, which maps each gpuId to its PCI-E bus ID.
+ '''
+
+ def SetupGpuIdBusMappings(self):
+ self.m_gpuIdToBusId = {}
+
+ gpuIds = self.m_dcgmGroup.GetGpuIds()
+ for gpuId in gpuIds:
+ gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
+ self.m_gpuIdToBusId[gpuId] = gpuInfo.identifiers.pciBusId
+
+ ###########################################################################
+ '''
+    Add watches to the fields requested in the constructor and trigger an
+    initial field value update.
+ '''
+
+ def AddFieldWatches(self):
+ maxKeepSamples = 0 #No limit. Handled by m_maxKeepAge
+ for interval, fieldGroup in self.m_fieldGroups.items():
+ self.LogDebug("AddWatchFields: interval = " + str(interval) + "\n")
+ self.m_dcgmGroup.samples.WatchFields(fieldGroup, interval,
+ self.m_maxKeepAge,
+ maxKeepSamples)
+ self.m_dcgmSystem.UpdateAllFields(1)
+ self.LogDebug("AddWatchFields exit\n")
+
+ ###########################################################################
+ '''
+    If a field group with our name already exists, delete it and create a new
+    field group with the requested fields. Then the metadata for each field is
+    looked up from its ID.
+ '''
+
+ def GetFieldMetadata(self):
+ self.m_fieldIdToInfo = {}
+ self.m_fieldGroups = {}
+ self.m_fieldGroup = None
+ allFieldIds = []
+
+ # Initialize groups for all field intervals.
+ self.LogDebug("GetFieldMetaData:\n")
+
+ intervalIndex = 0
+ for interval, fieldIds in self.m_publishFields.items():
+ self.LogDebug("sampling interval = " + str(interval) + ":\n")
+ for fieldId in fieldIds:
+ self.LogDebug(" fieldId: " + str(fieldId) + "\n")
+
+ intervalIndex += 1
+ fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
+ findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
+ fieldGroupName)
+ self.LogDebug("fieldGroupName: " + fieldGroupName + "\n")
+
+ # Remove our field group if it exists already
+ if findByNameId is not None:
+                self.LogDebug("fieldGroupId: " + str(findByNameId) + "\n")
+ delFieldGroup = pydcgm.DcgmFieldGroup(
+ dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId)
+ delFieldGroup.Delete()
+ del (delFieldGroup)
+
+ self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
+ self.m_dcgmHandle, fieldGroupName, fieldIds)
+
+ for fieldId in fieldIds:
+ if fieldId not in allFieldIds:
+ allFieldIds += [fieldId]
+
+ self.m_fieldIdToInfo[
+ fieldId] = self.m_dcgmSystem.fields.GetFieldById(fieldId)
+                if self.m_fieldIdToInfo[fieldId] == 0 or self.m_fieldIdToInfo[
+                        fieldId] is None:
+ self.LogError(
+ "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
+ % (fieldId))
+ raise dcgm_structs.DCGMError(
+ dcgm_structs.DCGM_ST_UNKNOWN_FIELD)
+ # Initialize a field group of ALL fields.
+ fieldGroupName = self.m_fieldGroupName
+ findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)
+
+ # Remove our field group if it exists already
+ if findByNameId is not None:
+ delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
+ fieldGroupId=findByNameId)
+ delFieldGroup.Delete()
+ del (delFieldGroup)
+
+ self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
+ fieldGroupName, allFieldIds)
+
+ ###########################################################################
+ '''
+    This function attempts to connect to DCGM and calls the CustomDataHandler
+    (or CustomDataHandler_v2 when entities were requested) implemented by the
+    child class, passing it the field values collected since the last call.
+    The values are a dictionary keyed by GPU (or entity) containing the data
+    for each watched field ID.
+ '''
+
+ def Process(self):
+ with self.m_lock:
+ try:
+ self.Reconnect()
+
+ # The first call just clears the collection set.
+
+ if not self.m_requestedEntities:
+ self.dfvc = self.m_dcgmGroup.samples.GetAllSinceLastCall(
+ self.dfvc, self.m_fieldGroup)
+ self.CustomDataHandler(self.dfvc.values)
+ self.dfvc.EmptyValues()
+ else:
+ self.dfvec = self.m_dcgmGroup.samples.GetAllSinceLastCall_v2(
+ self.dfvec, self.m_fieldGroup)
+ self.CustomDataHandler_v2(self.dfvec.values)
+ self.dfvec.EmptyValues()
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+ self.LogError("Can't connect to nv-hostengine. Is it down?")
+ self.SetDisconnected()
+
+ ###########################################################################
+ def LogInfo(self, msg):
+ logging.info(msg)
+
+ ###########################################################################
+ def LogDebug(self, msg):
+ logging.debug(msg)
+
+ ###########################################################################
+ def LogError(self, msg):
+ logging.error(msg)
+
+ ###########################################################################
+ '''
+    This function gets each value as a dictionary of dictionaries. The returned
+    dictionary maps each GPU ID to a dictionary of its field values. Each field
+    value dictionary maps the field name (or the field ID, depending on the
+    mapById parameter) to the latest value.
+ '''
+
+ def GetLatestGpuValuesAsDict(self, mapById):
+ systemDictionary = {}
+
+ with self.m_lock:
+ try:
+ self.Reconnect()
+ fvs = self.m_dcgmGroup.samples.GetLatest(
+ self.m_fieldGroup).values
+ for gpuId in list(fvs.keys()):
+ systemDictionary[gpuId] = {
+ } # initialize the gpu's dictionary
+ gpuFv = fvs[gpuId]
+
+ for fieldId in list(gpuFv.keys()):
+ val = gpuFv[fieldId][-1]
+
+ if val.isBlank:
+ continue
+
+                        if mapById == False:
+                            fieldTag = self.m_fieldIdToInfo[fieldId].tag
+                            systemDictionary[gpuId][fieldTag] = val.value
+                        else:
+                            systemDictionary[gpuId][fieldId] = val.value
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+ self.LogError(
+                    "Can't connect to nv-hostengine. Please verify that it is running."
+ )
+ self.SetDisconnected()
+
+ return systemDictionary
+
+ ###########################################################################
+ '''
+    This function gets values as a dictionary of dictionaries of lists. The
+    returned dictionary maps each GPU ID to a dictionary of its field value
+    lists. Each field value dictionary maps the field name (or the field ID,
+    depending on the mapById parameter) to the list of values recorded for
+    that field since the last retrieval.
+ '''
+
+ def GetAllGpuValuesAsDictSinceLastCall(self, mapById):
+ systemDictionary = {}
+
+ with self.m_lock:
+ try:
+ self.Reconnect()
+ report = self.fvs is not None
+ self.fvs = self.m_dcgmGroup.samples.GetAllSinceLastCall(
+ self.fvs, self.m_fieldGroup)
+ if report:
+ for gpuId in list(self.fvs.values.keys()):
+ systemDictionary[gpuId] = {
+ } # initialize the gpu's dictionary
+ gpuFv = self.fvs.values[gpuId]
+
+ for fieldId in list(gpuFv.keys()):
+ for val in gpuFv[fieldId]:
+ if val.isBlank:
+ continue
+
+ if mapById == False:
+ fieldTag = self.m_fieldIdToInfo[fieldId].tag
+ if not fieldTag in systemDictionary[gpuId]:
+ systemDictionary[gpuId][fieldTag] = []
+
+ systemDictionary[gpuId][fieldTag].append(
+ val)
+ else:
+ if not fieldId in systemDictionary[gpuId]:
+ systemDictionary[gpuId][fieldId] = []
+ systemDictionary[gpuId][fieldId].append(val)
+ except dcgm_structs.dcgmExceptionClass(
+ dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+ self.LogError(
+                    "Can't connect to nv-hostengine. Please verify that it is running."
+ )
+ self.SetDisconnected()
+
+ if self.fvs is not None:
+ self.fvs.EmptyValues()
+
+ return systemDictionary
+
+ ###########################################################################
+ def GetLatestGpuValuesAsFieldIdDict(self):
+ return self.GetLatestGpuValuesAsDict(True)
+
+ ###########################################################################
+ def GetLatestGpuValuesAsFieldNameDict(self):
+ return self.GetLatestGpuValuesAsDict(False)
+
+ ###########################################################################
+ def GetAllGpuValuesAsFieldIdDictSinceLastCall(self):
+ return self.GetAllGpuValuesAsDictSinceLastCall(True)
+
+ ###########################################################################
+ def GetAllGpuValuesAsFieldNameDictSinceLastCall(self):
+ return self.GetAllGpuValuesAsDictSinceLastCall(False)
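+
+
+###############################################################################
+# Illustrative usage sketch (not part of the upstream DCGM bindings). A client
+# typically subclasses DcgmReader, overrides CustomDataHandler, and drives the
+# collection loop itself; MyReader and the chosen field are hypothetical:
+#
+#     class MyReader(DcgmReader):
+#         def CustomDataHandler(self, fvs):
+#             # fvs maps each GPU to the samples collected since the last call
+#             for gpuId, fields in fvs.items():
+#                 for field, samples in fields.items():
+#                     print(gpuId, field, samples[-1].value)
+#
+#     with MyReader(fieldIds=[dcgm_fields.DCGM_FI_DEV_GPU_UTIL]) as reader:
+#         reader.Init()
+#         reader.Process()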
diff --git a/model_analyzer/monitor/dcgm/DcgmStatus.py b/model_analyzer/monitor/dcgm/DcgmStatus.py
new file mode 100644
index 000000000..f0a5e3a7d
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmStatus.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+
+
+class DcgmStatus:
+
+ def __init__(self):
+ self.handle = dcgm_agent.dcgmStatusCreate()
+ self.errors = []
+
+ def __del__(self):
+ dcgm_agent.dcgmStatusDestroy(self.handle)
+
+ '''
+ Take any errors stored in our handle and update self.errors with them
+ '''
+
+ def UpdateErrors(self):
+ errorCount = dcgm_agent.dcgmStatusGetCount(self.handle)
+ if errorCount < 1:
+ return
+
+ for i in range(errorCount):
+ self.errors.append(dcgm_agent.dcgmStatusPopError(self.handle))
+
+ '''
+ Throw an exception if any errors are stored in our status handle
+
+ The exception text will contain all of the errors
+ '''
+
+ def ThrowExceptionOnErrors(self):
+ #Make sure we've captured all errors before looking at them
+ self.UpdateErrors()
+
+ if len(self.errors) < 1:
+ return
+
+ errorString = "Errors: "
+ for value in self.errors:
+ errorString += "\"%s\"" % value
+ raise dcgm_structs.DCGMError(value.status)
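+
+
+# Illustrative usage sketch (not part of the upstream bindings): a DcgmStatus
+# handle can be passed to APIs such as dcgm_agent.dcgmConfigSet so that
+# per-device errors are collected and can be raised afterwards; handle, groupId
+# and config are assumed to exist:
+#
+#     status = DcgmStatus()
+#     dcgm_agent.dcgmConfigSet(handle, groupId, config, status.handle)
+#     status.ThrowExceptionOnErrors()  # raises DCGMError if anything failed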
diff --git a/model_analyzer/monitor/dcgm/DcgmSystem.py b/model_analyzer/monitor/dcgm/DcgmSystem.py
new file mode 100644
index 000000000..6df2759f7
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/DcgmSystem.py
@@ -0,0 +1,412 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import ctypes
+
+
+class DcgmSystemDiscovery:
+ '''
+ Constructor
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ '''
+    Get all IDs of the GPUs that DCGM knows about. To get only GPUs that DCGM supports,
+ use GetAllSupportedGpuIds().
+
+ Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
+ '''
+
+ def GetAllGpuIds(self):
+ gpuIds = dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)
+ return gpuIds
+
+ '''
+    Get all IDs of the GPUs that DCGM supports. This will exclude unsupported
+ GPUs
+
+ Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
+ '''
+
+ def GetAllSupportedGpuIds(self):
+ gpuIds = dcgm_agent.dcgmGetAllSupportedDevices(self._dcgmHandle.handle)
+ return gpuIds
+
+ '''
+ Get some basic GPU attributes for a given GPU ID.
+
+ Returns a dcgm_structs.c_dcgmDeviceAttributes_v3() object for the given GPU
+ '''
+
+ def GetGpuAttributes(self, gpuId):
+ return dcgm_agent.dcgmGetDeviceAttributes(self._dcgmHandle.handle,
+ gpuId)
+
+ '''
+ Get topology information for a given GPU ID
+
+ Returns a dcgm_structs.c_dcgmDeviceTopology_v1 structure representing the topology for the given GPU
+ '''
+
+ def GetGpuTopology(self, gpuId):
+ return dcgm_agent.dcgmGetDeviceTopology(self._dcgmHandle.handle, gpuId)
+
+ '''
+ Get all entityIds of the entities that DCGM knows about.
+
+ entityGroupId IN: DCGM_FE_? constant of the entity group to fetch the entities of
+    onlySupported IN: Boolean as to whether to fetch only entities that are supported by DCGM (True)
+                      or all entity IDs (False)
+
+ Returns an array of entity IDs. Each of these can be passed to DcgmGroup::AddEntity()
+ '''
+
+ def GetEntityGroupEntities(self, entityGroupId, onlySupported):
+ flags = 0
+ if onlySupported:
+ flags |= dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED
+ entityIds = dcgm_agent.dcgmGetEntityGroupEntities(
+ self._dcgmHandle.handle, entityGroupId, flags)
+ return entityIds
+
+ '''
+ Get the status of all of the NvLink links in the system.
+
+ Returns a dcgm_structs.c_dcgmNvLinkStatus_v3 object.
+ '''
+
+ def GetNvLinkLinkStatus(self):
+ return dcgm_agent.dcgmGetNvLinkLinkStatus(self._dcgmHandle.handle)
+
+ '''
+ From a bitmask of input gpu ids, return a bitmask of numGpus GPUs which identifies the topologically
+ closest GPUs to use for a single job. DCGM will consider CPU affinities and NVLink connection speeds
+ to determine the closest.
+ hintFlags can instruct DCGM to consider GPU health or not. By default, unhealthy GPUs are excluded from
+ consideration.
+ '''
+
+ def SelectGpusByTopology(self, inputGpuIds, numGpus, hintFlags):
+ return dcgm_agent.dcgmSelectGpusByTopology(self._dcgmHandle.handle,
+ inputGpuIds, numGpus,
+ hintFlags)
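+
+    # Illustrative example (not part of the upstream bindings): to ask DCGM for
+    # the two topologically closest GPUs out of GPUs 0-3, build the input
+    # bitmask from the GPU IDs (hintFlags=0 keeps the default behavior; the
+    # values are hypothetical):
+    #
+    #     inputGpuIds = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)
+    #     bestPairMask = system.discovery.SelectGpusByTopology(inputGpuIds, 2, 0)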
+
+
+class DcgmSystemIntrospect:
+ '''
+ Class to access the system-wide introspection modules of DCGM
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._handle = dcgmHandle
+ self.memory = DcgmSystemIntrospectMemory(dcgmHandle)
+ self.cpuUtil = DcgmSystemIntrospectCpuUtil(dcgmHandle)
+
+ def UpdateAll(self, waitForUpdate=True):
+ dcgm_agent.dcgmIntrospectUpdateAll(self._handle.handle, waitForUpdate)
+
+
+class DcgmSystemIntrospectMemory:
+ '''
+ Class to access information about the memory usage of DCGM itself
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ def GetForHostengine(self, waitIfNoData=True):
+ '''
+ Retrieve the total amount of virtual memory that the hostengine process is currently using.
+ This measurement represents both the resident set size (what is currently in RAM) and
+ the swapped memory that belongs to the process.
+
+ waitIfNoData: wait for metadata to be updated if it's not available
+
+ Returns a dcgm_structs.c_dcgmIntrospectMemory_v1 object
+ Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False
+ '''
+ return dcgm_agent.dcgmIntrospectGetHostengineMemoryUsage(
+ self._dcgmHandle.handle, waitIfNoData)
+
+
+class DcgmSystemIntrospectCpuUtil:
+ '''
+ Class to access information about the CPU Utilization of DCGM
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ def GetForHostengine(self, waitIfNoData=True):
+ '''
+ Get the current CPU Utilization of the hostengine process.
+
+ waitIfNoData: wait for metadata to be updated if it's not available
+
+ Returns a dcgm_structs.c_dcgmIntrospectCpuUtil_v1 object
+ Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False
+ '''
+ return dcgm_agent.dcgmIntrospectGetHostengineCpuUtilization(
+ self._dcgmHandle.handle, waitIfNoData)
+
+
+'''
+Class to encapsulate DCGM field-metadata requests
+'''
+
+
+class DcgmSystemFields:
+
+ def GetFieldById(self, fieldId):
+ '''
+ Get a field's metadata by its dcgm_fields.DCGM_FI_* field ID
+
+ fieldId: dcgm_fields.DCGM_FI_* field ID of the field
+
+ Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
+ '''
+ return dcgm_fields.DcgmFieldGetById(fieldId)
+
+ def GetFieldByTag(self, tag):
+ '''
+ Get a field's metadata by its tag name. Ex: 'brand'
+
+ tag: Tag name of the field
+
+ Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
+ '''
+ return dcgm_fields.DcgmFieldGetByTag(tag)
+
+
+'''
+Class to encapsulate DCGM module management and introspection
+'''
+
+
+class DcgmSystemModules:
+ '''
+ Constructor
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ '''
+ Denylist a module from being loaded by DCGM.
+
+ moduleId a dcgm_structs.dcgmModuleId* ID of the module to denylist
+
+ Returns: Nothing.
+ Raises a DCGM_ST_IN_USE exception if the module was already loaded
+ '''
+
+ def Denylist(self, moduleId):
+ dcgm_agent.dcgmModuleDenylist(self._dcgmHandle.handle, moduleId)
+
+ '''
+ Get the statuses of all of the modules in DCGM
+
+ Returns: a dcgm_structs.c_dcgmModuleGetStatuses_v1 structure.
+ '''
+
+ def GetStatuses(self):
+ return dcgm_agent.dcgmModuleGetStatuses(self._dcgmHandle.handle)
+
+
+'''
+Class to encapsulate DCGM profiling
+'''
+
+
+class DcgmSystemProfiling:
+ '''
+ Constructor
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ '''
+ Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields
+ from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute.
+ Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012.
+
+ Call this API before you launch one of those tools and Resume() after the tool has completed.
+
+ DCGM will save BLANK values while profiling is paused.
+ Calling this while profiling activities are already paused is fine and will be treated as a no-op.
+ '''
+
+ def Pause(self):
+ return dcgm_agent.dcgmProfPause(self._dcgmHandle.handle)
+
+ '''
+ Resume profiling activities in DCGM that were previously paused with Pause().
+
+ Call this API after you have completed running other NVIDIA developer tools to reenable DCGM
+ profiling metrics.
+
+ DCGM will save BLANK values while profiling is paused.
+
+ Calling this while profiling activities have already been resumed is fine and will be treated as a no-op.
+ '''
+
+ def Resume(self):
+ return dcgm_agent.dcgmProfResume(self._dcgmHandle.handle)
+
+
+'''
+Class to encapsulate global DCGM methods. These apply to a single DcgmHandle, provided to the constructor
+'''
+
+
+class DcgmSystem:
+ '''
+ Constructor
+
+ dcgmHandle is a pydcgm.DcgmHandle instance of the connection that will be used by all methods of this class
+ '''
+
+ def __init__(self, dcgmHandle):
+ self._dcgmHandle = dcgmHandle
+
+ #Child classes
+ self.discovery = DcgmSystemDiscovery(self._dcgmHandle)
+ self.introspect = DcgmSystemIntrospect(self._dcgmHandle)
+ self.fields = DcgmSystemFields()
+ self.modules = DcgmSystemModules(self._dcgmHandle)
+ self.profiling = DcgmSystemProfiling(self._dcgmHandle)
+
+ '''
+ Request that the host engine perform a field value update cycle. If the host
+    engine was started in DCGM_OPERATION_MODE_MANUAL, calling this method is
+ the only way that field values will be updated.
+
+ Note that performing a field value update cycle does not update every field.
+    It only updates fields that are newly watched or fields that haven't updated
+ in enough time to warrant updating again, based on their update frequency.
+
+ waitForUpdate specifies whether this function call should block until the
+ field value update loop is complete or not. Use True if you intend to query
+ values immediately after calling this.
+ '''
+
+ def UpdateAllFields(self, waitForUpdate):
+ ret = dcgm_agent.dcgmUpdateAllFields(self._dcgmHandle.handle,
+ waitForUpdate)
+ #Throw an exception on error
+ dcgm_structs._dcgmCheckReturn(ret)
+
+ '''
+ Get a DcgmGroup instance for the default all-GPUs group. This object is used to
+ perform operations on a group of GPUs. See DcgmGroup.py for details.
+
+ AddGpu() and RemoveGpu() operations are not allowed on the default group
+ '''
+
+ def GetDefaultGroup(self):
+ return pydcgm.DcgmGroup(self._dcgmHandle,
+ groupId=dcgm_structs.DCGM_GROUP_ALL_GPUS)
+
+ '''
+ Get an instance of DcgmGroup with no GPUs. Call AddGpu() on the returned
+ object with GPU IDs from GetAllGpuIds() before performing actions on
+ the returned DcgmGroup instance.
+
+ groupName is the name of the group to create in the host engine. This name must be
+ unique.
+
+ Note: The group will be deleted from the host engine when the returned object goes out of scope
+ '''
+
+ def GetEmptyGroup(self, groupName):
+ return pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
+
+ '''
+ Get an instance of DcgmGroup populated with the gpuIds provided
+
+ groupName is the name of the group to create in the host engine. This name must be
+ unique.
+ gpuIds is the list of GPU IDs to add to the group
+
+ Note: The group will be deleted from the host engine when the returned object goes out of scope
+ '''
+
+ def GetGroupWithGpuIds(self, groupName, gpuIds):
+ newGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
+ for gpuId in gpuIds:
+ newGroup.AddGpu(gpuId)
+ return newGroup
+
+ '''
+ Get an instance of DcgmGroup populated with the provided entities
+
+ groupName is the name of the group to create in the host engine. This name must be
+ unique.
+ entities is the list of entity pairs (type and id) to add to the group
+
+ Note: The group will be deleted from the host engine when the returned object goes out of scope
+ '''
+
+ def GetGroupWithEntities(self, groupName, entities):
+ group = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
+ for entity in entities:
+ group.AddEntity(entity.entityGroupId, entity.entityId)
+
+ return group
+
+ '''
+ Get ids of all DcgmGroups of GPUs. This returns a list containing the ids of the DcgmGroups.
+ '''
+
+ def GetAllGroupIds(self):
+ return dcgm_agent.dcgmGroupGetAllIds(self._dcgmHandle.handle)
+
+ '''
+    Get all of the field groups in the system
+ '''
+
+ def GetAllFieldGroups(self):
+ return dcgm_agent.dcgmFieldGroupGetAll(self._dcgmHandle.handle)
+
+ '''
+ Get a field group's id by its name.
+
+ Returns: Field group ID if found
+ None if not found
+ '''
+
+ def GetFieldGroupIdByName(self, name):
+ allGroups = self.GetAllFieldGroups()
+ for i in range(0, allGroups.numFieldGroups):
+ if allGroups.fieldGroups[i].fieldGroupName == name:
+ return ctypes.c_void_p(allGroups.fieldGroups[i].fieldGroupId)
+
+ return None
+
+ def PauseTelemetryForDiag(self):
+ """Pause DCGM modules from updating field values."""
+ import dcgm_agent_internal
+ dcgm_agent_internal.dcgmPauseTelemetryForDiag(self._dcgmHandle.handle)
+
+ def ResumeTelemetryForDiag(self):
+ """Resume previously paused DCGM modules so that they can update field values."""
+ import dcgm_agent_internal
+ dcgm_agent_internal.dcgmResumeTelemetryForDiag(self._dcgmHandle.handle)
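+
+
+# Illustrative usage sketch (not part of the upstream bindings); the connection
+# parameters are hypothetical and pydcgm.DcgmHandle is assumed to be the handle
+# wrapper shipped with these bindings:
+#
+#     handle = pydcgm.DcgmHandle(ipAddress='127.0.0.1')
+#     system = handle.GetSystem()
+#     gpuIds = system.discovery.GetAllSupportedGpuIds()
+#     group = system.GetGroupWithGpuIds('analyzer_gpus', gpuIds)
+#     system.UpdateAllFields(waitForUpdate=True)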
diff --git a/model_analyzer/monitor/dcgm/common/__init__.py b/model_analyzer/monitor/dcgm/common/__init__.py
new file mode 100644
index 000000000..236f66016
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/common/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py b/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py
new file mode 100644
index 000000000..401dcee05
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py
@@ -0,0 +1,194 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from os import environ
+import argparse
+import logging
+import sys
+
+
+###############################################################################
+def create_parser(
+ publish_port=8000,
+ interval=10,
+ name='the monitoring tool', # Replace with 'prometheus', 'telegraf', etc.
+ field_ids=None,
+ log_file=None,
+ log_level='INFO',
+ dcgm_hostname=environ.get('DCGM_HOSTNAME') or 'localhost',
+):
+ '''
+ Create a parser that defaults to sane parameters.
+
+ The default parameters can be overridden through keyword arguments.
+
+ Note: if DCGM_HOSTNAME is set as an environment variable, it is used as
+ the default instead of localhost
+ '''
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '-p',
+ '--publish-port',
+ dest='publish_port',
+ type=int,
+ default=publish_port,
+ help='TCP port that the client should publish to. Default={}.'.format(
+ publish_port))
+ parser.add_argument(
+ '-i',
+ '--interval',
+ dest='interval',
+ type=int,
+ default=interval,
+ help=
+ 'How often the client should retrieve new values from DCGM in seconds. Default={}.'
+ .format(interval))
+ parser.add_argument(
+ '-f',
+ '--field-ids',
+ dest='field_ids',
+ type=str,
+ default=field_ids,
+ help=
+ 'Comma-separated list of field IDs that should be retrieved from DCGM. '
+ +
+ 'The full list of available field IDs can be obtained from dcgm_fields.h, dcgm_fields.py, '
+ + 'or running \'dcgmi dmon -l\'.')
+ parser.add_argument(
+ '--log-file',
+ dest='logfile',
+ type=str,
+ default=log_file,
+ help=
+ 'A path to a log file for recording what information is being sent to {}'
+ .format(name))
+ parser.add_argument(
+ '--log-level',
+ dest='loglevel',
+ type=str,
+ default=log_level,
+ help=
+ 'Specify a log level to use for logging.\n\tCRITICAL (0) - log only critical errors that drastically affect execution'
+ +
+ '\n\tERROR (1) - Log any error in execution\n\tWARNING (2) - Log all warnings and errors that occur'
+ +
+ '\n\tINFO (3) - Log informational messages about program execution in addition to warnings and errors'
+ +
+ '\n\tDEBUG (4) - Log debugging information in addition to all information about execution'
+ + '\nDefault: {}'.format(log_level))
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument(
+ '-n',
+ '--hostname',
+ dest='hostname',
+ type=str,
+ default=dcgm_hostname,
+ help=
+        'IP/hostname where the client should query DCGM for values. Default={}.'
+ .format(dcgm_hostname))
+ group.add_argument(
+ '-e',
+ '--embedded',
+ dest='embedded',
+ action='store_true',
+ help=
+ 'Launch DCGM from within this process instead of connecting to nv-hostengine.'
+ )
+
+ return parser
+
+
+def add_custom_argument(parser, *args, **kwargs):
+ parser.add_argument(*args, **kwargs)
+
+
+###############################################################################
+def add_target_host_argument(name, parser, default_target='localhost'):
+ parser.add_argument(
+ '-t',
+ '--publish-hostname',
+ dest='publish_hostname',
+ type=str,
+ default=default_target,
+ help='The hostname at which the client will publish the readings to {}'.
+ format(name))
+
+
+###############################################################################
+def run_parser(parser):
+ '''
+ Run a parser created using create_parser
+ '''
+ return parser.parse_args()
+
+
+###############################################################################
+def get_field_ids(args):
+ # This indicates the user supplied a string, so we should override the
+ # default
+ if isinstance(args.field_ids, str):
+ tokens = args.field_ids.split(",")
+ field_ids = [int(token) for token in tokens]
+ return field_ids
+ # The default object should already be an array of ints. Just return it
+ else:
+ return args.field_ids
+
+
+###############################################################################
+def get_log_level(args):
+ levelStr = args.loglevel.upper()
+ if levelStr == '0' or levelStr == 'CRITICAL':
+ numeric_log_level = logging.CRITICAL
+ elif levelStr == '1' or levelStr == 'ERROR':
+ numeric_log_level = logging.ERROR
+ elif levelStr == '2' or levelStr == 'WARNING':
+ numeric_log_level = logging.WARNING
+ elif levelStr == '3' or levelStr == 'INFO':
+ numeric_log_level = logging.INFO
+ elif levelStr == '4' or levelStr == 'DEBUG':
+ numeric_log_level = logging.DEBUG
+ else:
+ print("Could not understand the specified --log-level '%s'" %
+ (args.loglevel))
+ args.print_help()
+ sys.exit(2)
+ return numeric_log_level
+
+
+###############################################################################
+def parse_command_line(name, default_port, add_target_host=False):
+ # Fields we accept raw from the CLI
+ FIELDS_AS_IS = ['publish_port', 'interval', 'logfile', 'publish_hostname']
+
+ parser = create_parser(
+ name=name,
+ publish_port=default_port,
+ )
+
+ if add_target_host:
+ add_target_host_argument(name, parser)
+
+ args = run_parser(parser)
+ field_ids = get_field_ids(args)
+ log_level = get_log_level(args)
+
+ args_as_dict = vars(args)
+ settings = {i: args_as_dict[i] for i in FIELDS_AS_IS}
+ settings['dcgm_hostname'] = None if args.embedded else args.hostname
+ settings['field_ids'] = field_ids
+ settings['log_level'] = log_level
+
+ return settings
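+
+
+###############################################################################
+# Illustrative usage sketch (not part of the upstream bindings): a client would
+# typically call parse_command_line() and pass the resulting settings to its
+# DcgmReader subclass; MyReader is hypothetical and the keys mirror the
+# dictionary built above:
+#
+#     settings = parse_command_line('my exporter', 8000)
+#     reader = MyReader(hostname=settings['dcgm_hostname'],
+#                       fieldIds=settings['field_ids'],
+#                       publish_port=settings['publish_port'])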
diff --git a/model_analyzer/monitor/dcgm/common/dcgm_client_main.py b/model_analyzer/monitor/dcgm/common/dcgm_client_main.py
new file mode 100644
index 000000000..54cd04673
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/common/dcgm_client_main.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from time import sleep
+from . import dcgm_client_cli_parser as cli
+import signal
+
+
+###############################################################################
+def exit_handler(signum, frame):
+ # The Prometheus client does something smarter but more complex
+ # Here we just exit
+ exit()
+
+
+###############################################################################
+def initialize_signal_handlers():
+ signal.signal(signal.SIGINT, exit_handler)
+ signal.signal(signal.SIGTERM, exit_handler)
+
+
+###############################################################################
+def main(DRConstructor, name, default_port, add_target_host=False):
+ '''
+ This main function should work for most DCGM clients. It creates a
+ DcgmReader object using DRConstructor and enters a loop that queries DCGM
+ for data
+
+ Arguments
+ ---------
+ DRConstructor: A constructor for a DcgmReader. The constructor must
+ accept the following keyword arguments:
+ - hostname: DCGM hostname
+ - publish_port: port on which the data is published
+ In some cases, the constructor will also need to accept:
+ - publish_hostname: hostname the data is published to
+ - field_ids: field ids to query and publish
+ name: The name of the client. This is displayed to the user
+ default_port: Default port to publish to
+
+ Keyword arguments
+ -----------------
+ add_target_host: Boolean that indicates whether this client accepts a
+ publish hostname
+
+ '''
+
+ initialize_signal_handlers()
+ settings = cli.parse_command_line(
+ name,
+ default_port,
+ add_target_host=add_target_host,
+ )
+
+ # Create a dictionary for the arguments because field_ids might not be
+ # provided (if it's None) when we want to use the default in DcgmReader
+ dr_args = {
+ 'hostname': settings['dcgm_hostname'],
+ 'publish_port': settings['publish_port'],
+ }
+
+ # publish_hostname is only available if we add the target_host parameter
+ if add_target_host:
+ dr_args['publish_hostname'] = settings['publish_hostname']
+
+ if settings['field_ids']:
+ dr_args['fieldIds'] = settings['field_ids']
+
+ dr = DRConstructor(**dr_args)
+
+ try:
+ while True:
+ dr.Process()
+ sleep(settings['interval'])
+ except KeyboardInterrupt:
+ print('Caught CTRL-C. Exiting')
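+
+
+###############################################################################
+# Illustrative usage sketch (not part of the upstream bindings): a concrete
+# client module would pass its DcgmReader subclass into main(); MyDcgmReader
+# and the port are hypothetical:
+#
+#     from model_analyzer.monitor.dcgm.common.dcgm_client_main import main
+#     main(MyDcgmReader, 'my exporter', 8000, add_target_host=True)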
diff --git a/model_analyzer/monitor/dcgm/dcgm_agent.py b/model_analyzer/monitor/dcgm/dcgm_agent.py
index 809b57f66..320db76d2 100755
--- a/model_analyzer/monitor/dcgm/dcgm_agent.py
+++ b/model_analyzer/monitor/dcgm/dcgm_agent.py
@@ -1,6 +1,4 @@
-#!/usr/bin/env python3
-
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,31 +11,61 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+##
+# Python bindings for the internal API of DCGM library (dcgm_agent.h)
+##
-from ctypes import (
- CFUNCTYPE,
- POINTER,
- byref,
- c_double,
- c_int,
- c_int32,
- c_int64,
- c_uint,
- c_uint16,
- c_uint32,
- c_uint64,
- c_void_p,
- py_object,
-)
-
-import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+from ctypes import *
+import functools
+
+
+def ensure_byte_strings():
+ """
+    Ensures that we don't call C APIs with unicode strings in the arguments:
+    every unicode argument gets converted to UTF-8 bytes before the function is called.
+ """
+
+ def convert_result_from_bytes(result):
+ if isinstance(result, bytes):
+ return result.decode('utf-8')
+ if isinstance(result, list):
+ return list(map(convert_result_from_bytes, result))
+ if isinstance(result, tuple):
+ return tuple(map(convert_result_from_bytes, result))
+ return result
+
+ def decorator(fn):
+
+ @functools.wraps(fn)
+ def wrapper(*args, **kwargs):
+ newargs = []
+ newkwargs = {}
+ for arg in args:
+ if isinstance(arg, str):
+ newargs.append(bytes(arg, 'utf-8'))
+ else:
+ newargs.append(arg)
+ for k, v in kwargs.items():
+ if isinstance(v, str):
+ newkwargs[k] = bytes(v, 'utf-8')
+ else:
+ newkwargs[k] = v
+ newargs = tuple(newargs)
+ return fn(*newargs, **newkwargs)
+
+ return wrapper
+
+ return decorator
+
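+
+# Illustrative example (not part of the upstream bindings) of the decorator's
+# effect: a wrapped call such as
+#
+#     dcgmConnect("127.0.0.1")
+#
+# reaches the underlying C entry point with b"127.0.0.1", because every str
+# argument is re-encoded as UTF-8 bytes before the call is forwarded.
+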
# Provides access to functions from dcgm_agent_internal
dcgmFP = dcgm_structs._dcgmGetFunctionPointer
# This method is used to initialize DCGM
+@ensure_byte_strings()
def dcgmInit():
dcgm_handle = c_void_p()
fn = dcgmFP("dcgmInit")
@@ -47,6 +75,7 @@ def dcgmInit():
# This method is used to shutdown DCGM Engine
+@ensure_byte_strings()
def dcgmShutdown():
fn = dcgmFP("dcgmShutdown")
ret = fn()
@@ -54,6 +83,7 @@ def dcgmShutdown():
return ret
+@ensure_byte_strings()
def dcgmStartEmbedded(opMode):
dcgm_handle = c_void_p()
fn = dcgmFP("dcgmStartEmbedded")
@@ -62,6 +92,7 @@ def dcgmStartEmbedded(opMode):
return dcgm_handle
+@ensure_byte_strings()
def dcgmStopEmbedded(dcgm_handle):
fn = dcgmFP("dcgmStopEmbedded")
ret = fn(dcgm_handle)
@@ -69,6 +100,7 @@ def dcgmStopEmbedded(dcgm_handle):
return ret
+@ensure_byte_strings()
def dcgmConnect(ip_address):
dcgm_handle = c_void_p()
fn = dcgmFP("dcgmConnect")
@@ -77,9 +109,10 @@ def dcgmConnect(ip_address):
return dcgm_handle
-def dcgmConnect_v2(
- ip_address, connectParams, version=dcgm_structs.c_dcgmConnectV2Params_version
-):
+@ensure_byte_strings()
+def dcgmConnect_v2(ip_address,
+ connectParams,
+ version=dcgm_structs.c_dcgmConnectV2Params_version):
connectParams.version = version
dcgm_handle = c_void_p()
fn = dcgmFP("dcgmConnect_v2")
@@ -88,6 +121,7 @@ def dcgmConnect_v2(
return dcgm_handle
+@ensure_byte_strings()
def dcgmDisconnect(dcgm_handle):
fn = dcgmFP("dcgmDisconnect")
ret = fn(dcgm_handle)
@@ -95,6 +129,7 @@ def dcgmDisconnect(dcgm_handle):
return ret
+@ensure_byte_strings()
def dcgmGetAllSupportedDevices(dcgm_handle):
c_count = c_uint()
gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES
@@ -102,9 +137,10 @@ def dcgmGetAllSupportedDevices(dcgm_handle):
fn = dcgmFP("dcgmGetAllSupportedDevices")
ret = fn(dcgm_handle, c_gpuid_list, byref(c_count))
dcgm_structs._dcgmCheckReturn(ret)
- return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]]
+ return list(c_gpuid_list[0:int(c_count.value)])
+@ensure_byte_strings()
def dcgmGetAllDevices(dcgm_handle):
c_count = c_uint()
gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES
@@ -112,18 +148,26 @@ def dcgmGetAllDevices(dcgm_handle):
fn = dcgmFP("dcgmGetAllDevices")
ret = fn(dcgm_handle, c_gpuid_list, byref(c_count))
dcgm_structs._dcgmCheckReturn(ret)
- return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]]
+ return list(c_gpuid_list[0:int(c_count.value)])
-def dcgmGetDeviceAttributes(dcgm_handle, gpuId):
+@ensure_byte_strings()
+def dcgmGetDeviceAttributes(dcgm_handle,
+ gpuId,
+ version=dcgm_structs.dcgmDeviceAttributes_version3):
fn = dcgmFP("dcgmGetDeviceAttributes")
- device_values = dcgm_structs.c_dcgmDeviceAttributes_v2()
- device_values.version = dcgm_structs.dcgmDeviceAttributes_version2
+ if version == dcgm_structs.dcgmDeviceAttributes_version3:
+ device_values = dcgm_structs.c_dcgmDeviceAttributes_v3()
+ device_values.version = dcgm_structs.dcgmDeviceAttributes_version3
+ else:
+ dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)
+
ret = fn(dcgm_handle, c_int(gpuId), byref(device_values))
dcgm_structs._dcgmCheckReturn(ret)
return device_values
+@ensure_byte_strings()
def dcgmGetEntityGroupEntities(dcgm_handle, entityGroup, flags):
capacity = dcgm_structs.DCGM_GROUP_MAX_ENTITIES
c_count = c_int32(capacity)
@@ -135,24 +179,27 @@ def dcgmGetEntityGroupEntities(dcgm_handle, entityGroup, flags):
return c_entityIds[0 : int(c_count.value)]
+@ensure_byte_strings()
def dcgmGetNvLinkLinkStatus(dcgm_handle):
- linkStatus = dcgm_structs.c_dcgmNvLinkStatus_v2()
- linkStatus.version = dcgm_structs.dcgmNvLinkStatus_version2
+ linkStatus = dcgm_structs.c_dcgmNvLinkStatus_v3()
+ linkStatus.version = dcgm_structs.dcgmNvLinkStatus_version3
fn = dcgmFP("dcgmGetNvLinkLinkStatus")
ret = fn(dcgm_handle, byref(linkStatus))
dcgm_structs._dcgmCheckReturn(ret)
return linkStatus
+@ensure_byte_strings()
def dcgmGetGpuInstanceHierarchy(dcgm_handle):
- hierarchy = dcgm_structs.c_dcgmMigHierarchy_v1()
- hierarchy.version = dcgm_structs.c_dcgmMigHierarchy_version1
+ hierarchy = dcgm_structs.c_dcgmMigHierarchy_v2()
+ hierarchy.version = dcgm_structs.c_dcgmMigHierarchy_version2
fn = dcgmFP("dcgmGetGpuInstanceHierarchy")
ret = fn(dcgm_handle, byref(hierarchy))
dcgm_structs._dcgmCheckReturn(ret)
return hierarchy
+@ensure_byte_strings()
def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags):
fn = dcgmFP("dcgmCreateMigEntity")
cme = dcgm_structs.c_dcgmCreateMigEntity_v1()
@@ -165,6 +212,7 @@ def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags):
dcgm_structs._dcgmCheckReturn(ret)
+@ensure_byte_strings()
def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags):
fn = dcgmFP("dcgmDeleteMigEntity")
dme = dcgm_structs.c_dcgmDeleteMigEntity_v1()
@@ -176,6 +224,7 @@ def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags):
dcgm_structs._dcgmCheckReturn(ret)
+@ensure_byte_strings()
def dcgmGroupCreate(dcgm_handle, type, groupName):
c_group_id = c_void_p()
fn = dcgmFP("dcgmGroupCreate")
@@ -184,6 +233,7 @@ def dcgmGroupCreate(dcgm_handle, type, groupName):
return c_group_id
+@ensure_byte_strings()
def dcgmGroupDestroy(dcgm_handle, group_id):
fn = dcgmFP("dcgmGroupDestroy")
ret = fn(dcgm_handle, group_id)
@@ -191,6 +241,7 @@ def dcgmGroupDestroy(dcgm_handle, group_id):
return ret
+@ensure_byte_strings()
def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id):
fn = dcgmFP("dcgmGroupAddDevice")
ret = fn(dcgm_handle, group_id, gpu_id)
@@ -198,6 +249,7 @@ def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id):
return ret
+@ensure_byte_strings()
def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, entityId):
fn = dcgmFP("dcgmGroupAddEntity")
ret = fn(dcgm_handle, group_id, entityGroupId, entityId)
@@ -205,6 +257,7 @@ def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, entityId):
return ret
+@ensure_byte_strings()
def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id):
fn = dcgmFP("dcgmGroupRemoveDevice")
ret = fn(dcgm_handle, group_id, gpu_id)
@@ -212,6 +265,7 @@ def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id):
return ret
+@ensure_byte_strings()
def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, entityId):
fn = dcgmFP("dcgmGroupRemoveEntity")
ret = fn(dcgm_handle, group_id, entityGroupId, entityId)
@@ -219,12 +273,13 @@ def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, entityId):
return ret
-def dcgmGroupGetInfo(
- dcgm_handle, group_id, version=dcgm_structs.c_dcgmGroupInfo_version2
-):
+@ensure_byte_strings()
+def dcgmGroupGetInfo(dcgm_handle,
+ group_id,
+ version=dcgm_structs.c_dcgmGroupInfo_version2):
fn = dcgmFP("dcgmGroupGetInfo")
- # support the old version of the request since the host engine does
+ #support the old version of the request since the host engine does
if version == dcgm_structs.c_dcgmGroupInfo_version2:
device_values = dcgm_structs.c_dcgmGroupInfo_v2()
device_values.version = dcgm_structs.c_dcgmGroupInfo_version2
@@ -236,6 +291,7 @@ def dcgmGroupGetInfo(
return device_values
+@ensure_byte_strings()
def dcgmGroupGetAllIds(dcgmHandle):
fn = dcgmFP("dcgmGroupGetAllIds")
c_count = c_uint()
@@ -243,25 +299,22 @@ def dcgmGroupGetAllIds(dcgmHandle):
c_groupIdList = groupIdList()
ret = fn(dcgmHandle, c_groupIdList, byref(c_count))
dcgm_structs._dcgmCheckReturn(ret)
- return map(None, c_groupIdList[0 : int(c_count.value)])
+ return list(c_groupIdList[0:int(c_count.value)])
+@ensure_byte_strings()
def dcgmFieldGroupCreate(dcgm_handle, fieldIds, fieldGroupName):
c_field_group_id = c_void_p()
c_num_field_ids = c_int32(len(fieldIds))
c_field_ids = (c_uint16 * len(fieldIds))(*fieldIds)
fn = dcgmFP("dcgmFieldGroupCreate")
- ret = fn(
- dcgm_handle,
- c_num_field_ids,
- byref(c_field_ids),
- fieldGroupName,
- byref(c_field_group_id),
- )
+ ret = fn(dcgm_handle, c_num_field_ids, byref(c_field_ids), fieldGroupName,
+ byref(c_field_group_id))
dcgm_structs._dcgmCheckReturn(ret)
return c_field_group_id
+@ensure_byte_strings()
def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId):
fn = dcgmFP("dcgmFieldGroupDestroy")
ret = fn(dcgm_handle, fieldGroupId)
@@ -269,6 +322,7 @@ def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId):
return ret
+@ensure_byte_strings()
def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId):
c_fieldGroupInfo = dcgm_structs.c_dcgmFieldGroupInfo_v1()
c_fieldGroupInfo.version = dcgm_structs.dcgmFieldGroupInfo_version1
@@ -279,6 +333,7 @@ def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId):
return c_fieldGroupInfo
+@ensure_byte_strings()
def dcgmFieldGroupGetAll(dcgm_handle):
c_allGroupInfo = dcgm_structs.c_dcgmAllFieldGroup_v1()
c_allGroupInfo.version = dcgm_structs.dcgmAllFieldGroup_version1
@@ -288,6 +343,7 @@ def dcgmFieldGroupGetAll(dcgm_handle):
return c_allGroupInfo
+@ensure_byte_strings()
def dcgmStatusCreate():
c_status_handle = c_void_p()
fn = dcgmFP("dcgmStatusCreate")
@@ -296,6 +352,7 @@ def dcgmStatusCreate():
return c_status_handle
+@ensure_byte_strings()
def dcgmStatusDestroy(status_handle):
fn = dcgmFP("dcgmStatusDestroy")
ret = fn(status_handle)
@@ -303,6 +360,7 @@ def dcgmStatusDestroy(status_handle):
return ret
+@ensure_byte_strings()
def dcgmStatusGetCount(status_handle):
c_count = c_uint()
fn = dcgmFP("dcgmStatusGetCount")
@@ -311,6 +369,7 @@ def dcgmStatusGetCount(status_handle):
return c_count.value
+@ensure_byte_strings()
def dcgmStatusPopError(status_handle):
c_errorInfo = dcgm_structs.c_dcgmErrorInfo_v1()
fn = dcgmFP("dcgmStatusPopError")
@@ -321,6 +380,7 @@ def dcgmStatusPopError(status_handle):
return None
+@ensure_byte_strings()
def dcgmStatusClear(status_handle):
fn = dcgmFP("dcgmStatusClear")
ret = fn(status_handle)
@@ -328,6 +388,7 @@ def dcgmStatusClear(status_handle):
return ret
+@ensure_byte_strings()
def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle):
fn = dcgmFP("dcgmConfigSet")
configToSet.version = dcgm_structs.dcgmDeviceConfig_version1
@@ -336,6 +397,7 @@ def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle):
return ret
+@ensure_byte_strings()
def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle):
fn = dcgmFP("dcgmConfigGet")
@@ -345,11 +407,13 @@ def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle):
for index in range(0, count):
c_config_values[index].version = dcgm_structs.dcgmDeviceConfig_version1
- ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values, status_handle)
+ ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values,
+ status_handle)
dcgm_structs._dcgmCheckReturn(ret)
- return map(None, c_config_values[0:count])
+ return list(c_config_values[0:count])
+@ensure_byte_strings()
def dcgmConfigEnforce(dcgm_handle, group_id, status_handle):
fn = dcgmFP("dcgmConfigEnforce")
ret = fn(dcgm_handle, group_id, status_handle)
@@ -358,6 +422,7 @@ def dcgmConfigEnforce(dcgm_handle, group_id, status_handle):
# This method is used to tell the cache manager to update all fields
+@ensure_byte_strings()
def dcgmUpdateAllFields(dcgm_handle, waitForUpdate):
fn = dcgmFP("dcgmUpdateAllFields")
ret = fn(dcgm_handle, c_int(waitForUpdate))
@@ -366,6 +431,7 @@ def dcgmUpdateAllFields(dcgm_handle, waitForUpdate):
# This method is used to get the policy information
+@ensure_byte_strings()
def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle):
fn = dcgmFP("dcgmPolicyGet")
policy_array = count * dcgm_structs.c_dcgmPolicy_v1
@@ -381,6 +447,7 @@ def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle):
# This method is used to set the policy information
+@ensure_byte_strings()
def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle):
fn = dcgmFP("dcgmPolicySet")
ret = fn(dcgm_handle, group_id, byref(policy), status_handle)
@@ -388,56 +455,38 @@ def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle):
return ret
-# First parameter below is the return type
+#First parameter below is the return type
dcgmFieldValueEnumeration_f = CFUNCTYPE(
- c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32, c_void_p
-)
+ c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32,
+ c_void_p)
dcgmFieldValueEntityEnumeration_f = CFUNCTYPE(
- c_int32,
- c_uint32,
- c_uint32,
- POINTER(dcgm_structs.c_dcgmFieldValue_v1),
- c_int32,
- c_void_p,
-)
-
-
-def dcgmGetValuesSince(
- dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData
-):
+ c_int32, c_uint32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1),
+ c_int32, c_void_p)
+
+
+@ensure_byte_strings()
+def dcgmGetValuesSince(dcgm_handle, groupId, fieldGroupId, sinceTimestamp,
+ enumCB, userData):
fn = dcgmFP("dcgmGetValuesSince")
c_nextSinceTimestamp = c_int64()
- ret = fn(
- dcgm_handle,
- groupId,
- fieldGroupId,
- c_int64(sinceTimestamp),
- byref(c_nextSinceTimestamp),
- enumCB,
- py_object(userData),
- )
+ ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp),
+ byref(c_nextSinceTimestamp), enumCB, py_object(userData))
dcgm_structs._dcgmCheckReturn(ret)
return c_nextSinceTimestamp.value
-def dcgmGetValuesSince_v2(
- dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData
-):
+@ensure_byte_strings()
+def dcgmGetValuesSince_v2(dcgm_handle, groupId, fieldGroupId, sinceTimestamp,
+ enumCB, userData):
fn = dcgmFP("dcgmGetValuesSince_v2")
c_nextSinceTimestamp = c_int64()
- ret = fn(
- dcgm_handle,
- groupId,
- fieldGroupId,
- c_int64(sinceTimestamp),
- byref(c_nextSinceTimestamp),
- enumCB,
- py_object(userData),
- )
+ ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp),
+ byref(c_nextSinceTimestamp), enumCB, py_object(userData))
dcgm_structs._dcgmCheckReturn(ret)
return c_nextSinceTimestamp.value
+@ensure_byte_strings()
def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData):
fn = dcgmFP("dcgmGetLatestValues")
ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData))
@@ -445,29 +494,26 @@ def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData):
return ret
-def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB, userData):
+@ensure_byte_strings()
+def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB,
+ userData):
fn = dcgmFP("dcgmGetLatestValues_v2")
ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData))
dcgm_structs._dcgmCheckReturn(ret)
return ret
-def dcgmWatchFields(
- dcgm_handle, groupId, fieldGroupId, updateFreq, maxKeepAge, maxKeepSamples
-):
+@ensure_byte_strings()
+def dcgmWatchFields(dcgm_handle, groupId, fieldGroupId, updateFreq, maxKeepAge,
+ maxKeepSamples):
fn = dcgmFP("dcgmWatchFields")
- ret = fn(
- dcgm_handle,
- groupId,
- fieldGroupId,
- c_int64(updateFreq),
- c_double(maxKeepAge),
- c_int32(maxKeepSamples),
- )
+ ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(updateFreq),
+ c_double(maxKeepAge), c_int32(maxKeepSamples))
dcgm_structs._dcgmCheckReturn(ret)
return ret
+@ensure_byte_strings()
def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId):
fn = dcgmFP("dcgmUnwatchFields")
ret = fn(dcgm_handle, groupId, fieldGroupId)
@@ -475,6 +521,7 @@ def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId):
return ret
+@ensure_byte_strings()
def dcgmHealthSet(dcgm_handle, groupId, systems):
fn = dcgmFP("dcgmHealthSet")
ret = fn(dcgm_handle, groupId, systems)
@@ -482,6 +529,7 @@ def dcgmHealthSet(dcgm_handle, groupId, systems):
return ret
+@ensure_byte_strings()
def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge):
params = dcgm_structs.c_dcgmHealthSetParams_v2()
params.version = dcgm_structs.dcgmHealthSetParams_version2
@@ -496,6 +544,7 @@ def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge):
return ret
+@ensure_byte_strings()
def dcgmHealthGet(dcgm_handle, groupId):
c_systems = c_int32()
fn = dcgmFP("dcgmHealthGet")
@@ -504,9 +553,10 @@ def dcgmHealthGet(dcgm_handle, groupId):
return c_systems.value
-def dcgmHealthCheck(
- dcgm_handle, groupId, version=dcgm_structs.dcgmHealthResponse_version4
-):
+@ensure_byte_strings()
+def dcgmHealthCheck(dcgm_handle,
+ groupId,
+ version=dcgm_structs.dcgmHealthResponse_version4):
if version != dcgm_structs.dcgmHealthResponse_version4:
dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)
@@ -518,13 +568,16 @@ def dcgmHealthCheck(
return c_results
-def dcgmPolicyRegister(dcgm_handle, groupId, condition, beginCallback, finishCallback):
+@ensure_byte_strings()
+def dcgmPolicyRegister(dcgm_handle, groupId, condition, beginCallback,
+ finishCallback):
fn = dcgmFP("dcgmPolicyRegister")
ret = fn(dcgm_handle, groupId, condition, beginCallback, finishCallback)
dcgm_structs._dcgmCheckReturn(ret)
return ret
+@ensure_byte_strings()
def dcgmPolicyUnregister(dcgm_handle, groupId, condition):
fn = dcgmFP("dcgmPolicyUnregister")
ret = fn(dcgm_handle, groupId, condition)
@@ -532,6 +585,7 @@ def dcgmPolicyUnregister(dcgm_handle, groupId, condition):
return ret
+@ensure_byte_strings()
def dcgmPolicyTrigger(dcgm_handle):
fn = dcgmFP("dcgmPolicyTrigger")
ret = fn(dcgm_handle)
@@ -549,32 +603,34 @@ def helperDiagCheckReturn(ret, response):
info = "%s" % response.systemError.msg
e.SetAdditionalInfo(info)
- raise e # pylint: disable=E0710
+ raise e
else:
raise
return response
-def dcgmActionValidate_v2(
- dcgm_handle, runDiagInfo, runDiagVersion=dcgm_structs.dcgmRunDiag_version6
-):
- response = dcgm_structs.c_dcgmDiagResponse_v6()
+@ensure_byte_strings()
+def dcgmActionValidate_v2(dcgm_handle,
+ runDiagInfo,
+ runDiagVersion=dcgm_structs.dcgmRunDiag_version7):
+ response = dcgm_structs.c_dcgmDiagResponse_v8()
runDiagInfo.version = runDiagVersion
- response.version = dcgm_structs.dcgmDiagResponse_version6
+ response.version = dcgm_structs.dcgmDiagResponse_version8
fn = dcgmFP("dcgmActionValidate_v2")
ret = fn(dcgm_handle, byref(runDiagInfo), byref(response))
return helperDiagCheckReturn(ret, response)
+@ensure_byte_strings()
def dcgmActionValidate(dcgm_handle, group_id, validate):
- response = dcgm_structs.c_dcgmDiagResponse_v6()
- response.version = dcgm_structs.dcgmDiagResponse_version6
+ response = dcgm_structs.c_dcgmDiagResponse_v8()
+ response.version = dcgm_structs.dcgmDiagResponse_version8
# Put the group_id and validate into a dcgmRunDiag struct
- runDiagInfo = dcgm_structs.c_dcgmRunDiag_v6()
- runDiagInfo.version = dcgm_structs.dcgmRunDiag_version6
+ runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+ runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
runDiagInfo.validate = validate
runDiagInfo.groupId = group_id
@@ -584,28 +640,27 @@ def dcgmActionValidate(dcgm_handle, group_id, validate):
return helperDiagCheckReturn(ret, response)
+@ensure_byte_strings()
def dcgmRunDiagnostic(dcgm_handle, group_id, diagLevel):
- response = dcgm_structs.c_dcgmDiagResponse_v6()
- response.version = dcgm_structs.dcgmDiagResponse_version6
+ response = dcgm_structs.c_dcgmDiagResponse_v8()
+ response.version = dcgm_structs.dcgmDiagResponse_version8
fn = dcgmFP("dcgmRunDiagnostic")
ret = fn(dcgm_handle, group_id, diagLevel, byref(response))
return helperDiagCheckReturn(ret, response)
-def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples):
+@ensure_byte_strings()
+def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge,
+ maxKeepSamples):
fn = dcgmFP("dcgmWatchPidFields")
- ret = fn(
- dcgm_handle,
- groupId,
- c_int64(updateFreq),
- c_double(maxKeepAge),
- c_int32(maxKeepSamples),
- )
+ ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge),
+ c_int32(maxKeepSamples))
dcgm_structs._dcgmCheckReturn(ret)
return ret
+@ensure_byte_strings()
def dcgmGetPidInfo(dcgm_handle, groupId, pid):
fn = dcgmFP("dcgmGetPidInfo")
pidInfo = dcgm_structs.c_dcgmPidInfo_v2()
@@ -618,6 +673,7 @@ def dcgmGetPidInfo(dcgm_handle, groupId, pid):
return pidInfo
+@ensure_byte_strings()
def dcgmGetDeviceTopology(dcgm_handle, gpuId):
devtopo = dcgm_structs.c_dcgmDeviceTopology_v1()
fn = dcgmFP("dcgmGetDeviceTopology")
@@ -626,6 +682,7 @@ def dcgmGetDeviceTopology(dcgm_handle, gpuId):
return devtopo
+@ensure_byte_strings()
def dcgmGetGroupTopology(dcgm_handle, groupId):
grouptopo = dcgm_structs.c_dcgmGroupTopology_v1()
fn = dcgmFP("dcgmGetGroupTopology")
@@ -634,19 +691,17 @@ def dcgmGetGroupTopology(dcgm_handle, groupId):
return grouptopo
-def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples):
+@ensure_byte_strings()
+def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge,
+ maxKeepSamples):
fn = dcgmFP("dcgmWatchJobFields")
- ret = fn(
- dcgm_handle,
- groupId,
- c_int64(updateFreq),
- c_double(maxKeepAge),
- c_int32(maxKeepSamples),
- )
+ ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge),
+ c_int32(maxKeepSamples))
dcgm_structs._dcgmCheckReturn(ret)
return ret
+@ensure_byte_strings()
def dcgmJobStartStats(dcgm_handle, groupId, jobid):
fn = dcgmFP("dcgmJobStartStats")
ret = fn(dcgm_handle, groupId, jobid)
@@ -654,6 +709,7 @@ def dcgmJobStartStats(dcgm_handle, groupId, jobid):
return ret
+@ensure_byte_strings()
def dcgmJobStopStats(dcgm_handle, jobid):
fn = dcgmFP("dcgmJobStopStats")
ret = fn(dcgm_handle, jobid)
@@ -661,6 +717,7 @@ def dcgmJobStopStats(dcgm_handle, jobid):
return ret
+@ensure_byte_strings()
def dcgmJobGetStats(dcgm_handle, jobid):
fn = dcgmFP("dcgmJobGetStats")
jobInfo = dcgm_structs.c_dcgmJobInfo_v3()
@@ -672,6 +729,7 @@ def dcgmJobGetStats(dcgm_handle, jobid):
return jobInfo
+@ensure_byte_strings()
def dcgmJobRemove(dcgm_handle, jobid):
fn = dcgmFP("dcgmJobRemove")
ret = fn(dcgm_handle, jobid)
@@ -679,6 +737,7 @@ def dcgmJobRemove(dcgm_handle, jobid):
return ret
+@ensure_byte_strings()
def dcgmJobRemoveAll(dcgm_handle):
fn = dcgmFP("dcgmJobRemoveAll")
ret = fn(dcgm_handle)
@@ -686,13 +745,7 @@ def dcgmJobRemoveAll(dcgm_handle):
return ret
-def dcgmIntrospectToggleState(dcgm_handle, enabledState):
- fn = dcgmFP("dcgmIntrospectToggleState")
- ret = fn(dcgm_handle, enabledState)
- dcgm_structs._dcgmCheckReturn(ret)
- return ret
-
-
+@ensure_byte_strings()
def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True):
fn = dcgmFP("dcgmIntrospectGetHostengineMemoryUsage")
@@ -704,6 +757,7 @@ def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True):
return memInfo
+@ensure_byte_strings()
def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True):
fn = dcgmFP("dcgmIntrospectGetHostengineCpuUtilization")
@@ -715,88 +769,45 @@ def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True):
return cpuUtil
-def dcgmIntrospectGetFieldsExecTime(dcgm_handle, introspectContext, waitIfNoData=True):
- fn = dcgmFP("dcgmIntrospectGetFieldsExecTime")
-
- execTime = dcgm_structs.c_dcgmIntrospectFullFieldsExecTime_v2()
- execTime.version = dcgm_structs.dcgmIntrospectFullFieldsExecTime_version2
-
- ret = fn(dcgm_handle, byref(introspectContext), byref(execTime), waitIfNoData)
- dcgm_structs._dcgmCheckReturn(ret)
- return execTime
-
-
-def dcgmIntrospectGetFieldsMemoryUsage(
- dcgm_handle, introspectContext, waitIfNoData=True
-):
- fn = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage")
-
- memInfo = dcgm_structs.c_dcgmIntrospectFullMemory_v1()
- memInfo.version = dcgm_structs.dcgmIntrospectFullMemory_version1
-
- ret = fn(dcgm_handle, byref(introspectContext), byref(memInfo), waitIfNoData)
- dcgm_structs._dcgmCheckReturn(ret)
- return memInfo
-
-
-def dcgmIntrospectUpdateAll(dcgmHandle, waitForUpdate):
- fn = dcgmFP("dcgmIntrospectUpdateAll")
- ret = fn(dcgmHandle, c_int(waitForUpdate))
- dcgm_structs._dcgmCheckReturn(ret)
-
-
+@ensure_byte_strings()
def dcgmEntityGetLatestValues(dcgmHandle, entityGroup, entityId, fieldIds):
fn = dcgmFP("dcgmEntityGetLatestValues")
field_values = (dcgm_structs.c_dcgmFieldValue_v1 * len(fieldIds))()
id_values = (c_uint16 * len(fieldIds))(*fieldIds)
- ret = fn(
- dcgmHandle,
- c_uint(entityGroup),
- dcgm_fields.c_dcgm_field_eid_t(entityId),
- id_values,
- c_uint(len(fieldIds)),
- field_values,
- )
+ ret = fn(dcgmHandle, c_uint(entityGroup),
+ dcgm_fields.c_dcgm_field_eid_t(entityId), id_values,
+ c_uint(len(fieldIds)), field_values)
dcgm_structs._dcgmCheckReturn(ret)
return field_values
+@ensure_byte_strings()
def dcgmEntitiesGetLatestValues(dcgmHandle, entities, fieldIds, flags):
fn = dcgmFP("dcgmEntitiesGetLatestValues")
numFvs = len(fieldIds) * len(entities)
field_values = (dcgm_structs.c_dcgmFieldValue_v2 * numFvs)()
- entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t * len(entities))(*entities)
+ entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t *
+ len(entities))(*entities)
field_id_values = (c_uint16 * len(fieldIds))(*fieldIds)
- ret = fn(
- dcgmHandle,
- entities_values,
- c_uint(len(entities)),
- field_id_values,
- c_uint(len(fieldIds)),
- flags,
- field_values,
- )
+ ret = fn(dcgmHandle, entities_values, c_uint(len(entities)),
+ field_id_values, c_uint(len(fieldIds)), flags, field_values)
dcgm_structs._dcgmCheckReturn(ret)
return field_values
+@ensure_byte_strings()
def dcgmSelectGpusByTopology(dcgmHandle, inputGpuIds, numGpus, hintFlags):
fn = dcgmFP("dcgmSelectGpusByTopology")
outputGpuIds = c_int64()
- ret = fn(
- dcgmHandle,
- c_uint64(inputGpuIds),
- c_uint32(numGpus),
- byref(outputGpuIds),
- c_uint64(hintFlags),
- )
+ ret = fn(dcgmHandle, c_uint64(inputGpuIds), c_uint32(numGpus),
+ byref(outputGpuIds), c_uint64(hintFlags))
dcgm_structs._dcgmCheckReturn(ret)
return outputGpuIds
-def dcgmGetFieldSummary(
- dcgmHandle, fieldId, entityGroupType, entityId, summaryMask, startTime, endTime
-):
+@ensure_byte_strings()
+def dcgmGetFieldSummary(dcgmHandle, fieldId, entityGroupType, entityId,
+ summaryMask, startTime, endTime):
fn = dcgmFP("dcgmGetFieldSummary")
request = dcgm_structs.c_dcgmFieldSummaryRequest_v1()
request.version = dcgm_structs.dcgmFieldSummaryRequest_version1
@@ -811,13 +822,15 @@ def dcgmGetFieldSummary(
return request
-def dcgmModuleBlacklist(dcgmHandle, moduleId):
- fn = dcgmFP("dcgmModuleBlacklist")
+@ensure_byte_strings()
+def dcgmModuleDenylist(dcgmHandle, moduleId):
+ fn = dcgmFP("dcgmModuleDenylist")
ret = fn(dcgmHandle, c_uint32(moduleId))
dcgm_structs._dcgmCheckReturn(ret)
return ret
+@ensure_byte_strings()
def dcgmModuleGetStatuses(dcgmHandle):
moduleStatuses = dcgm_structs.c_dcgmModuleGetStatuses_v1()
moduleStatuses.version = dcgm_structs.dcgmModuleGetStatuses_version1
@@ -827,45 +840,18 @@ def dcgmModuleGetStatuses(dcgmHandle):
return moduleStatuses
-def dcgmProfGetSupportedMetricGroups(dcgmHandle, groupId):
- msg = dcgm_structs.c_dcgmProfGetMetricGroups_v2()
- msg.version = dcgm_structs.dcgmProfGetMetricGroups_version1
- msg.groupId = groupId
+@ensure_byte_strings()
+def dcgmProfGetSupportedMetricGroups(dcgmHandle, gpuId):
+ msg = dcgm_structs.c_dcgmProfGetMetricGroups_v3()
+ msg.version = dcgm_structs.dcgmProfGetMetricGroups_version3
+ msg.gpuId = gpuId
fn = dcgmFP("dcgmProfGetSupportedMetricGroups")
ret = fn(dcgmHandle, byref(msg))
dcgm_structs._dcgmCheckReturn(ret)
return msg
-def dcgmProfWatchFields(
- dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge, maxKeepSamples
-):
- msg = dcgm_structs.c_dcgmProfWatchFields_v1()
- msg.version = dcgm_structs.dcgmProfWatchFields_version1
- msg.groupId = groupId
- msg.updateFreq = updateFreq
- msg.maxKeepAge = maxKeepAge
- msg.maxKeepSamples = maxKeepSamples
- msg.numFieldIds = c_uint32(len(fieldIds))
- for i, fieldId in enumerate(fieldIds):
- msg.fieldIds[i] = fieldId
-
- fn = dcgmFP("dcgmProfWatchFields")
- ret = fn(dcgmHandle, byref(msg))
- dcgm_structs._dcgmCheckReturn(ret)
- return msg
-
-
-def dcgmProfUnwatchFields(dcgmHandle, groupId):
- msg = dcgm_structs.c_dcgmProfUnwatchFields_v1()
- msg.version = dcgm_structs.dcgmProfUnwatchFields_version1
- msg.groupId = groupId
- fn = dcgmFP("dcgmProfUnwatchFields")
- ret = fn(dcgmHandle, byref(msg))
- dcgm_structs._dcgmCheckReturn(ret)
- return msg
-
-
+@ensure_byte_strings()
def dcgmProfPause(dcgmHandle):
fn = dcgmFP("dcgmProfPause")
ret = fn(dcgmHandle)
@@ -873,6 +859,7 @@ def dcgmProfPause(dcgmHandle):
return ret
+@ensure_byte_strings()
def dcgmProfResume(dcgmHandle):
fn = dcgmFP("dcgmProfResume")
ret = fn(dcgmHandle)
@@ -880,6 +867,7 @@ def dcgmProfResume(dcgmHandle):
return ret
+@ensure_byte_strings()
def dcgmVersionInfo():
msg = dcgm_structs.c_dcgmVersionInfo_v2()
msg.version = dcgm_structs.dcgmVersionInfo_version2
@@ -889,10 +877,11 @@ def dcgmVersionInfo():
return msg
+@ensure_byte_strings()
def dcgmHostengineIsHealthy(dcgmHandle):
heHealth = dcgm_structs.c_dcgmHostengineHealth_v1()
heHealth.version = dcgm_structs.dcgmHostengineHealth_version1
fn = dcgmFP("dcgmHostengineIsHealthy")
ret = fn(dcgmHandle, byref(heHealth))
dcgm_structs._dcgmCheckReturn(ret)
- return heHealth
+ return heHealth
\ No newline at end of file
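For reference, a minimal sketch (not part of the patch) of how a caller might exercise the v7 run-diag / v8 response path updated above; the handle and group id are assumed to come from existing connection code, and only the dcgmActionValidate call and the struct versions come from the hunks shown here.

# Illustrative sketch only; assumes `handle` is an existing DCGM handle and
# `group_id` names a previously created GPU group.
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent

def run_quick_diag(handle, group_id):
    # dcgmActionValidate builds a c_dcgmRunDiag_v7 request internally and
    # returns a c_dcgmDiagResponse_v8 via helperDiagCheckReturn.
    return dcgm_agent.dcgmActionValidate(handle, group_id, validate=1)

dcgmRunDiagnostic and dcgmActionValidate_v2 return the same c_dcgmDiagResponse_v8 layout, so downstream response handling only needs to be migrated once.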
diff --git a/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py
new file mode 100644
index 000000000..d3355c556
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py
@@ -0,0 +1,369 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import subprocess
+import signal
+import os
+import re
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
+sys.path.insert(0, parent_dir_path)
+
+import model_analyzer.monitor.dcgm.dcgm_fields_collectd as dcgm_fields_collectd
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import threading
+from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
+
+if 'DCGM_TESTING_FRAMEWORK' in os.environ:
+ try:
+ import collectd_tester_api as collectd
+ except:
+ import collectd
+else:
+ import collectd
+
+# Set default values for the hostname and the library path
+g_dcgmLibPath = '/usr/lib'
+g_dcgmHostName = 'localhost'
+
+# Add overriding through the environment instead of hard coded.
+if 'DCGM_HOSTNAME' in os.environ:
+ g_dcgmHostName = os.environ['DCGM_HOSTNAME']
+
+if 'DCGMLIBPATH' in os.environ:
+ g_dcgmLibPath = os.environ['DCGMLIBPATH']
+
+c_ONE_SEC_IN_USEC = 1000000
+
+g_intervalSec = 10 # Default
+
+g_dcgmIgnoreFields = [dcgm_fields.DCGM_FI_DEV_UUID] # Fields not to publish
+
+g_publishFieldIds = [
+ dcgm_fields.DCGM_FI_DEV_UUID, #Needed for plugin instance
+ dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
+ dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
+ dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
+ dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_FB_FREE,
+ dcgm_fields.DCGM_FI_DEV_FB_USED,
+ dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
+ dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
+ dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
+ dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
+ dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
+ dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
+]
+
+g_fieldIntervalMap = None
+g_parseRegEx = None
+g_fieldRegEx = None
+
+# We build up a regex to match field IDs. These can be numeric IDs, or
+# names. We start with field_regex, which matches either form as a string
+# (as well as names that might start with digits, but we do not worry
+# about this over-generation of valid IDs at this point).
+#
+# Basically a field is an integral number or a textual name. A field
+# list is a field, or a list of fields separated by commas and enclosed
+# in parentheses. A field list may be optionally followed by a colon,
+# indicating a possible non-default interval if also followed by a
+# floating point interval value. This is a complete field list.
+# Multiple complete field lists may appear, separated by commas.
+#
+# For example: (1001,tensor_active):5,1002:10
+#
+# This specifies that fields 1001 and tensor_active are to be sampled
+# at a rate of every 5 seconds, and 1002 every ten seconds.
+#
+# For example: (1001,tensor_active):5,1002:
+#
+# This is the same, but field 1002 is to be sampled at the default rate
+# (and the colon is entirely unnecessary, but not illegal).
+
+field_regex = r"[0-9a-zA-Z_]+"
+g_fieldRegEx = re.compile("((" + field_regex + "),?)")
+
+# We now generate a list of field regular expressions, separated by a
+# comma, and enclosed in parentheses, for grouping.
+
+fields_regex = r"\(" + field_regex + "(," + field_regex + ")*" + r"\)"
+
+# This is an optional interval specification, allowing an optional :,
+# followed by an optional floating point dcgm sampling interval. If any
+# are missing, the default collectd sampling interval is used.
+
+interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?"
+
+# Here, we combine a field regex or field list regex with an optional
+# interval regex. Multiple of these may appear in succession.
+
+g_parseRegEx = re.compile("((" + field_regex + "|(" + fields_regex + "))" +
+ interval_regex + ")")
+
+
+class DcgmCollectdPlugin(DcgmReader):
+ ###########################################################################
+ def __init__(self):
+ global c_ONE_SEC_IN_USEC
+
+ collectd.debug(
+ 'Initializing DCGM with interval={}s'.format(g_intervalSec))
+ DcgmReader.__init__(self,
+ fieldIds=g_publishFieldIds,
+ ignoreList=g_dcgmIgnoreFields,
+ fieldGroupName='collectd_plugin',
+ updateFrequency=g_intervalSec * c_ONE_SEC_IN_USEC,
+ fieldIntervalMap=g_fieldIntervalMap)
+
+###########################################################################
+
+ def CustomDataHandler(self, fvs):
+ global c_ONE_SEC_IN_USEC
+
+ value = collectd.Values(type='gauge') # pylint: disable=no-member
+ value.plugin = 'dcgm_collectd'
+
+ for gpuId in list(fvs.keys()):
+ gpuFv = fvs[gpuId]
+
+ uuid = self.m_gpuIdToUUId[gpuId]
+ collectd.debug('CustomDataHandler uuid: ' + '%s' % (uuid) + '\n')
+ value.plugin_instance = '%s' % (uuid)
+
+ typeInstance = str(gpuId)
+
+ for fieldId in list(gpuFv.keys()):
+ # Skip ignore list
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ fieldTag = self.m_fieldIdToInfo[fieldId].tag
+ lastValTime = float("inf")
+
+ # Filter out times too close together (< 1.0 sec) but always
+ # include latest one.
+
+ for val in gpuFv[fieldId][::-1]:
+ # Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId
+ if val.isBlank:
+ continue
+
+ valTimeSec1970 = (val.ts / c_ONE_SEC_IN_USEC
+ ) #Round down to 1-second for now
+ if (lastValTime - valTimeSec1970) < 1.0:
+ collectd.debug(
+ "DCGM sample for field ID %d too soon at %f, last one sampled at %f"
+ % (fieldId, valTimeSec1970, lastValTime))
+ val.isBlank = True # Filter this one out
+ continue
+
+ lastValTime = valTimeSec1970
+
+ i = 0
+
+ for val in gpuFv[fieldId]:
+ # Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId
+ if val.isBlank:
+ continue
+
+ # Round down to 1-second for now
+ valTimeSec1970 = (val.ts / c_ONE_SEC_IN_USEC)
+ valueArray = [
+ val.value,
+ ]
+ value.dispatch(type=fieldTag,
+ type_instance=typeInstance,
+ time=valTimeSec1970,
+ values=valueArray,
+ plugin=value.plugin)
+
+ collectd.debug(
+ " gpuId %d, tag %s, sample %d, value %s, time %s" %
+ (gpuId, fieldTag, i, str(val.value), str(val.ts))) # pylint: disable=no-member
+ i += 1
+
+ ###########################################################################
+ def LogInfo(self, msg):
+ collectd.info(msg) # pylint: disable=no-member
+
+ ###########################################################################
+ def LogError(self, msg):
+ collectd.error(msg) # pylint: disable=no-member
+
+
+###############################################################################
+##### Parse supplied collectd configuration object.
+###############################################################################
+def parse_config(config):
+ global c_ONE_SEC_IN_USEC
+ global g_intervalSec
+ global g_fieldIntervalMap
+ global g_parseRegEx
+ global g_fieldRegEx
+
+ g_fieldIntervalMap = {}
+
+ for node in config.children:
+ if node.key == 'Interval':
+ g_intervalSec = float(node.values[0])
+ elif node.key == 'FieldIds':
+ fieldIds = node.values[0]
+
+ # And we parse out the field ID list with this regex.
+ field_set_list = g_parseRegEx.finditer(fieldIds)
+
+ for field_set in field_set_list:
+ # We get the list of fields...
+ fields = field_set.group(2)
+
+ # ... and the optional interval.
+ interval_str = field_set.group(5)
+
+ # We figure out if the default collectd sampling interval is
+ # to be used, or a different one.
+ if (interval_str == None) or (interval_str == ":"):
+ interval = int(g_intervalSec * c_ONE_SEC_IN_USEC)
+ else:
+ interval = int(float(interval_str[1:]) *
+ c_ONE_SEC_IN_USEC) # strip :
+
+ # We keep a set of fields for each unique interval
+ if interval not in g_fieldIntervalMap.keys():
+ g_fieldIntervalMap[interval] = []
+
+                # Here we parse out either multiple fields sharing an
+ # interval, or a single field.
+ if fields[0:1] == "(": # a true field set
+ fields = fields[1:-1]
+ field_list = g_fieldRegEx.finditer(fields)
+ for field_group in field_list:
+
+ # We map any field names to field numbers, and add
+ # them to the list for the interval
+ field = dcgm_fields_collectd.GetFieldByName(
+ field_group.group(2))
+ g_fieldIntervalMap[interval] += [field]
+ else: # just one field
+ # Map field name to number.
+ field = dcgm_fields_collectd.GetFieldByName(fields)
+ g_fieldIntervalMap[interval] += [field]
+
+
+###############################################################################
+##### Wrap the class methods used as collectd callbacks
+###############################################################################
+def config_dcgm(config=None):
+ """
+ collectd config for dcgm is in the form of a dcgm.conf file, usually
+ installed in /etc/collectd/collectd.conf.d/dcgm.conf.
+
+ An example is:
+
+    LoadPlugin python
+    <Plugin python>
+    ModulePath "/usr/lib64/collectd/dcgm"
+    LogTraces true
+    Interactive false
+    Import "dcgm_collectd_plugin"
+    <Module dcgm_collectd_plugin>
+    Interval 2
+    FieldIds "(1001,tensor_active):5,1002:10,1004:.1,1010:"
+    FieldIds "1007"
+    </Module>
+    </Plugin>
+
+ ModulePath indicates where the plugin and supporting files are installed
+ (generally copied from /usr/local/dcgm/bindings/python3).
+
+ Interval is the default collectd sampling interval in seconds.
+
+    FieldIds may appear several times. A field ID is given either by name or by
+    number. A field ID list is either a single field ID or a list of same,
+ separated by commas (,) and bounded by parenthesis ( ( and ) ). Each field
+ ID list can be followed by an optional colon (:) and a floating point
+ DCGM sampling interval. If no sampling interval is specified the default
+ collectd sampling interval is used (and the colon is redundant but not
+ illegal). Multiple field ID lists can appear on one FieldIds entry,
+ separated by commas (,). FieldIDs are strings and must be enclosed in
+ quotes ("). Multiple FieldIds lines are permitted.
+
+    DCGM will sample the fields at the interval(s) indicated, and collectd will
+    collect the samples asynchronously at the Interval specified. Because this
+    is asynchronous, sometimes one fewer sample than expected will be collected,
+    and at other times one more.
+ """
+
+ # If we throw an exception here, collectd config will terminate loading the
+ # plugin.
+ if config is not None:
+ parse_config(config)
+
+ # Register the read function with the default collectd sampling interval.
+ collectd.register_read(read_dcgm, interval=g_intervalSec) # pylint: disable=no-member
+
+
+###############################################################################
+def init_dcgm():
+ global g_dcgmCollectd
+
+ # restore default SIGCHLD behavior to avoid exceptions with new processes
+ signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+ g_dcgmCollectd = DcgmCollectdPlugin()
+ g_dcgmCollectd.Init()
+
+
+###############################################################################
+def shutdown_dcgm():
+ g_dcgmCollectd.Shutdown()
+
+
+###############################################################################
+def read_dcgm(data=None):
+ g_dcgmCollectd.Process()
+
+
+def register_collectd_callbacks():
+ collectd.register_config(config_dcgm, name="dcgm_collectd_plugin") # pylint: disable=no-member
+ # config_dcgm registers read since it needs to parse the sampling interval.
+ collectd.register_init(init_dcgm) # pylint: disable=no-member
+ collectd.register_shutdown(shutdown_dcgm) # pylint: disable=no-member
+
+
+###############################################################################
+##### Main
+###############################################################################
+register_collectd_callbacks()
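For reference, a small self-contained sketch of what the FieldIds grammar above yields for one example value. It re-declares the plugin's regexes locally so it runs without collectd, and the expected output is inferred from the grammar rather than taken from the plugin itself.

# Hedged sketch: mirrors g_parseRegEx from the plugin to show which pieces
# parse_config sees for each field set. Not executed by the plugin itself.
import re

field_regex = r"[0-9a-zA-Z_]+"
fields_regex = r"\(" + field_regex + "(," + field_regex + ")*" + r"\)"
interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?"
parse_regex = re.compile("((" + field_regex + "|(" + fields_regex + "))" +
                         interval_regex + ")")

for match in parse_regex.finditer("(1001,tensor_active):5,1002:10,1004:.1,1010:"):
    # group(2) is the field or field list; group(5) is the optional ':interval'.
    print(match.group(2), match.group(5))

# Roughly expected output:
#   (1001,tensor_active) :5
#   1002 :10
#   1004 :.1
#   1010 :
# parse_config then converts ':5' to 5 s (in microseconds) and a bare ':' or a
# missing interval to the default collectd Interval.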
diff --git a/model_analyzer/monitor/dcgm/dcgm_errors.py b/model_analyzer/monitor/dcgm/dcgm_errors.py
new file mode 100644
index 000000000..e52f3b114
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_errors.py
@@ -0,0 +1,395 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import ctypes
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+
+DCGM_FR_OK = 0 # No error
+DCGM_FR_UNKNOWN = 1 # Unknown error code
+DCGM_FR_UNRECOGNIZED = 2 # Unrecognized error code
+DCGM_FR_PCI_REPLAY_RATE = 3 # Unacceptable rate of PCI errors
+DCGM_FR_VOLATILE_DBE_DETECTED = 4 # Uncorrectable volatile double bit error
+DCGM_FR_VOLATILE_SBE_DETECTED = 5 # Unacceptable rate of volatile single bit errors
+DCGM_FR_PENDING_PAGE_RETIREMENTS = 6 # Pending page retirements detected
+DCGM_FR_RETIRED_PAGES_LIMIT = 7 # Unacceptable total page retirements detected
+DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8 # Unacceptable total page retirements due to uncorrectable errors
+DCGM_FR_CORRUPT_INFOROM = 9 # Corrupt inforom found
+DCGM_FR_CLOCK_THROTTLE_THERMAL = 10 # Clocks being throttled due to overheating
+DCGM_FR_POWER_UNREADABLE = 11 # Cannot get a reading for power from NVML
+DCGM_FR_CLOCK_THROTTLE_POWER = 12 # Clock being throttled due to power restrictions
+DCGM_FR_NVLINK_ERROR_THRESHOLD = 13 # Unacceptable rate of NVLink errors
+DCGM_FR_NVLINK_DOWN = 14 # NVLink is down
+DCGM_FR_NVSWITCH_FATAL_ERROR = 15 # Fatal errors on the NVSwitch
+DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16 # Non-fatal errors on the NVSwitch
+DCGM_FR_NVSWITCH_DOWN = 17 # NVSwitch is down
+DCGM_FR_NO_ACCESS_TO_FILE = 18 # Cannot access a file
+DCGM_FR_NVML_API = 19 # Error occurred on an NVML API
+DCGM_FR_DEVICE_COUNT_MISMATCH = 20 # Disagreement in GPU count between /dev and NVML
+DCGM_FR_BAD_PARAMETER = 21 # Bad parameter passed to API
+DCGM_FR_CANNOT_OPEN_LIB = 22 # Cannot open a library that must be accessed
+DCGM_FR_DENYLISTED_DRIVER = 23 # A driver on the denylist (nouveau) is active
+DCGM_FR_NVML_LIB_BAD = 24 # The NVML library is missing expected functions
+DCGM_FR_GRAPHICS_PROCESSES = 25 # Graphics processes are active on this GPU
+DCGM_FR_HOSTENGINE_CONN = 26 # Unstable connection to nv-hostengine (daemonized DCGM)
+DCGM_FR_FIELD_QUERY = 27 # Error querying a field from DCGM
+DCGM_FR_BAD_CUDA_ENV = 28 # The environment has variables that hurt CUDA
+DCGM_FR_PERSISTENCE_MODE = 29 # Persistence mode is disabled
+DCGM_FR_LOW_BANDWIDTH = 30 # The bandwidth is unacceptably low
+DCGM_FR_HIGH_LATENCY = 31 # Latency is too high
+DCGM_FR_CANNOT_GET_FIELD_TAG = 32 # Cannot find a tag for a field
+DCGM_FR_FIELD_VIOLATION = 33 # The value for the specified error field is above 0
+DCGM_FR_FIELD_THRESHOLD = 34 # The value for the specified field is above the threshold
+DCGM_FR_FIELD_VIOLATION_DBL = 35 # The value for the specified error field is above 0
+DCGM_FR_FIELD_THRESHOLD_DBL = 36 # The value for the specified field is above the threshold
+DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37 # Field type cannot be supported
+DCGM_FR_FIELD_THRESHOLD_TS = 38 # The value for the specified field is above the threshold
+DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39 # The value for the specified field is above the threshold
+DCGM_FR_THERMAL_VIOLATIONS = 40 # Thermal violations detected
+DCGM_FR_THERMAL_VIOLATIONS_TS = 41 # Thermal violations detected with a timestamp
+DCGM_FR_TEMP_VIOLATION = 42 # Temperature is too high
+DCGM_FR_THROTTLING_VIOLATION = 43 # Non-benign clock throttling is occurring
+DCGM_FR_INTERNAL = 44 # An internal error was detected
+DCGM_FR_PCIE_GENERATION = 45 # PCIe generation is too low
+DCGM_FR_PCIE_WIDTH = 46 # PCIe width is too low
+DCGM_FR_ABORTED = 47 # Test was aborted by a user signal
+DCGM_FR_TEST_DISABLED = 48 # This test is disabled for this GPU
+DCGM_FR_CANNOT_GET_STAT = 49 # Cannot get telemetry for a needed value
+DCGM_FR_STRESS_LEVEL = 50 # Stress level is too low (bad performance)
+DCGM_FR_CUDA_API = 51 # Error calling the specified CUDA API
+DCGM_FR_FAULTY_MEMORY = 52 # Faulty memory detected on this GPU
+DCGM_FR_CANNOT_SET_WATCHES = 53 # Unable to set field watches in DCGM
+DCGM_FR_CUDA_UNBOUND = 54 # CUDA context is no longer bound
+DCGM_FR_ECC_DISABLED = 55 # ECC memory is disabled right now
+DCGM_FR_MEMORY_ALLOC = 56 # Cannot allocate memory
+DCGM_FR_CUDA_DBE = 57  # CUDA detected unrecoverable double-bit error
+DCGM_FR_MEMORY_MISMATCH = 58 # Memory error detected
+DCGM_FR_CUDA_DEVICE = 59 # No CUDA device discoverable for existing GPU
+DCGM_FR_ECC_UNSUPPORTED = 60 # ECC memory is unsupported by this SKU
+DCGM_FR_ECC_PENDING = 61 # ECC memory is in a pending state
+DCGM_FR_MEMORY_BANDWIDTH = 62 # Memory bandwidth is too low
+DCGM_FR_TARGET_POWER = 63 # Cannot hit the target power draw
+DCGM_FR_API_FAIL = 64 # The specified API call failed
+DCGM_FR_API_FAIL_GPU = 65 # The specified API call failed for the specified GPU
+DCGM_FR_CUDA_CONTEXT = 66 # Cannot create a CUDA context on this GPU
+DCGM_FR_DCGM_API = 67 # DCGM API failure
+DCGM_FR_CONCURRENT_GPUS = 68 # Need multiple GPUs to run this test
+DCGM_FR_TOO_MANY_ERRORS = 69 # More errors than fit in the return struct
+DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70 # More than 100 CRC errors are happening per second
+DCGM_FR_NVLINK_ERROR_CRITICAL = 71 # NVLink error for a field that should always be 0
+DCGM_FR_ENFORCED_POWER_LIMIT = 72 # The enforced power limit is too low to hit the target
+DCGM_FR_MEMORY_ALLOC_HOST = 73 # Cannot allocate memory on the host
+DCGM_FR_GPU_OP_MODE = 74 # Bad GPU operating mode for running plugin
+DCGM_FR_NO_MEMORY_CLOCKS = 75 # No memory clocks with the needed MHz were found
+DCGM_FR_NO_GRAPHICS_CLOCKS = 76 # No graphics clocks with the needed MHz were found
+DCGM_FR_HAD_TO_RESTORE_STATE = 77 # Note that we had to restore a GPU's state
+DCGM_FR_L1TAG_UNSUPPORTED = 78 # L1TAG test is unsupported by this SKU
+DCGM_FR_L1TAG_MISCOMPARE = 79 # L1TAG test failed on a miscompare
+DCGM_FR_ROW_REMAP_FAILURE = 80 # Row remapping failed (Ampere or newer GPUs)
+DCGM_FR_UNCONTAINED_ERROR = 81 # Uncontained error - XID 95
+DCGM_FR_EMPTY_GPU_LIST = 82 # No GPU information given to plugin
+DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83 # Pending page retirements due to a DBE
+DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84 # Uncorrectable row remapping
+DCGM_FR_PENDING_ROW_REMAP = 85 # Row remapping is pending
+DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86 # P2P copy test detected an error writing to this GPU
+DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87 # P2P copy test detected an error writing from this GPU
+DCGM_FR_NVSWITCH_NVLINK_DOWN = 88 # An NVLink is down
+DCGM_FR_EUD_BINARY_PERMISSIONS = 89 # EUD binary permissions are incorrect
+DCGM_FR_EUD_NON_ROOT_USER = 90 # EUD plugin is not running as root
+DCGM_FR_EUD_SPAWN_FAILURE = 91 # EUD plugin failed to spawn the EUD binary
+DCGM_FR_EUD_TIMEOUT = 92 # EUD plugin timed out
+DCGM_FR_EUD_ZOMBIE = 93 # EUD process remains running after the plugin considers it finished
+DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94 # EUD process exited with a non-zero exit code
+DCGM_FR_EUD_TEST_FAILED = 95 # EUD test failed
+DCGM_FR_FILE_CREATE_PERMISSIONS = 96 # We cannot write a file in this directory.
+DCGM_FR_PAUSE_RESUME_FAILED = 97 # Pause/Resume failed
+DCGM_FR_ERROR_SENTINEL = 98 # MUST BE THE LAST ERROR CODE
+
+# Standard message for running a field diagnostic
+TRIAGE_RUN_FIELD_DIAG_MSG = "Run a field diagnostic on the GPU."
+DEBUG_COOLING_MSG = "Verify that the cooling on this machine is functional, including external, thermal "\
+ "material interface, fans, and any other components."
+BUG_REPORT_MSG = "Please capture an nvidia-bug-report and send it to NVIDIA."
+
+# Define DCGM error priorities
+DCGM_ERROR_MONITOR = 0 # Can perform workload, but needs to be monitored.
+DCGM_ERROR_ISOLATE = 1 # Cannot perform workload. GPU should be isolated.
+DCGM_ERROR_UNKNOWN = 2 # This error code is not recognized
+
+# Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format
+# where <msg> is the actual message.
+
+DCGM_FR_OK_MSG = "The operation completed successfully."
+DCGM_FR_UNKNOWN_MSG = "Unknown error."
+DCGM_FR_UNRECOGNIZED_MSG = "Unrecognized error code."
+# replay limit, gpu id, replay errors detected
+DCGM_FR_PCI_REPLAY_RATE_MSG = "Detected more than %u PCIe replays per minute for GPU %u : %d"
+# dbes detected, gpu id
+DCGM_FR_VOLATILE_DBE_DETECTED_MSG = "Detected %d volatile double-bit ECC error(s) in GPU %u."
+# sbe limit, gpu id, sbes detected
+DCGM_FR_VOLATILE_SBE_DETECTED_MSG = "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld"
+# gpu id
+DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG = "A pending retired page has been detected in GPU %u."
+# retired pages detected, gpu id
+DCGM_FR_RETIRED_PAGES_LIMIT_MSG = "%u or more retired pages have been detected in GPU %u. "
+# retired pages due to dbes detected, gpu id
+DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG = "An excess of %u retired pages due to DBEs have been detected and" \
+ " more than one page has been retired due to DBEs in the past" \
+ " week in GPU %u."
+# gpu id
+DCGM_FR_CORRUPT_INFOROM_MSG = "A corrupt InfoROM has been detected in GPU %u."
+# gpu id
+DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG = "Detected clock throttling due to thermal violation in GPU %u."
+# gpu id
+DCGM_FR_POWER_UNREADABLE_MSG = "Cannot reliably read the power usage for GPU %u."
+# gpu id
+DCGM_FR_CLOCK_THROTTLE_POWER_MSG = "Detected clock throttling due to power violation in GPU %u."
+# nvlink errors detected, nvlink id, error threshold
+DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG = "Detected %ld NvLink errors on NvLink %u which exceeds threshold of %u"
+# gpu id, nvlink id
+DCGM_FR_NVLINK_DOWN_MSG = "GPU %u's NvLink link %d is currently down"
+# nvswitch id, nvlink id
+DCGM_FR_NVSWITCH_FATAL_ERROR_MSG = "Detected fatal errors on NvSwitch %u link %u"
+# nvswitch id, nvlink id
+DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG = "Detected nonfatal errors on NvSwitch %u link %u"
+# nvswitch id, nvlink port
+DCGM_FR_NVSWITCH_DOWN_MSG = "NvSwitch physical ID %u's NvLink port %d is currently down."
+# file path, error detail
+DCGM_FR_NO_ACCESS_TO_FILE_MSG = "File %s could not be accessed directly: %s"
+# purpose for communicating with NVML, NVML error as string, NVML error
+DCGM_FR_NVML_API_MSG = "Error calling NVML API %s: %s"
+DCGM_FR_DEVICE_COUNT_MISMATCH_MSG = "The number of devices NVML returns is different than the number "\
+ "of devices in /dev."
+# function name
+DCGM_FR_BAD_PARAMETER_MSG = "Bad parameter to function %s cannot be processed"
+# library name, error returned from dlopen
+DCGM_FR_CANNOT_OPEN_LIB_MSG = "Cannot open library %s: '%s'"
+# the name of the driver on the denylist
+DCGM_FR_DENYLISTED_DRIVER_MSG = "Found driver on the denylist: %s"
+# the name of the function that wasn't found
+DCGM_FR_NVML_LIB_BAD_MSG = "Cannot get pointer to %s from libnvidia-ml.so"
+DCGM_FR_GRAPHICS_PROCESSES_MSG = "NVVS has detected graphics processes running on at least one "\
+ "GPU. This may cause some tests to fail."
+# error message from the API call
+DCGM_FR_HOSTENGINE_CONN_MSG = "Could not connect to the host engine: '%s'"
+# field name, gpu id
+DCGM_FR_FIELD_QUERY_MSG = "Could not query field %s for GPU %u"
+# environment variable name
+DCGM_FR_BAD_CUDA_ENV_MSG = "Found CUDA performance-limiting environment variable '%s'."
+# gpu id
+DCGM_FR_PERSISTENCE_MODE_MSG = "Persistence mode for GPU %u is currently disabled. The DCGM "\
+                               "diagnostic requires persistence mode to be enabled."
+DCGM_FR_LOW_BANDWIDTH_MSG = "Bandwidth of GPU %u in direction %s of %.2f did not exceed "\
+ "minimum required bandwidth of %.2f."
+DCGM_FR_HIGH_LATENCY_MSG = "Latency type %s of GPU %u value %.2f exceeded maximum allowed "\
+ "latency of %.2f."
+DCGM_FR_CANNOT_GET_FIELD_TAG_MSG = "Unable to get field information for field id %hu"
+DCGM_FR_FIELD_VIOLATION_MSG = "Detected %ld %s for GPU %u"
+DCGM_FR_FIELD_THRESHOLD_MSG = "Detected %ld %s for GPU %u which is above the threshold %ld"
+DCGM_FR_FIELD_VIOLATION_DBL_MSG = "Detected %.1f %s for GPU %u"
+DCGM_FR_FIELD_THRESHOLD_DBL_MSG = "Detected %.1f %s for GPU %u which is above the threshold %.1f"
+DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG = "Field %s is not supported by this API because it is neither an "\
+ "int64 nor a double type."
+DCGM_FR_FIELD_THRESHOLD_TS_MSG = "%s met or exceeded the threshold of %lu per second: %lu at "\
+ "%.1f seconds into the test."
+DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG = "%s met or exceeded the threshold of %.1f per second: %.1f at "\
+ "%.1f seconds into the test."
+DCGM_FR_THERMAL_VIOLATIONS_MSG = "There were thermal violations totaling %lu seconds for GPU %u"
+DCGM_FR_THERMAL_VIOLATIONS_TS_MSG = "Thermal violations totaling %lu samples started at %.1f seconds "\
+ "into the test for GPU %u"
+DCGM_FR_TEMP_VIOLATION_MSG = "Temperature %lld of GPU %u exceeded user-specified maximum "\
+ "allowed temperature %lld"
+DCGM_FR_THROTTLING_VIOLATION_MSG = "Clocks are being throttled for GPU %u because of clock "\
+ "throttling starting %.1f seconds into the test. %s"
+DCGM_FR_INTERNAL_MSG = "There was an internal error during the test: '%s'"
+DCGM_FR_PCIE_GENERATION_MSG = "GPU %u is running at PCI link generation %d, which is below "\
+ "the minimum allowed link generation of %d (parameter '%s')"
+DCGM_FR_PCIE_WIDTH_MSG = "GPU %u is running at PCI link width %dX, which is below the "\
+                         "minimum allowed link width of %d (parameter '%s')"
+DCGM_FR_ABORTED_MSG = "Test was aborted early due to user signal"
+DCGM_FR_TEST_DISABLED_MSG = "The %s test is skipped for this GPU."
+DCGM_FR_CANNOT_GET_STAT_MSG = "Unable to generate / collect stat %s for GPU %u"
+DCGM_FR_STRESS_LEVEL_MSG = "Max stress level of %.1f did not reach desired stress level of "\
+ "%.1f for GPU %u"
+DCGM_FR_CUDA_API_MSG = "Error using CUDA API %s"
+DCGM_FR_FAULTY_MEMORY_MSG = "Found %d faulty memory elements on GPU %u"
+DCGM_FR_CANNOT_SET_WATCHES_MSG = "Unable to add field watches to DCGM: %s"
+DCGM_FR_CUDA_UNBOUND_MSG = "Cuda GPU %d is no longer bound to a CUDA context...Aborting"
+DCGM_FR_ECC_DISABLED_MSG = "Skipping test %s because ECC is not enabled on GPU %u"
+DCGM_FR_MEMORY_ALLOC_MSG = "Couldn't allocate at least %.1f%% of GPU memory on GPU %u"
+DCGM_FR_CUDA_DBE_MSG = "CUDA APIs have indicated that a double-bit ECC error has "\
+                       "occurred on GPU %u."
+DCGM_FR_MEMORY_MISMATCH_MSG = "A memory mismatch was detected on GPU %u, but no error was "\
+ "reported by CUDA or NVML."
+DCGM_FR_CUDA_DEVICE_MSG = "Unable to find a corresponding CUDA device for GPU %u: '%s'"
+DCGM_FR_ECC_UNSUPPORTED_MSG = "This card does not support ECC Memory. Skipping test."
+DCGM_FR_ECC_PENDING_MSG = "ECC memory for GPU %u is in a pending state."
+DCGM_FR_MEMORY_BANDWIDTH_MSG = "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "\
+ "to meet %.2f GB/s for test %d"
+DCGM_FR_TARGET_POWER_MSG = "Max power of %.1f did not reach desired power minimum %s of "\
+ "%.1f for GPU %u"
+DCGM_FR_API_FAIL_MSG = "API call %s failed: '%s'"
+DCGM_FR_API_FAIL_GPU_MSG = "API call %s failed for GPU %u: '%s'"
+DCGM_FR_CUDA_CONTEXT_MSG = "GPU %u failed to create a CUDA context: %s"
+DCGM_FR_DCGM_API_MSG = "Error using DCGM API %s"
+DCGM_FR_CONCURRENT_GPUS_MSG = "Unable to run concurrent pair bandwidth test without 2 or more "\
+ "gpus. Skipping"
+DCGM_FR_TOO_MANY_ERRORS_MSG = "This API can only return up to four errors per system. "\
+ "Additional errors were found for this system that couldn't be "\
+ "communicated."
+DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG = "%.1f %s NvLink errors found occurring per second on GPU %u, "\
+ "exceeding the limit of 100 per second."
+DCGM_FR_NVLINK_ERROR_CRITICAL_MSG = "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)"
+DCGM_FR_ENFORCED_POWER_LIMIT_MSG = "Enforced power limit on GPU %u set to %.1f, which is too low to "\
+ "attempt to achieve target power %.1f"
+DCGM_FR_MEMORY_ALLOC_HOST_MSG = "Cannot allocate %zu bytes on the host"
+DCGM_FR_GPU_OP_MODE_MSG = "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP."
+DCGM_FR_NO_MEMORY_CLOCKS_MSG = "No memory clocks <= %u MHZ were found in %u supported memory clocks."
+DCGM_FR_NO_GRAPHICS_CLOCKS_MSG = "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ."
+DCGM_FR_HAD_TO_RESTORE_STATE_MSG = "Had to restore GPU state on NVML GPU(s): %s"
+DCGM_FR_L1TAG_UNSUPPORTED_MSG = "This card does not support the L1 cache test. Skipping test."
+DCGM_FR_L1TAG_MISCOMPARE_MSG = "The L1 cache test failed with a miscompare."
+DCGM_FR_ROW_REMAP_FAILURE_MSG = "Row remapping failed."
+DCGM_FR_UNCONTAINED_ERROR_MSG = "GPU had an uncontained error (XID 95)"
+DCGM_FR_EMPTY_GPU_LIST_MSG = "No valid GPUs passed to plugin"
+DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG = "Pending page retirements together with a DBE were detected on GPU %u."
+DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG = "GPU %u has uncorrectable row remappings"
+DCGM_FR_PENDING_ROW_REMAP_MSG = "GPU %u has pending row remappings"
+DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG = "GPU %u was unsuccessfully written to in a peer-to-peer test: %s"
+DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG = "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s"
+DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG = "NVSwitch %u's NvLink %u is down."
+DCGM_FR_FILE_CREATE_PERMISSIONS_MSG = "The DCGM Diagnostic does not have permissions to create a file in directory '%s'"
+
+# Suggestions for next steps for the corresponding error message
+DCGM_FR_OK_NEXT = "N/A"
+DCGM_FR_UNKNOWN_NEXT = ""
+DCGM_FR_UNRECOGNIZED_NEXT = ""
+DCGM_FR_PCI_REPLAY_RATE_NEXT = "Reconnect PCIe card. Run system side PCIE diagnostic utilities "\
+ "to verify hops off the GPU board. If issue is on the board, run "\
+ "the field diagnostic."
+DCGM_FR_VOLATILE_DBE_DETECTED_NEXT = "Drain the GPU and reset it or reboot the node."
+DCGM_FR_VOLATILE_SBE_DETECTED_NEXT = "Monitor - this GPU can still perform workload."
+DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT = "If volatile double bit errors exist, drain the GPU and reset it "\
+ "or reboot the node. Otherwise, monitor - GPU can still perform "\
+ "workload."
+DCGM_FR_RETIRED_PAGES_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_CORRUPT_INFOROM_NEXT = "Flash the InfoROM to clear this corruption."
+DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT = DEBUG_COOLING_MSG
+DCGM_FR_POWER_UNREADABLE_NEXT = ""
+DCGM_FR_CLOCK_THROTTLE_POWER_NEXT = "Monitor the power conditions. This GPU can still perform workload."
+DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_NVLINK_DOWN_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT = "Monitor the NVSwitch. It can still perform workload."
+DCGM_FR_NVSWITCH_DOWN_NEXT = ""
+DCGM_FR_NO_ACCESS_TO_FILE_NEXT = "Check relevant permissions, access, and existence of the file."
+DCGM_FR_NVML_API_NEXT = "Check the error condition and ensure that appropriate libraries "\
+ "are present and accessible."
+DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT = "Check for the presence of cgroups, operating system blocks, and "\
+ "or unsupported / older cards"
+DCGM_FR_BAD_PARAMETER_NEXT = ""
+DCGM_FR_CANNOT_OPEN_LIB_NEXT = "Check for the existence of the library and set LD_LIBRARY_PATH "\
+ "if needed."
+DCGM_FR_DENYLISTED_DRIVER_NEXT = "Please load the appropriate driver."
+DCGM_FR_NVML_LIB_BAD_NEXT = "Make sure that the required version of libnvidia-ml.so "\
+ "is present and accessible on the system."
+DCGM_FR_GRAPHICS_PROCESSES_NEXT = "Stop the graphics processes or run this diagnostic on a server "\
+ "that is not being used for display purposes."
+DCGM_FR_HOSTENGINE_CONN_NEXT = "If hostengine is run separately, please ensure that it is up "\
+ "and responsive."
+DCGM_FR_FIELD_QUERY_NEXT = ""
+DCGM_FR_BAD_CUDA_ENV_NEXT = "Please unset this environment variable to address test failures."
+DCGM_FR_PERSISTENCE_MODE_NEXT = "Enable persistence mode by running \"nvidia-smi -i -pm "\
+ "1 \" as root."
+DCGM_FR_LOW_BANDWIDTH_NEXT = "Verify that your minimum bandwidth setting is appropriate for "\
+ "all topological consequences."
+DCGM_FR_HIGH_LATENCY_NEXT = ""
+DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT = ""
+DCGM_FR_FIELD_VIOLATION_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_NEXT = ""
+DCGM_FR_FIELD_VIOLATION_DBL_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_DBL_NEXT = ""
+DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_TS_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT = ""
+DCGM_FR_THERMAL_VIOLATIONS_NEXT = DEBUG_COOLING_MSG
+DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT = DEBUG_COOLING_MSG
+DCGM_FR_TEMP_VIOLATION_NEXT = "Verify that the user-specified temperature maximum is set "\
+ "correctly. If it is, %s" % DEBUG_COOLING_MSG
+DCGM_FR_THROTTLING_VIOLATION_NEXT = ""
+DCGM_FR_INTERNAL_NEXT = ""
+DCGM_FR_PCIE_GENERATION_NEXT = ""
+DCGM_FR_PCIE_WIDTH_NEXT = ""
+DCGM_FR_ABORTED_NEXT = ""
+DCGM_FR_TEST_DISABLED_NEXT = ""
+DCGM_FR_CANNOT_GET_STAT_NEXT = "If running a standalone nv-hostengine, verify that it is up "\
+ "and responsive."
+DCGM_FR_STRESS_LEVEL_NEXT = ""
+DCGM_FR_CUDA_API_NEXT = ""
+DCGM_FR_FAULTY_MEMORY_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_CANNOT_SET_WATCHES_NEXT = ""
+DCGM_FR_CUDA_UNBOUND_NEXT = ""
+DCGM_FR_ECC_DISABLED_NEXT = "Enable ECC memory by running \"nvidia-smi -i -e 1\" "\
+ "to enable. This may require a GPU reset or reboot to take effect."
+DCGM_FR_MEMORY_ALLOC_NEXT = ""
+DCGM_FR_CUDA_DBE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_MEMORY_MISMATCH_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_CUDA_DEVICE_NEXT = ""
+DCGM_FR_ECC_UNSUPPORTED_NEXT = ""
+DCGM_FR_ECC_PENDING_NEXT = "Please reboot to activate it."
+DCGM_FR_MEMORY_BANDWIDTH_NEXT = ""
+DCGM_FR_TARGET_POWER_NEXT = ""
+DCGM_FR_API_FAIL_NEXT = ""
+DCGM_FR_API_FAIL_GPU_NEXT = ""
+DCGM_FR_CUDA_CONTEXT_NEXT = "Please make sure the correct driver version is installed and "\
+ "verify that no conflicting libraries are present."
+DCGM_FR_DCGM_API_NEXT = ""
+DCGM_FR_CONCURRENT_GPUS_NEXT = ""
+DCGM_FR_TOO_MANY_ERRORS_NEXT = ""
+DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_ENFORCED_POWER_LIMIT_NEXT = "If this enforced power limit is necessary, then this test "\
+ "cannot be run. If it is unnecessary, then raise the enforced "\
+ "power limit setting to be able to run this test."
+DCGM_FR_MEMORY_ALLOC_HOST_NEXT = "Manually kill processes or restart your machine."
+DCGM_FR_GPU_OP_MODE_NEXT = "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "\
+ ""
+DCGM_FR_NO_MEMORY_CLOCKS_NEXT = ""
+DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT = ""
+DCGM_FR_HAD_TO_RESTORE_STATE_NEXT = ""
+DCGM_FR_L1TAG_UNSUPPORTED_NEXT = ""
+DCGM_FR_L1TAG_MISCOMPARE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_ROW_REMAP_FAILURE_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+DCGM_FR_UNCONTAINED_ERROR_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+DCGM_FR_EMPTY_GPU_LIST_NEXT = ""
+DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT = "Drain the GPU and reset it or reboot the node to resolve this issue."
+DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT = ""
+DCGM_FR_PENDING_ROW_REMAP_NEXT = ""
+DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT = BUG_REPORT_MSG
+DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT = BUG_REPORT_MSG
+DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT = "Please check fabric manager and initialization logs to figure out why the link is down. You may also need to run a field diagnostic."
+DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT = "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \
+ "diagnostic or change permissions in the current directory to allow the user to write files there."
+
+
+def dcgmErrorGetPriorityByCode(code):
+ fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetPriorityByCode")
+ ret = fn(code)
+ return ret
+
+
+def dcgmErrorGetFormatMsgByCode(code):
+ fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetFormatMsgByCode")
+ fn.restype = ctypes.c_char_p
+ ret = fn(code)
+ return ret.decode('utf-8') if isinstance(ret, bytes) else ret
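For reference, a short hedged sketch of how the two lookup helpers defined above might be used. It assumes libdcgm is installed so dcgm_structs can resolve the underlying C entry points; the wrapper function here is illustrative rather than part of the module.

# Illustrative sketch only. Requires a working libdcgm installation.
import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors

_PRIORITY_LABELS = {
    dcgm_errors.DCGM_ERROR_MONITOR: "monitor",
    dcgm_errors.DCGM_ERROR_ISOLATE: "isolate",
    dcgm_errors.DCGM_ERROR_UNKNOWN: "unknown",
}

def describe_dcgm_error(code):
    # Both helpers resolve their C counterparts through dcgm_structs' function table.
    template = dcgm_errors.dcgmErrorGetFormatMsgByCode(code)
    priority = dcgm_errors.dcgmErrorGetPriorityByCode(code)
    return "%s (priority: %s)" % (template, _PRIORITY_LABELS.get(priority, "unknown"))

print(describe_dcgm_error(dcgm_errors.DCGM_FR_VOLATILE_DBE_DETECTED))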
diff --git a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
index d29a5c412..ceb9f7e0e 100755
--- a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
+++ b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
@@ -1,6 +1,4 @@
-#!/usr/bin/env python3
-
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,29 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import time
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import model_analyzer.monitor.dcgm.dcgm_fields_internal as dcgm_fields_internal
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import ctypes
+import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import json
+'''
+Helper class that makes a python-friendly field value from one returned from the python bindings
+'''
-import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
-import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
-import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
-import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue
+class DcgmFieldValue():
+ '''
+ Constructor
-class DcgmFieldValue:
- """
- Helper class that makes a python-friendly field value from one returned
- from the python bindings
- """
+ rawValue is the latest dcgm_structs.c_dcgmFieldValue_v? structure of a field value returned from the raw APIs
+ '''
def __init__(self, rawValue):
- """
- rawValue : dcgm_structs.c_dcgmFieldValue_v?
- is the latest structure of a field value returned from the raw APIs
- """
- # Make sure the class passed in is an expected type
+ #Make sure the class passed in is an expected type
if not type(rawValue) == dcgm_structs.c_dcgmFieldValue_v1:
- raise Exception(f"Unexpected rawValue type {str(type(rawValue))}")
+ raise Exception("Unexpected rawValue type %s" % str(type(rawValue)))
self.ts = rawValue.ts
self.fieldId = rawValue.fieldId
@@ -51,10 +51,7 @@ def __init__(self, rawValue):
if self.fieldType == dcgm_fields.DCGM_FT_DOUBLE:
self.value = float(rawValue.value.dbl)
self.isBlank = dcgmvalue.DCGM_FP64_IS_BLANK(self.value)
- elif (
- self.fieldType == dcgm_fields.DCGM_FT_INT64
- or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP
- ):
+ elif self.fieldType == dcgm_fields.DCGM_FT_INT64 or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP:
self.value = int(rawValue.value.i64)
self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(self.value)
elif self.fieldType == dcgm_fields.DCGM_FT_STRING:
@@ -63,31 +60,33 @@ def __init__(self, rawValue):
elif self.fieldType == dcgm_fields.DCGM_FT_BINARY:
if self.fieldId == dcgm_fields.DCGM_FI_DEV_ACCOUNTING_DATA:
accStats = dcgm_structs.c_dcgmDevicePidAccountingStats_v1()
- ctypes.memmove(
- ctypes.addressof(accStats),
- rawValue.value.blob,
- accStats.FieldsSizeof(),
- )
- if self.fieldId == dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS:
- accStats = dcgm_structs.c_dcgmDeviceVgpuProcessUtilInfo_v1()
- ctypes.memmove(
- ctypes.addressof(accStats),
- rawValue.value.blob,
- accStats.FieldsSizeof(),
- )
+ ctypes.memmove(ctypes.addressof(accStats), rawValue.value.blob,
+ accStats.FieldsSizeof())
+ if self.fieldId in [
+ dcgm_fields_internal.DCGM_FI_DEV_COMPUTE_PIDS,
+ dcgm_fields_internal.DCGM_FI_DEV_GRAPHICS_PIDS
+ ]:
+ processStats = dcgm_structs.c_dcgmRunningProcess_t()
+ ctypes.memmove(ctypes.addressof(processStats),
+ rawValue.value.blob, processStats.FieldsSizeof())
+ self.value = processStats
+ self.fieldType = dcgm_fields.DCGM_FT_BINARY
+ # This should always be false
+ self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(processStats.pid)
elif self.fieldId == dcgm_fields.DCGM_FI_SYNC_BOOST:
- # Not exposed publicly for now
+ #Not exposed publicly for now
self.value = None
else:
- raise Exception("Blobs not handled yet for fieldId %d" % self.fieldId)
+ raise Exception("Blobs not handled yet for fieldId %d" %
+ self.fieldId)
else:
raise Exception("Unhandled fieldType: %s" % self.fieldType)
class DcgmFieldValueTimeSeries:
+
def __init__(self):
- # Values in timestamp order
- self.values = []
+ self.values = [] #Values in timestamp order
def __len__(self):
return len(self.values)
@@ -100,7 +99,7 @@ def InsertValue(self, value):
self.values.append(value)
return
- # Otherwise, we need to insert the value in the correct place.
+ #Otherwise, we need to insert the value in the correct place. Find the place
for i, existingValue in enumerate(self.values):
if value.ts < existingValue.ts:
self.values.insert(i, value)
@@ -110,75 +109,75 @@ def InsertValue(self, value):
class FieldValueEncoder(json.JSONEncoder):
- # Pylint does not link overloading the default method, so the comment below
- # is WAR for the linting problem
+    # Pylint does not like overloading the default method, so the comment below is WAR for the linting problem
def default(self, obj): # pylint: disable=E0202
nested_json = []
+ i = 0
for key in obj:
if isinstance(key, DcgmFieldValue):
- if key.isBlank:
+ if (key.isBlank):
continue
- nested_json.append(
- {"Timestamp": key.ts, "FieldId": key.fieldId, "Value": key.value}
- )
+ nested_json.append({
+ 'Timestamp': key.ts,
+ 'FieldId': key.fieldId,
+ 'Value': key.value
+ })
else:
return json.JSONEncoder.default(
- self, obj
- ) # Let default encoder throw exception
+ self, obj) # Let default encoder throw exception
return nested_json
-def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, userData):
+def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues,
+ userData):
+
userData = ctypes.cast(userData, ctypes.py_object).value
userData._ProcessValues(gpuId, values[0:numValues])
return 0
helper_dcgm_field_values_since_callback = dcgm_agent.dcgmFieldValueEnumeration_f(
- py_helper_dcgm_field_values_since_callback
-)
+ py_helper_dcgm_field_values_since_callback)
-def py_helper_dcgm_field_values_since_callback_v2(
- entityGroupId, entityId, values, numValues, userData
-):
+def py_helper_dcgm_field_values_since_callback_v2(entityGroupId, entityId,
+ values, numValues, userData):
userData = ctypes.cast(userData, ctypes.py_object).value
- userData._ProcessValues(entityGroupId, entityId, values[0:numValues])
+ userData._ProcessValuesV2(entityGroupId, entityId, values[0:numValues])
return 0
-helper_dcgm_field_values_since_callback_v2 = (
- dcgm_agent.dcgmFieldValueEntityEnumeration_f(
- py_helper_dcgm_field_values_since_callback_v2
- )
-)
+helper_dcgm_field_values_since_callback_v2 = dcgm_agent.dcgmFieldValueEntityEnumeration_f(
+ py_helper_dcgm_field_values_since_callback_v2)
+'''
+Helper class for handling field value update callbacks and storing them in a .values member variable
+'''
class DcgmFieldValueCollection:
- """
- Helper class for handling field value update callbacks and storing them
- in a .values member variable
- """
def __init__(self, handle, groupId):
- self.values = {}
- # 2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries)
+ self.values = {
+ } #2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries)
+ self.entityValues = {
+ } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
self._handle = handle
self._groupId = groupId
self._numValuesSeen = 0
+ self._nextSinceTimestamp = 0
+
+ '''
+ Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values
+ '''
def _ProcessValues(self, gpuId, values):
- """
- Helper function called by the callback of
- dcgm_agent.dcgmGetValuesSince to process individual field values
- """
self._numValuesSeen += len(values)
if gpuId not in self.values:
self.values[gpuId] = {}
for rawValue in values:
- # Convert to python-friendly value
+ #Convert to python-friendly value
value = DcgmFieldValue(rawValue)
if value.fieldId not in self.values[gpuId]:
@@ -186,185 +185,187 @@ def _ProcessValues(self, gpuId, values):
self.values[gpuId][value.fieldId].InsertValue(value)
- def GetLatestValues(self, fieldGroup):
- """
- Get the latest values for a fieldGroup and store them to the .values
- member variable
+ '''
+ Helper function called by the callback py_helper_dcgm_field_values_since_callback_v2 to process individual field values
+ '''
+
+ def _ProcessValuesV2(self, entityGroupId, entityId, values):
+ self._numValuesSeen += len(values)
+
+ if entityGroupId not in self.entityValues:
+ self.entityValues[entityGroupId] = {}
+
+ if entityId not in self.entityValues[entityGroupId]:
+ self.entityValues[entityGroupId][entityId] = {}
+
+ for rawValue in values:
+ #Convert to python-friendly value
+ value = DcgmFieldValue(rawValue)
- Note: This class does not automatically watch fieldGroup. You must do
- that ahead of time with dcgmGroup.samples.WatchFields()
- """
+ if value.fieldId not in self.entityValues[entityGroupId][entityId]:
+ self.entityValues[entityGroupId][entityId][
+ value.fieldId] = DcgmFieldValueTimeSeries()
+
+ self.entityValues[entityGroupId][entityId][
+ value.fieldId].InsertValue(value)
+
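+    # Illustrative sketch, not part of the DCGM sources (dcgm_fields is assumed to be
+    # imported, as in the demo at the bottom of this file): once GetLatestValues_v2()
+    # or GetAllSinceLastCall_v2() has run, samples are read back from the 3D dictionary
+    # by entity group, entity id and field id, e.g. for a GPU entity:
+    #   series = collection.entityValues[dcgm_fields.DCGM_FE_GPU][gpuId][fieldId]
+    #   latestTimestamp = series[-1].ts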
+ '''
+ Get the latest values for a fieldGroup and store them to the .values member variable
+
+ Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
+ '''
+
+ def GetLatestValues(self, fieldGroup):
ret = dcgm_agent.dcgmGetLatestValues(
- self._handle,
- self._groupId,
- fieldGroup.fieldGroupId,
- helper_dcgm_field_values_since_callback,
- self,
- )
- # Will throw exception on error
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ helper_dcgm_field_values_since_callback, self)
+ #Will throw exception on error
dcgm_structs._dcgmCheckReturn(ret)
+ '''
+ Method to cause more field values to be retrieved from DCGM. Returns the
+ number of field values that were retrieved.
+ '''
+
+ def GetAllSinceLastCall(self, fieldGroup):
+ beforeCount = self._numValuesSeen
+ self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince(
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ self._nextSinceTimestamp, helper_dcgm_field_values_since_callback,
+ self)
+ afterCount = self._numValuesSeen
+ return afterCount - beforeCount
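+
+    # Illustrative sketch: this assumes the field group is already watched (e.g. via
+    # dcgmGroup.samples.WatchFields(), as noted above). A monitor typically polls this
+    # method in a loop and then walks the per-GPU time series:
+    #   newCount = collection.GetAllSinceLastCall(fieldGroup)
+    #   series = collection.values[gpuId][fieldId]  # DcgmFieldValueTimeSeries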
+
def GetLatestValues_v2(self, fieldGroup):
ret = dcgm_agent.dcgmGetLatestValues_v2(
- self._handle,
- self._groupId,
- fieldGroup.fieldGroupId,
- helper_dcgm_field_values_since_callback_v2,
- self,
- )
- # Will throw exception on error
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ helper_dcgm_field_values_since_callback_v2, self)
+ #Will throw exception on error
dcgm_structs._dcgmCheckReturn(ret)
+ '''
+ Method to cause more field values to be retrieved from DCGM. Returns the number of field values that were retrieved
+ '''
+
+ def GetAllSinceLastCall_v2(self, fieldGroup):
+ beforeCount = self._numValuesSeen
+ self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ self._nextSinceTimestamp,
+            helper_dcgm_field_values_since_callback_v2, self)
+ afterCount = self._numValuesSeen
+ return afterCount - beforeCount
+
+ '''
+ Empty .values{} so that old data is no longer present in this structure.
+ This can be used to prevent .values from growing over time
+ '''
+
def EmptyValues(self):
- """
- Empty .values{} so that old data is no longer present in this
- structure. This can be used to prevent .values from growing over time
- """
self.values = {}
self._numValuesSeen = 0
+'''
+Helper class for watching a field group and storing fields values returned from it
+'''
+
+
class DcgmFieldGroupWatcher(DcgmFieldValueCollection):
- """
- Helper class for watching a field group and storing fields values returned
- from it
- """
-
- def __init__(
- self,
- handle,
- groupId,
- fieldGroup,
- operationMode,
- updateFreq,
- maxKeepAge,
- maxKeepSamples,
- startTimestamp,
- ):
- """
- handle :
- DCGM handle from dcgm_agent.dcgmInit()
- groupId :
- a DCGM group ID returned from dcgm_agent.dcgmGroupCreate
- fieldGroup :
- DcgmFieldGroup() instance to watch fields for
- operationMode :
- a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host
- engine is running in lock step or auto mode
- updateFreq :
- how often to update each field in usec
- maxKeepAge :
- how long DCGM should keep values for in seconds
- maxKeepSamples :
- is the maximum number of samples DCGM should ever cache for each
- field
- startTimestamp :
- a base timestamp we should start from when first reading
- values. This can be used to resume a previous instance of a
- DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start
- with all cached data
- """
+ '''
+ Constructor
+
+ handle is a DCGM handle from dcgm_agent.dcgmInit()
+ groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
+ fieldGroup is the DcgmFieldGroup() instance to watch fields for
+ operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
+ updateFreq is how often to update each field in usec
+ maxKeepAge is how long DCGM should keep values for in seconds
+ maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
+ startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
+ previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
+ 0=start with all cached data
+ '''
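+    # Illustrative example, mirroring the demo main() at the bottom of this file
+    # (the values shown are placeholders, not recommendations):
+    #   watcher = DcgmFieldGroupWatcher(handle, groupId, fieldGroup,
+    #                                   dcgm_structs.DCGM_OPERATION_MODE_AUTO,
+    #                                   updateFreq=1000000,   # 1 second, in usec
+    #                                   maxKeepAge=3600.0,    # keep 1 hour of samples
+    #                                   maxKeepSamples=0,     # no per-field sample cap
+    #                                   startTimestamp=0)     # start with all cached data
+    #   newValueCount = watcher.GetAllSinceLastCall()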
+
+ def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
+ maxKeepAge, maxKeepSamples, startTimestamp):
self._fieldGroup = fieldGroup
- self._oprationMode = operationMode
+ self._operationMode = operationMode
self._updateFreq = updateFreq
self._maxKeepAge = maxKeepAge
self._maxKeepSamples = maxKeepSamples
DcgmFieldValueCollection.__init__(self, handle, groupId)
- # Start from beginning of time
- self._nextSinceTimestamp = 0
+ self._nextSinceTimestamp = 0 #Start from beginning of time
if startTimestamp > 0:
self._nextSinceTimestamp = startTimestamp
- # Start watches
+ #Start watches
self._WatchFieldGroup()
+ '''
+ Initiate the host engine watch on the fields
+ '''
+
def _WatchFieldGroup(self):
- """
- Initiate the host engine watch on the fields
- """
- ret = dcgm_agent.dcgmWatchFields(
- self._handle,
- self._groupId,
- self._fieldGroup,
- self._updateFreq,
- self._maxKeepAge,
- self._maxKeepSamples,
- )
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
+ ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId,
+ self._fieldGroup.fieldGroupId,
+ self._updateFreq, self._maxKeepAge,
+ self._maxKeepSamples)
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
- # Force an update of the fields so that we can fetch initial values
+ # Force an update of the fields so that we can fetch initial values.
ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
- # initial update will fetch from startTimestamp
- self.GetMore()
+ # Initial update will fetch from startTimestamp.
+ self.GetAllSinceLastCall()
- def GetMore(self):
- """
- Method to cause more field values to be retrieved from DCGM.
+ '''
+ Method to cause more field values to be retrieved from DCGM. Returns the
+ number of field values that were retrieved
+ '''
- Returns
- -------
- int
- the number of field values that were retrieved
- """
- beforeCount = self._numValuesSeen
-
- # If we're in manual mode, force an update
- if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL:
+ def GetAllSinceLastCall(self):
+ #If we're in manual mode, force an update
+ if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL:
ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
+
+ return super().GetAllSinceLastCall(self._fieldGroup)
- self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince(
- self._handle,
- self._groupId,
- self._fieldGroup,
- self._nextSinceTimestamp,
- helper_dcgm_field_values_since_callback,
- self,
- )
- afterCount = self._numValuesSeen
- return afterCount - beforeCount
+def py_helper_dcgm_field_values_since_entity_callback(entityGroupId, entityId,
+ values, numValues,
+ userData):
-def py_helper_dcgm_field_values_since_entity_callback(
- entityGroupId, entityId, values, numValues, userData
-):
userData = ctypes.cast(userData, ctypes.py_object).value
userData._ProcessValues(entityGroupId, entityId, values[0:numValues])
return 0
-helper_dcgm_field_values_since_entity_callback = (
- dcgm_agent.dcgmFieldValueEntityEnumeration_f(
- py_helper_dcgm_field_values_since_entity_callback
- )
-)
+helper_dcgm_field_values_since_entity_callback = dcgm_agent.dcgmFieldValueEntityEnumeration_f(
+ py_helper_dcgm_field_values_since_entity_callback)
+'''
+Helper class for handling field value update callbacks and storing them in a .values member variable
+'''
class DcgmFieldValueEntityCollection:
- """
- Helper class for handling field value update callbacks and storing them
- in a .values member variable
- """
def __init__(self, handle, groupId):
- # 3D dictionary of
- # [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
- self.values = {}
+ self.values = {
+ } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
self._handle = handle
self._groupId = groupId
self._numValuesSeen = 0
+ self._nextSinceTimestamp = 0
+
+ '''
+ Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values
+ '''
def _ProcessValues(self, entityGroupId, entityId, values):
- """
- Helper function called by the callback of
- dcgm_agent.dcgmGetValuesSince to process individual field values
- """
self._numValuesSeen += len(values)
if entityGroupId not in self.values:
@@ -374,141 +375,172 @@ def _ProcessValues(self, entityGroupId, entityId, values):
self.values[entityGroupId][entityId] = {}
for rawValue in values:
- # Convert to python-friendly value
+ #Convert to python-friendly value
value = DcgmFieldValue(rawValue)
if value.fieldId not in self.values[entityGroupId][entityId]:
self.values[entityGroupId][entityId][
- value.fieldId
- ] = DcgmFieldValueTimeSeries()
+ value.fieldId] = DcgmFieldValueTimeSeries()
- self.values[entityGroupId][entityId][value.fieldId].InsertValue(value)
+ self.values[entityGroupId][entityId][value.fieldId].InsertValue(
+ value)
- def GetLatestValues(self, fieldGroup):
- """
- Get the latest values for a fieldGroup and store them to the
- .values member variable
+ '''
+ Get the latest values for a fieldGroup and store them to the .values member variable
+
+ Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
+ '''
- Note: This class does not automatically watch fieldGroup. You must do
- that ahead of time with dcgmGroup.samples.WatchFields()
- """
+ def GetLatestValues(self, fieldGroup):
ret = dcgm_agent.dcgmGetLatestValues_v2(
- self._handle,
- self._groupId,
- fieldGroup.fieldGroupId,
- helper_dcgm_field_values_since_entity_callback,
- self,
- )
- # Will throw exception on error
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ helper_dcgm_field_values_since_entity_callback, self)
+ #Will throw exception on error
dcgm_structs._dcgmCheckReturn(ret)
+ '''
+ Method to cause more field values to be retrieved from DCGM. Returns the
+ number of field values that were retrieved.
+ '''
+
+ def GetAllSinceLastCall(self, fieldGroup):
+ beforeCount = self._numValuesSeen
+ self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
+ self._handle, self._groupId, fieldGroup.fieldGroupId,
+ self._nextSinceTimestamp,
+ helper_dcgm_field_values_since_entity_callback, self)
+ afterCount = self._numValuesSeen
+ return afterCount - beforeCount
+
+ '''
+ Empty .values{} so that old data is no longer present in this structure.
+ This can be used to prevent .values from growing over time
+ '''
+
def EmptyValues(self):
- """
- Empty .values{} so that old data is no longer present in this
- structure. This can be used to prevent .values from growing over time
- """
self.values = {}
self._numValuesSeen = 0
+'''
+Helper class for watching a field group and storing fields values returned from it
+'''
+
+
class DcgmFieldGroupEntityWatcher(DcgmFieldValueEntityCollection):
- """
- Helper class for watching a field group and storing fields values
- returned from it
- """
-
- def __init__(
- self,
- handle,
- groupId,
- fieldGroup,
- operationMode,
- updateFreq,
- maxKeepAge,
- maxKeepSamples,
- startTimestamp,
- ):
- """
- Constructor
-
- handle :
- a DCGM handle from dcgm_agent.dcgmInit()
- groupId :
- a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
- fieldGroup :
- DcgmFieldGroup() instance to watch fields for
- operationMode :
- is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host
- engine is running in lock step or auto mode
- updateFreq :
- how often to update each field in usec
- maxKeepAge :
- how long DCGM should keep values for in seconds
- maxKeepSamples :
- the maximum number of samples DCGM should ever cache for each field
- startTimestamp :
- a base timestamp we should start from when first reading values.
- This can be used to resume a previous instance of a
- DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start
- with all cached data
- """
+ '''
+ Constructor
+
+ handle is a DCGM handle from dcgm_agent.dcgmInit()
+ groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
+ fieldGroup is the DcgmFieldGroup() instance to watch fields for
+ operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
+ updateFreq is how often to update each field in usec
+ maxKeepAge is how long DCGM should keep values for in seconds
+ maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
+ startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
+ previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
+ 0=start with all cached data
+ '''
+
+ def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
+ maxKeepAge, maxKeepSamples, startTimestamp):
self._fieldGroup = fieldGroup
- self._oprationMode = operationMode
+ self._operationMode = operationMode
self._updateFreq = updateFreq
self._maxKeepAge = maxKeepAge
self._maxKeepSamples = maxKeepSamples
DcgmFieldValueEntityCollection.__init__(self, handle, groupId)
- # Start from beginning of time
- self._nextSinceTimestamp = 0
+ self._nextSinceTimestamp = 0 #Start from beginning of time
if startTimestamp > 0:
self._nextSinceTimestamp = startTimestamp
- # Start watches
+ #Start watches
self._WatchFieldGroup()
+ '''
+ Initiate the host engine watch on the fields
+ '''
+
def _WatchFieldGroup(self):
- """
- Initiate the host engine watch on the fields
- """
- ret = dcgm_agent.dcgmWatchFields(
- self._handle,
- self._groupId,
- self._fieldGroup.fieldGroupId,
- self._updateFreq,
- self._maxKeepAge,
- self._maxKeepSamples,
- )
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
+ ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId,
+ self._fieldGroup.fieldGroupId,
+ self._updateFreq, self._maxKeepAge,
+ self._maxKeepSamples)
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
- # Force an update of the fields so that we can fetch initial values
+ # Force an update of the fields so that we can fetch initial values.
ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
- # initial update will fetch from startTimestamp
- self.GetMore()
-
- def GetMore(self):
- """
- Method to cause more field values to be retrieved from DCGM. Returns
- the number of field values that were retrieved
- """
- beforeCount = self._numValuesSeen
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
- # If we're in manual mode, force an update
- if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL:
- ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
- # Will throw exception on error
- dcgm_structs._dcgmCheckReturn(ret)
+ # Initial update will fetch from startTimestamp.
+ self.GetAllSinceLastCall()
- self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
- self._handle,
- self._groupId,
- self._fieldGroup.fieldGroupId,
- self._nextSinceTimestamp,
- helper_dcgm_field_values_since_entity_callback,
- self,
- )
- afterCount = self._numValuesSeen
- return afterCount - beforeCount
+ '''
+ Method to cause more field values to be retrieved from DCGM. Returns the
+ number of field values that were retrieved
+ '''
+
+ def GetAllSinceLastCall(self):
+ #If we're in manual mode, force an update
+ if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL:
+ ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
+ dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error
+
+ return super().GetAllSinceLastCall(self._fieldGroup)
+
+
+#Test program for demonstrating how this module works
+def main():
+ operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
+ timeStep = 1.0
+
+ dcgm_structs._dcgmInit()
+ dcgm_agent.dcgmInit() #Will throw an exception on error
+ handle = dcgm_agent.dcgmStartEmbedded(operationMode)
+ handleObj = pydcgm.DcgmHandle(handle=handle)
+ groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
+ fieldIds = [
+ dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK
+ ]
+
+ fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)
+
+ updateFreq = int(timeStep * 1000000.0)
+ maxKeepAge = 3600.0 #1 hour
+ maxKeepSamples = 0 #unlimited. maxKeepAge will enforce quota
+ startTimestamp = 0 #beginning of time
+
+ dfcw = DcgmFieldGroupWatcher(handle, groupId, fieldGroup, operationMode,
+ updateFreq, maxKeepAge, maxKeepSamples,
+ startTimestamp)
+ dfcw2 = DcgmFieldGroupEntityWatcher(handle, groupId, fieldGroup,
+ operationMode, updateFreq, maxKeepAge,
+ maxKeepSamples, startTimestamp)
+
+ while (True):
+ newUpdateCount = dfcw.GetAllSinceLastCall()
+ newUpdateCount2 = dfcw2.GetAllSinceLastCall()
+ print("Got %d and %d new field value updates" %
+ (newUpdateCount, newUpdateCount2))
+ for gpuId in list(dfcw.values.keys()):
+ print("gpuId %d" % gpuId)
+ for fieldId in list(dfcw.values[gpuId].keys()):
+ print(" fieldId %d: %d values. latest timestamp %d" % \
+ (fieldId, len(dfcw.values[gpuId][fieldId]), dfcw.values[gpuId][fieldId][-1].ts))
+
+ for entityGroupId in list(dfcw2.values.keys()):
+ print("entityGroupId %d" % entityGroupId)
+ for entityId in list(dfcw2.values[entityGroupId].keys()):
+ print(" entityId %d" % entityId)
+ for fieldId in list(
+ dfcw2.values[entityGroupId][entityId].keys()):
+ print(" fieldId %d: %d values. latest timestamp %d" % \
+ (fieldId, len(dfcw2.values[entityGroupId][entityId][fieldId]), dfcw2.values[entityGroupId][entityId][fieldId][-1].ts))
+
+ time.sleep(timeStep)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/model_analyzer/monitor/dcgm/dcgm_fields.py b/model_analyzer/monitor/dcgm/dcgm_fields.py
index 708008233..7c07111cd 100755
--- a/model_analyzer/monitor/dcgm/dcgm_fields.py
+++ b/model_analyzer/monitor/dcgm/dcgm_fields.py
@@ -1,6 +1,4 @@
-#!/usr/bin/env python3
-
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,38 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+##
+# Python bindings for the internal API of DCGM library (dcgm_fields.h)
+##
-from ctypes import (
- POINTER,
- Structure,
- addressof,
- c_char,
- c_char_p,
- c_int,
- c_short,
- c_ubyte,
- c_uint32,
- memmove,
- sizeof,
-)
-
+from ctypes import *
+from ctypes.util import find_library
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
# Provides access to functions
dcgmFP = dcgm_structs._dcgmGetFunctionPointer
# Field Types are a single byte. List these in ASCII order
-DCGM_FT_BINARY = "b" # Blob of binary data representing a structure
-DCGM_FT_DOUBLE = "d" # 8-byte double precision
-DCGM_FT_INT64 = "i" # 8-byte signed integer
-DCGM_FT_STRING = "s" # Null-terminated ASCII Character string
-DCGM_FT_TIMESTAMP = "t" # 8-byte signed integer usec since 1970
+DCGM_FT_BINARY = 'b' # Blob of binary data representing a structure
+DCGM_FT_DOUBLE = 'd' # 8-byte double precision
+DCGM_FT_INT64 = 'i' # 8-byte signed integer
+DCGM_FT_STRING = 's' # Null-terminated ASCII Character string
+DCGM_FT_TIMESTAMP = 't' # 8-byte signed integer usec since 1970
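+# Illustrative only, relying on the c_dcgmFieldValue value union from dcgm_structs.py:
+# consumers branch on the type character to pick the matching union member, e.g.
+#   if chr(rawValue.fieldType) == DCGM_FT_DOUBLE:
+#       reading = rawValue.value.dbl
+#   elif chr(rawValue.fieldType) == DCGM_FT_INT64:
+#       reading = rawValue.value.i64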
# Field scope. What are these fields associated with
DCGM_FS_GLOBAL = 0 # Field is global (ex: driver version)
DCGM_FS_ENTITY = 1 # Field is associated with an entity (GPU, VGPU, ..etc)
-# Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY
-DCGM_FS_DEVICE = DCGM_FS_ENTITY
+DCGM_FS_DEVICE = DCGM_FS_ENTITY # Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY
# DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled.
# These macros are masks for relevant throttling, and are a 1:1 map to the NVML
@@ -63,8 +51,7 @@
#
# This is an indicator of:
# - temperature being too high
-# - External Power Brake Assertion is triggered
-# (e.g. by the system power supply)
+# - External Power Brake Assertion is triggered (e.g. by the system power supply)
# - Power draw is too high and Fast Trigger protection is reducing the clocks
# - May be also reported during PState or clock change
# - This behavior may be removed in a later release.
@@ -87,635 +74,451 @@
# - Current memory temperature above the Memory Max Operating Temperature
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL = 0x0000000000000020
-# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is
-# engaged
+# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
#
# This is an indicator of:
# - temperature being too high
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL = 0x0000000000000040
-# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more)
-# is engaged
+# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
#
# This is an indicator of:
-# - External Power Brake Assertion being triggered (e.g. by the system power
-# supply)
+# - External Power Brake Assertion being triggered (e.g. by the system power supply)
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE = 0x0000000000000080
# GPU clocks are limited by current setting of Display clocks
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS = 0x0000000000000100
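+# Illustrative only: a DCGM_FI_DEV_CLOCK_THROTTLE_REASONS sample is an integer bitmask,
+# so individual reasons are tested with a bitwise AND, e.g.
+#   if reasons & DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL:
+#       print("clocks reduced by HW thermal slowdown")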
-# Field entity groups. Which type of entity is this field or field value
-# associated with
-
-# Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL
-DCGM_FE_NONE = 0
+#Field entity groups. Which type of entity is this field or field value associated with
+DCGM_FE_NONE = 0 # Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL
DCGM_FE_GPU = 1 # Field is associated with a GPU entity
DCGM_FE_VGPU = 2 # Field is associated with a VGPU entity
DCGM_FE_SWITCH = 3 # Field is associated with a Switch entity
DCGM_FE_GPU_I = 4 # Field is associated with a GPU Instance entity
DCGM_FE_GPU_CI = 5 # Field is associated with a GPU Compute Instance entity
+DCGM_FE_LINK = 6 # Field is associated with an NVLINK
-# Represents an identifier for an entity within a field entity. For instance,
-# this is the gpuId for DCGM_FE_GPU.
-c_dcgm_field_eid_t = c_uint32
+c_dcgm_field_eid_t = c_uint32 #Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU.
-#
-# System attributes
-#
+#System attributes
DCGM_FI_UNKNOWN = 0
-# Driver Version
-DCGM_FI_DRIVER_VERSION = 1
-# Underlying NVML version
-DCGM_FI_NVML_VERSION = 2
-# Process Name. Will be nv-hostengine or your process's name in embedded mode
-DCGM_FI_PROCESS_NAME = 3
-# Number of Devices on the node
-DCGM_FI_DEV_COUNT = 4
-
-#
-# Device attributes
-#
-# Name of the GPU device
-DCGM_FI_DEV_NAME = 50
-# Device Brand
-DCGM_FI_DEV_BRAND = 51
-# NVML index of this GPU
-DCGM_FI_DEV_NVML_INDEX = 52
-# Device Serial Number
-DCGM_FI_DEV_SERIAL = 53
-# UUID corresponding to the device
-DCGM_FI_DEV_UUID = 54
-# Device node minor number /dev/nvidia#
-DCGM_FI_DEV_MINOR_NUMBER = 55
-# OEM inforom version
-DCGM_FI_DEV_OEM_INFOROM_VER = 56
-# PCI attributes for the device
-DCGM_FI_DEV_PCI_BUSID = 57
-# The combined 16-bit device id and 16-bit vendor id
-DCGM_FI_DEV_PCI_COMBINED_ID = 58
-# The 32-bit Sub System Device ID
-DCGM_FI_DEV_PCI_SUBSYS_ID = 59
-# Topology of all GPUs on the system via PCI (static)
-DCGM_FI_GPU_TOPOLOGY_PCI = 60
-# Topology of all GPUs on the system via NVLINK (static)
-DCGM_FI_GPU_TOPOLOGY_NVLINK = 61
-# Affinity of all GPUs on the system (static)
-DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62
-# Compute mode for the device
-DCGM_FI_DEV_COMPUTE_MODE = 65
-# Persistence mode for the device
-DCGM_FI_DEV_PERSISTENCE_MODE = 66
-# MIG mode for the device
-DCGM_FI_DEV_MIG_MODE = 67
-# String value for CUDA_VISIBLE_DEVICES for the device
-DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68
-# Device CPU affinity. part 1/8 = cpus 0 - 63
-DCGM_FI_DEV_CPU_AFFINITY_0 = 70
-# Device CPU affinity. part 1/8 = cpus 64 - 127
-DCGM_FI_DEV_CPU_AFFINITY_1 = 71
-# Device CPU affinity. part 2/8 = cpus 128 - 191
-DCGM_FI_DEV_CPU_AFFINITY_2 = 72
-# Device CPU affinity. part 3/8 = cpus 192 - 255
-DCGM_FI_DEV_CPU_AFFINITY_3 = 73
-# ECC inforom version
-DCGM_FI_DEV_ECC_INFOROM_VER = 80
-# Power management object inforom version
-DCGM_FI_DEV_POWER_INFOROM_VER = 81
-# Inforom image version
-DCGM_FI_DEV_INFOROM_IMAGE_VER = 82
-# Inforom configuration checksum
-DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83
-# Reads the infoROM from the flash and verifies the checksums
-DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84
-# VBIOS version of the device
-DCGM_FI_DEV_VBIOS_VERSION = 85
-# Total BAR1 of the GPU
-DCGM_FI_DEV_BAR1_TOTAL = 90
-# Deprecated - Sync boost settings on the node
-DCGM_FI_SYNC_BOOST = 91
-# Used BAR1 of the GPU in MB
-DCGM_FI_DEV_BAR1_USED = 92
-# Free BAR1 of the GPU in MB
-DCGM_FI_DEV_BAR1_FREE = 93
-
-#
-# Clocks and power
-#
-# SM clock for the device
-DCGM_FI_DEV_SM_CLOCK = 100
-# Memory clock for the device
-DCGM_FI_DEV_MEM_CLOCK = 101
-# Video encoder/decoder clock for the device
-DCGM_FI_DEV_VIDEO_CLOCK = 102
-# SM Application clocks
-DCGM_FI_DEV_APP_SM_CLOCK = 110
-# Memory Application clocks
-DCGM_FI_DEV_APP_MEM_CLOCK = 111
-# Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*)
-DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112
-# Maximum supported SM clock for the device
-DCGM_FI_DEV_MAX_SM_CLOCK = 113
-# Maximum supported Memory clock for the device
-DCGM_FI_DEV_MAX_MEM_CLOCK = 114
-# Maximum supported Video encoder/decoder clock for the device
-DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115
-# Auto-boost for the device (1 = enabled. 0 = disabled)
-DCGM_FI_DEV_AUTOBOOST = 120
-# Supported clocks for the device
-DCGM_FI_DEV_SUPPORTED_CLOCKS = 130
-# Memory temperature for the device
-DCGM_FI_DEV_MEMORY_TEMP = 140
-# Current temperature readings for the device, in degrees C
-DCGM_FI_DEV_GPU_TEMP = 150
-# Power usage for the device in Watts
-DCGM_FI_DEV_POWER_USAGE = 155
-# Total energy consumption for the GPU in mJ since the driver was last reloaded
-DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156
-# Slowdown temperature for the device
-DCGM_FI_DEV_SLOWDOWN_TEMP = 158
-# Shutdown temperature for the device
-DCGM_FI_DEV_SHUTDOWN_TEMP = 159
-# Current Power limit for the device
-DCGM_FI_DEV_POWER_MGMT_LIMIT = 160
-# Minimum power management limit for the device
-DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161
-# Maximum power management limit for the device
-DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162
-# Default power management limit for the device
-DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163
-# Effective power limit that the driver enforces after taking into account all
-# limiters
-DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164
-# Performance state (P-State) 0-15. 0=highest
-DCGM_FI_DEV_PSTATE = 190
-# Fan speed for the device in percent 0-100
-DCGM_FI_DEV_FAN_SPEED = 191
-
-#
-# Device utilization and telemetry
-#
-# Deprecated - PCIe Tx utilization information
-DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200
-# Deprecated - PCIe Rx utilization information
-DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201
-# PCIe replay counter
-DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202
-# GPU Utilization
-DCGM_FI_DEV_GPU_UTIL = 203
-# Memory Utilization
-DCGM_FI_DEV_MEM_COPY_UTIL = 204
-# Process accounting stats
-DCGM_FI_DEV_ACCOUNTING_DATA = 205
-# Encoder utilization
-DCGM_FI_DEV_ENC_UTIL = 206
-# Decoder utilization
-DCGM_FI_DEV_DEC_UTIL = 207
-# Memory utilization samples
-DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210
-# SM utilization samples
-DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211
-# Graphics processes running on the GPU.
-DCGM_FI_DEV_GRAPHICS_PIDS = 220
-# Compute processes running on the GPU.
-DCGM_FI_DEV_COMPUTE_PIDS = 221
-# XID errors. The value is the specific XID error
-DCGM_FI_DEV_XID_ERRORS = 230
-# PCIe Max Link Generation
-DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235
-# PCIe Max Link Width
-DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236
-# PCIe Current Link Generation
-DCGM_FI_DEV_PCIE_LINK_GEN = 237
-# PCIe Current Link Width
-DCGM_FI_DEV_PCIE_LINK_WIDTH = 238
-
-#
-# Violation counters
-#
-# Power Violation time in usec
-DCGM_FI_DEV_POWER_VIOLATION = 240
-# Thermal Violation time in usec
-DCGM_FI_DEV_THERMAL_VIOLATION = 241
-# Sync Boost Violation time in usec
-DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242
-# Board Limit Violation time in usec.
-DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243
-# Low Utilization Violation time in usec.
-DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244
-# Reliability Violation time in usec.
-DCGM_FI_DEV_RELIABILITY_VIOLATION = 245
-# App Clocks Violation time in usec.
-DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246
-# Base Clocks Violation time in usec.
-DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247
-
-#
-# Framebuffer usage
-#
-# Total framebuffer memory in MB
-DCGM_FI_DEV_FB_TOTAL = 250
-# Total framebuffer used in MB
-DCGM_FI_DEV_FB_FREE = 251
-# Total framebuffer free in MB
-DCGM_FI_DEV_FB_USED = 252
-
-#
-# Device ECC Counters
-#
-# Current ECC mode for the device
-DCGM_FI_DEV_ECC_CURRENT = 300
-# Pending ECC mode for the device
-DCGM_FI_DEV_ECC_PENDING = 301
-# Total single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310
-# Total double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311
-# Total single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312
-# Total double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313
-# L1 cache single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314
-# L1 cache double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315
-# L2 cache single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316
-# L2 cache double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317
-# Device memory single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318
-# Device memory double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319
-# Register file single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_REG = 320
-# Register file double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_REG = 321
-# Texture memory single bit volatile ecc errors
-DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322
-# Texture memory double bit volatile ecc errors
-DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323
-# L1 cache single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324
-# L1 cache double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325
-# L2 cache single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326
-# L2 cache double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327
-# Device memory single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328
-# Device memory double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329
-# Register File single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_REG = 330
-# Register File double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_REG = 331
-# Texture memory single bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332
-# Texture memory double bit aggregate (persistent) ecc errors
-DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333
-# Number of retired pages because of single bit errors
-DCGM_FI_DEV_RETIRED_SBE = 390
-# Number of retired pages because of double bit errors
-DCGM_FI_DEV_RETIRED_DBE = 391
-# Number of pages pending retirement
-DCGM_FI_DEV_RETIRED_PENDING = 392
-
-#
-# Row remapper fields (Ampere and newer)
-#
-# Number of remapped rows for uncorrectable errors
-DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393
-# Number of remapped rows for correctable errors
-DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394
-# Whether remapping of rows has failed
-DCGM_FI_DEV_ROW_REMAP_FAILURE = 395
-
-#
-# Device NvLink Bandwidth and Error Counters
-#
-# NV Link flow control CRC Error Counter for Lane 0
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400
-# NV Link flow control CRC Error Counter for Lane 1
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401
-# NV Link flow control CRC Error Counter for Lane 2
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402
-# NV Link flow control CRC Error Counter for Lane 3
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403
-# NV Link flow control CRC Error Counter for Lane 4
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404
-# NV Link flow control CRC Error Counter for Lane 5
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405
-# NV Link flow control CRC Error Counter total for all Lanes
-DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409
-# NV Link data CRC Error Counter for Lane 0
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410
-# NV Link data CRC Error Counter for Lane 1
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411
-# NV Link data CRC Error Counter for Lane 2
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412
-# NV Link data CRC Error Counter for Lane 3
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413
-# NV Link data CRC Error Counter for Lane 4
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414
-# NV Link data CRC Error Counter for Lane 5
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415
-# NV Link data CRC Error Counter total for all Lanes
-DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419
-# NV Link Replay Error Counter for Lane 0
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420
-# NV Link Replay Error Counter for Lane 1
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421
-# NV Link Replay Error Counter for Lane 2
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422
-# NV Link Replay Error Counter for Lane 3
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423
-# NV Link Replay Error Counter for Lane 4
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424
-# NV Link Replay Error Counter for Lane 3
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425
-# NV Link Replay Error Counter total for all Lanes
-DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429
-# NV Link Recovery Error Counter for Lane 0
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430
-# NV Link Recovery Error Counter for Lane 1
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431
-# NV Link Recovery Error Counter for Lane 2
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432
-# NV Link Recovery Error Counter for Lane 3
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433
-# NV Link Recovery Error Counter for Lane 4
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434
-# NV Link Recovery Error Counter for Lane 5
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435
-# NV Link Recovery Error Counter total for all Lanes
-DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439
-# NV Link Bandwidth Counter for Lane 0
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440
-# NV Link Bandwidth Counter for Lane 1
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441
-# NV Link Bandwidth Counter for Lane 2
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442
-# NV Link Bandwidth Counter for Lane 3
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443
-# NV Link Bandwidth Counter for Lane 4
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444
-# NV Link Bandwidth Counter for Lane 5
-DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445
-# NV Link Bandwidth Counter total for all Lanes
-DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449
-# GPU NVLink error information
-DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450
-
-#
-# Device Attributes associated with virtualization
-#
-# Operating mode of the GPU
-DCGM_FI_DEV_VIRTUAL_MODE = 500
-# Includes Count and Supported vGPU type information
-DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501
-# Includes Count and List of Creatable vGPU type IDs
-DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502
-# Includes Count and List of vGPU instance IDs
-DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503
-# Utilization values for vGPUs running on the device
-DCGM_FI_DEV_VGPU_UTILIZATIONS = 504
-# Utilization values for processes running within vGPU VMs using the device
-DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505
-# Current encoder statistics for a given device
-DCGM_FI_DEV_ENC_STATS = 506
-# Statistics of current active frame buffer capture sessions on a given device
-DCGM_FI_DEV_FBC_STATS = 507
-# Information about active frame buffer capture sessions on a target device
-DCGM_FI_DEV_FBC_SESSIONS_INFO = 508
-
-#
-# Related to vGPU Instance IDs
-#
-# vGPU VM ID
-DCGM_FI_DEV_VGPU_VM_ID = 520
-# vGPU VM name
-DCGM_FI_DEV_VGPU_VM_NAME = 521
-# vGPU type of the vGPU instance
-DCGM_FI_DEV_VGPU_TYPE = 522
-# UUID of the vGPU instance
-DCGM_FI_DEV_VGPU_UUID = 523
-# Driver version of the vGPU instance
-DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524
-# Memory usage of the vGPU instance
-DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525
-# License status of the vGPU instance
-DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526
-# Frame rate limit of the vGPU instance
-DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527
-# Current encoder statistics of the vGPU instance
-DCGM_FI_DEV_VGPU_ENC_STATS = 528
-# Information about all active encoder sessions on the vGPU instance
-DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529
-# Statistics of current active frame buffer capture sessions on the vGPU
-# instance
-DCGM_FI_DEV_VGPU_FBC_STATS = 530
-# Information about active frame buffer capture sessions on the vGPU instance
-DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531
-
-# Internal fields reserve the range 600..699
-# below fields related to NVSwitch
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767
-DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770
-DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855
+DCGM_FI_DRIVER_VERSION = 1 #Driver Version
+DCGM_FI_NVML_VERSION = 2 #Underlying NVML version
+DCGM_FI_PROCESS_NAME = 3 #Process Name. Will be nv-hostengine or your process's name in embedded mode
+DCGM_FI_DEV_COUNT = 4 #Number of Devices on the node
+DCGM_FI_CUDA_DRIVER_VERSION = 5 #Cuda Driver Version as an integer. CUDA 11.1 = 11100
+#Device attributes
+DCGM_FI_DEV_NAME = 50 #Name of the GPU device
+DCGM_FI_DEV_BRAND = 51 #Device Brand
+DCGM_FI_DEV_NVML_INDEX = 52 #NVML index of this GPU
+DCGM_FI_DEV_SERIAL = 53 #Device Serial Number
+DCGM_FI_DEV_UUID = 54 #UUID corresponding to the device
+DCGM_FI_DEV_MINOR_NUMBER = 55 #Device node minor number /dev/nvidia#
+DCGM_FI_DEV_OEM_INFOROM_VER = 56 #OEM inforom version
+DCGM_FI_DEV_PCI_BUSID = 57 #PCI attributes for the device
+DCGM_FI_DEV_PCI_COMBINED_ID = 58 #The combined 16-bit device id and 16-bit vendor id
+DCGM_FI_DEV_PCI_SUBSYS_ID = 59 #The 32-bit Sub System Device ID
+DCGM_FI_GPU_TOPOLOGY_PCI = 60 #Topology of all GPUs on the system via PCI (static)
+DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 #Topology of all GPUs on the system via NVLINK (static)
+DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 #Affinity of all GPUs on the system (static)
+DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 #Cuda compute capability for the device
+DCGM_FI_DEV_COMPUTE_MODE = 65 #Compute mode for the device
+DCGM_FI_DEV_PERSISTENCE_MODE = 66 #Persistence mode for the device
+DCGM_FI_DEV_MIG_MODE = 67 #MIG mode for the device
+DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 #String value for CUDA_VISIBLE_DEVICES for the device
+DCGM_FI_DEV_MIG_MAX_SLICES = 69 #The maximum number of slices this GPU supports
+DCGM_FI_DEV_CPU_AFFINITY_0 = 70 #Device CPU affinity. part 1/8 = cpus 0 - 63
+DCGM_FI_DEV_CPU_AFFINITY_1 = 71 #Device CPU affinity. part 2/8 = cpus 64 - 127
+DCGM_FI_DEV_CPU_AFFINITY_2 = 72 #Device CPU affinity. part 3/8 = cpus 128 - 191
+DCGM_FI_DEV_CPU_AFFINITY_3 = 73 #Device CPU affinity. part 4/8 = cpus 192 - 255
+DCGM_FI_DEV_CC_MODE = 74 #Device CC/APM mode
+DCGM_FI_DEV_MIG_ATTRIBUTES = 75 #MIG device attributes
+DCGM_FI_DEV_MIG_GI_INFO = 76 #GPU instance profile information
+DCGM_FI_DEV_MIG_CI_INFO = 77 #Compute instance profile information
+DCGM_FI_DEV_ECC_INFOROM_VER = 80 #ECC inforom version
+DCGM_FI_DEV_POWER_INFOROM_VER = 81 #Power management object inforom version
+DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 #Inforom image version
+DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 #Inforom configuration checksum
+DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 #Reads the infoROM from the flash and verifies the checksums
+DCGM_FI_DEV_VBIOS_VERSION = 85 #VBIOS version of the device
+DCGM_FI_DEV_BAR1_TOTAL = 90 #Total BAR1 of the GPU
+DCGM_FI_SYNC_BOOST = 91 #Deprecated - Sync boost settings on the node
+DCGM_FI_DEV_BAR1_USED = 92 #Used BAR1 of the GPU in MB
+DCGM_FI_DEV_BAR1_FREE = 93 #Free BAR1 of the GPU in MB
+#Clocks and power
+DCGM_FI_DEV_SM_CLOCK = 100 #SM clock for the device
+DCGM_FI_DEV_MEM_CLOCK = 101 #Memory clock for the device
+DCGM_FI_DEV_VIDEO_CLOCK = 102 #Video encoder/decoder clock for the device
+DCGM_FI_DEV_APP_SM_CLOCK = 110 #SM Application clocks
+DCGM_FI_DEV_APP_MEM_CLOCK = 111 #Memory Application clocks
+DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 #Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*)
+DCGM_FI_DEV_MAX_SM_CLOCK = 113 #Maximum supported SM clock for the device
+DCGM_FI_DEV_MAX_MEM_CLOCK = 114 #Maximum supported Memory clock for the device
+DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 #Maximum supported Video encoder/decoder clock for the device
+DCGM_FI_DEV_AUTOBOOST = 120 #Auto-boost for the device (1 = enabled. 0 = disabled)
+DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 #Supported clocks for the device
+DCGM_FI_DEV_MEMORY_TEMP = 140 #Memory temperature for the device
+DCGM_FI_DEV_GPU_TEMP = 150 #Current temperature readings for the device, in degrees C
+DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 #Maximum operating temperature for the memory of this GPU
+DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 #Maximum operating temperature for this GPU
+DCGM_FI_DEV_POWER_USAGE = 155 #Power usage for the device in Watts
+DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 #Total energy consumption for the GPU in mJ since the driver was last reloaded
+DCGM_FI_DEV_SLOWDOWN_TEMP = 158 #Slowdown temperature for the device
+DCGM_FI_DEV_SHUTDOWN_TEMP = 159 #Shutdown temperature for the device
+DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 #Current Power limit for the device
+DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 #Minimum power management limit for the device
+DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 #Maximum power management limit for the device
+DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 #Default power management limit for the device
+DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 #Effective power limit that the driver enforces after taking into account all limiters
+DCGM_FI_DEV_PSTATE = 190 #Performance state (P-State) 0-15. 0=highest
+DCGM_FI_DEV_FAN_SPEED = 191 #Fan speed for the device in percent 0-100
+#Device utilization and telemetry
+DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 #Deprecated - PCIe Tx utilization information
+DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 #Deprecated - PCIe Rx utilization information
+DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 #PCIe replay counter
+DCGM_FI_DEV_GPU_UTIL = 203 #GPU Utilization
+DCGM_FI_DEV_MEM_COPY_UTIL = 204 #Memory Utilization
+DCGM_FI_DEV_ACCOUNTING_DATA = 205 #Process accounting stats
+DCGM_FI_DEV_ENC_UTIL = 206 #Encoder utilization
+DCGM_FI_DEV_DEC_UTIL = 207 #Decoder utilization
+# Fields 210, 211, 220, and 221 are internal-only. see dcgm_fields_internal.py
+DCGM_FI_DEV_XID_ERRORS = 230 #XID errors. The value is the specific XID error
+DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 #PCIe Max Link Generation
+DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 #PCIe Max Link Width
+DCGM_FI_DEV_PCIE_LINK_GEN = 237 #PCIe Current Link Generation
+DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 #PCIe Current Link Width
+#Violation counters
+DCGM_FI_DEV_POWER_VIOLATION = 240 #Power Violation time in usec
+DCGM_FI_DEV_THERMAL_VIOLATION = 241 #Thermal Violation time in usec
+DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 #Sync Boost Violation time in usec
+DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 #Board Limit Violation time in usec.
+DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 #Low Utilization Violation time in usec.
+DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 #Reliability Violation time in usec.
+DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 #App Clocks Violation time in usec.
+DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 #Base Clocks Violation time in usec.
+#Framebuffer usage
+DCGM_FI_DEV_FB_TOTAL = 250 #Total framebuffer memory in MB
+DCGM_FI_DEV_FB_FREE = 251 #Total framebuffer free in MB
+DCGM_FI_DEV_FB_USED = 252 #Total framebuffer used in MB
+DCGM_FI_DEV_FB_RESERVED = 253 #Total framebuffer reserved in MB
+#Device ECC Counters
+DCGM_FI_DEV_ECC_CURRENT = 300 #Current ECC mode for the device
+DCGM_FI_DEV_ECC_PENDING = 301 #Pending ECC mode for the device
+DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 #Total single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 #Total double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 #Total single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 #Total double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 #L1 cache single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 #L1 cache double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 #L2 cache single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 #L2 cache double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 #Device memory single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 #Device memory double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 #Register file single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 #Register file double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 #Texture memory single bit volatile ecc errors
+DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 #Texture memory double bit volatile ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 #L1 cache single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 #L1 cache double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 #L2 cache single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 #L2 cache double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 #Device memory single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 #Device memory double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 #Register File single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 #Register File double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 #Texture memory single bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 #Texture memory double bit aggregate (persistent) ecc errors
+DCGM_FI_DEV_RETIRED_SBE = 390 #Number of retired pages because of single bit errors
+DCGM_FI_DEV_RETIRED_DBE = 391 #Number of retired pages because of double bit errors
+DCGM_FI_DEV_RETIRED_PENDING = 392 #Number of pages pending retirement
+#Row remapper fields (Ampere and newer)
+DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 #Number of remapped rows for uncorrectable errors
+DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 #Number of remapped rows for correctable errors
+DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 #Whether remapping of rows has failed
+DCGM_FI_DEV_ROW_REMAP_PENDING = 396 #Whether remapping of rows is pending
+
+#Device NvLink Bandwidth and Error Counters
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 #NV Link flow control CRC Error Counter for Lane 0
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 #NV Link flow control CRC Error Counter for Lane 1
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 #NV Link flow control CRC Error Counter for Lane 2
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 #NV Link flow control CRC Error Counter for Lane 3
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 #NV Link flow control CRC Error Counter for Lane 4
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 #NV Link flow control CRC Error Counter for Lane 5
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 #NV Link flow control CRC Error Counter total for all Lanes
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 #NV Link data CRC Error Counter for Lane 0
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 #NV Link data CRC Error Counter for Lane 1
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 #NV Link data CRC Error Counter for Lane 2
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 #NV Link data CRC Error Counter for Lane 3
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 #NV Link data CRC Error Counter for Lane 4
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 #NV Link data CRC Error Counter for Lane 5
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 #NV Link data CRC Error Counter total for all Lanes
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 #NV Link Replay Error Counter for Lane 0
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 #NV Link Replay Error Counter for Lane 1
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 #NV Link Replay Error Counter for Lane 2
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 #NV Link Replay Error Counter for Lane 3
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 #NV Link Replay Error Counter for Lane 4
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 #NV Link Replay Error Counter for Lane 5
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 #NV Link Replay Error Counter total for all Lanes
+
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 #NV Link Recovery Error Counter for Lane 0
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 #NV Link Recovery Error Counter for Lane 1
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 #NV Link Recovery Error Counter for Lane 2
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 #NV Link Recovery Error Counter for Lane 3
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 #NV Link Recovery Error Counter for Lane 4
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 #NV Link Recovery Error Counter for Lane 5
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 #NV Link Recovery Error Counter total for all Lanes
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 #NV Link Bandwidth Counter for Lane 0
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 #NV Link Bandwidth Counter for Lane 1
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 #NV Link Bandwidth Counter for Lane 2
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 #NV Link Bandwidth Counter for Lane 3
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 #NV Link Bandwidth Counter for Lane 4
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 #NV Link Bandwidth Counter for Lane 5
+DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 #NV Link Bandwidth Counter total for all Lanes
+DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 #GPU NVLink error information
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488
+DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492
+DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496
+
+#Device Attributes associated with virtualization
+DCGM_FI_DEV_VIRTUAL_MODE = 500 #Operating mode of the GPU
+DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 #Includes Count and Supported vGPU type information
+DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 #Includes Count and List of Creatable vGPU type IDs
+DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 #Includes Count and List of vGPU instance IDs
+DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 #Utilization values for vGPUs running on the device
+DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 #Utilization values for processes running within vGPU VMs using the device
+DCGM_FI_DEV_ENC_STATS = 506 #Current encoder statistics for a given device
+DCGM_FI_DEV_FBC_STATS = 507 #Statistics of current active frame buffer capture sessions on a given device
+DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 #Information about active frame buffer capture sessions on a target device
+DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 #Includes Count and currently Supported vGPU types on a device
+DCGM_FI_DEV_VGPU_TYPE_INFO = 510 #Includes Static info of vGPU types supported on a device
+DCGM_FI_DEV_VGPU_TYPE_NAME = 511 #Includes the name of a vGPU type supported on a device
+DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 #Includes the class of a vGPU type supported on a device
+DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 #Includes the license info for a vGPU type supported on a device
+#Related to vGPU Instance IDs
+DCGM_FI_DEV_VGPU_VM_ID = 520 #vGPU VM ID
+DCGM_FI_DEV_VGPU_VM_NAME = 521 #vGPU VM name
+DCGM_FI_DEV_VGPU_TYPE = 522 #vGPU type of the vGPU instance
+DCGM_FI_DEV_VGPU_UUID = 523 #UUID of the vGPU instance
+DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 #Driver version of the vGPU instance
+DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 #Memory usage of the vGPU instance
+DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 #License status of the vGPU
+DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 #Frame rate limit of the vGPU instance
+DCGM_FI_DEV_VGPU_ENC_STATS = 528 #Current encoder statistics of the vGPU instance
+DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 #Information about all active encoder sessions on the vGPU instance
+DCGM_FI_DEV_VGPU_FBC_STATS = 530 #Statistics of current active frame buffer capture sessions on the vGPU instance
+DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 #Information about active frame buffer capture sessions on the vGPU instance
+DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 #License state information of the vGPU instance
+DCGM_FI_DEV_VGPU_PCI_ID = 533 #PCI Id of the vGPU instance
+DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 #GPU Instance Id of the vGPU instance
+#Internal fields reserve the range 600..699
+#The fields below relate to NVSwitch
+DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 #Starting field ID of the NVSwitch instance
+DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780
+DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781
+DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782
+DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783
+DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784
+DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785
+DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786
+DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787
+DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808
+DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809
+DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810
+DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811
+DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812
+DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813
+DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814
+DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815
+DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816
DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856
DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857
-
-#
-# Profiling Fields
-#
-# Ratio of time the graphics engine is active. The graphics engine is active if
-# a graphics/compute context is bound and the graphics pipe or compute pipe is
-# busy.
-DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001
-
-# The ratio of cycles an SM has at least 1 warp assigned
-DCGM_FI_PROF_SM_ACTIVE = 1002
-# (computed from the number of cycles and elapsed cycles)
-
-# The ratio of number of warps resident on an SM.
-DCGM_FI_PROF_SM_OCCUPANCY = 1003
-# (number of resident as a ratio of the theoretical
-# maximum number of warps per elapsed cycle)
-
-# The ratio of cycles the tensor (HMMA) pipe is active
-DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004
-# (off the peak sustained elapsed cycles)
-
-# The ratio of cycles the device memory interface is active sending or
-# receiving data.
-DCGM_FI_PROF_DRAM_ACTIVE = 1005
-# Ratio of cycles the fp64 pipe is active.
-DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006
-# Ratio of cycles the fp32 pipe is active.
-DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007
-# Ratio of cycles the fp16 pipe is active. This does not include HMMA.
-DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008
-# The number of bytes of active PCIe tx (transmit) data including both header
-# and payload.
-DCGM_FI_PROF_PCIE_TX_BYTES = 1009
-# The number of bytes of active PCIe rx (read) data including both header and
-# payload.
-DCGM_FI_PROF_PCIE_RX_BYTES = 1010
-# The number of bytes of active NvLink tx (transmit) data including both header
-# and payload.
-DCGM_FI_PROF_NVLINK_TX_BYTES = 1011
-# The number of bytes of active NvLink rx (receive) data including both header
-# and payload.
-DCGM_FI_PROF_NVLINK_RX_BYTES = 1012
-
-# greater than maximum fields above. This value can increase in the future
-DCGM_FI_MAX_FIELDS = 1013
-
-
-class struct_c_dcgm_field_meta_t(Structure):
+DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858
+DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859
+DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860
+DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861
+DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862
+
+DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899 #Last field ID of the NVSwitch instance
+'''
+Profiling Fields
+'''
+DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 #Ratio of time the graphics engine is active. The graphics engine is
+#active if a graphics/compute context is bound and the graphics pipe or
+#compute pipe is busy.
+
+DCGM_FI_PROF_SM_ACTIVE = 1002 #The ratio of cycles an SM has at least 1 warp assigned
+#(computed from the number of cycles and elapsed cycles)
+
+DCGM_FI_PROF_SM_OCCUPANCY = 1003 #The ratio of number of warps resident on an SM.
+#(number of resident warps as a ratio of the theoretical
+#maximum number of warps per elapsed cycle)
+
+DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 #The ratio of cycles any tensor pipe is active
+#(off the peak sustained elapsed cycles)
+
+DCGM_FI_PROF_DRAM_ACTIVE = 1005 #The ratio of cycles the device memory interface is active sending or receiving data.
+DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 #Ratio of cycles the fp64 pipe is active.
+DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 #Ratio of cycles the fp32 pipe is active.
+DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 #Ratio of cycles the fp16 pipe is active. This does not include HMMA.
+DCGM_FI_PROF_PCIE_TX_BYTES = 1009 #The number of bytes of active PCIe tx (transmit) data including both header and payload.
+DCGM_FI_PROF_PCIE_RX_BYTES = 1010 #The number of bytes of active PCIe rx (read) data including both header and payload.
+DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 #The number of bytes of active NvLink tx (transmit) data including both header and payload.
+DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 #The number of bytes of active NvLink rx (receive) data including both header and payload.
+DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 #The ratio of cycles the IMMA tensor pipe is active (off the peak sustained elapsed cycles)
+DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 #The ratio of cycles the HMMA tensor pipe is active (off the peak sustained elapsed cycles)
+DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 #The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles)
+DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 #Ratio of cycles the integer pipe is active.
+
+#Ratio of cycles each of the NVDEC engines are active.
+DCGM_FI_PROF_NVDEC0_ACTIVE = 1017
+DCGM_FI_PROF_NVDEC1_ACTIVE = 1018
+DCGM_FI_PROF_NVDEC2_ACTIVE = 1019
+DCGM_FI_PROF_NVDEC3_ACTIVE = 1020
+DCGM_FI_PROF_NVDEC4_ACTIVE = 1021
+DCGM_FI_PROF_NVDEC5_ACTIVE = 1022
+DCGM_FI_PROF_NVDEC6_ACTIVE = 1023
+DCGM_FI_PROF_NVDEC7_ACTIVE = 1024
+
+#Ratio of cycles each of the NVJPG engines are active.
+DCGM_FI_PROF_NVJPG0_ACTIVE = 1025
+DCGM_FI_PROF_NVJPG1_ACTIVE = 1026
+DCGM_FI_PROF_NVJPG2_ACTIVE = 1027
+DCGM_FI_PROF_NVJPG3_ACTIVE = 1028
+DCGM_FI_PROF_NVJPG4_ACTIVE = 1029
+DCGM_FI_PROF_NVJPG5_ACTIVE = 1030
+DCGM_FI_PROF_NVJPG6_ACTIVE = 1031
+DCGM_FI_PROF_NVJPG7_ACTIVE = 1032
+
+#Ratio of cycles each of the NVOFA engines are active.
+DCGM_FI_PROF_NVOFA0_ACTIVE = 1033
+'''
+The per-link number of bytes of active NvLink TX (transmit) or RX (receive) data including both header and payload.
+For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX
+To get the bandwidth for a link, add the RX and TX values together, e.g.
+total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES
+'''
+DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040
+DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041
+DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042
+DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043
+DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044
+DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045
+DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046
+DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047
+DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048
+DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049
+DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050
+DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051
+DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052
+DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053
+DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054
+DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055
+DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056
+DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057
+DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058
+DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059
+DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060
+DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061
+DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062
+DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063
+DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064
+DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065
+DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066
+DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067
+DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068
+DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069
+DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070
+DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071
+DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072
+DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073
+DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074
+DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075
+
+DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST = DCGM_FI_PROF_NVLINK_L0_TX_BYTES
+DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST = DCGM_FI_PROF_NVLINK_L17_RX_BYTES
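+# Illustrative note (not part of the bindings): the constants above are field
+# IDs, not byte counts. To get total traffic for one link, watch both fields
+# and add the returned samples, e.g. the value read for
+# DCGM_FI_PROF_NVLINK_L0_TX_BYTES plus the value read for
+# DCGM_FI_PROF_NVLINK_L0_RX_BYTES gives the L0 total.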
+
+#greater than maximum fields above. This value can increase in the future
+DCGM_FI_MAX_FIELDS = 1076
+
+
+class struct_c_dcgm_field_meta_t(dcgm_structs._DcgmStructure):
# struct_c_dcgm_field_meta_t structure
pass # opaque handle
@@ -723,7 +526,7 @@ class struct_c_dcgm_field_meta_t(Structure):
dcgm_field_meta_t = POINTER(struct_c_dcgm_field_meta_t)
-class _PrintableStructure(Structure):
+class _PrintableStructure(dcgm_structs._DcgmStructure):
"""
Abstract class that produces nicer __str__ output than ctypes.Structure.
e.g. instead of:
@@ -736,13 +539,11 @@ class _PrintableStructure(Structure):
e.g. class that has _field_ 'hex_value', c_uint could be formatted with
_fmt_ = {"hex_value" : "%08X"}
to produce nicer output.
- Default formatting string for all fields can be set with key ""
- like:
+    Default formatting string for all fields can be set with key "" like:
_fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz.
If not set it's assumed to be just "%s"
- Exact format of returned str from this class is subject to change in the
- future.
+ Exact format of returned str from this class is subject to change in the future.
"""
_fmt_ = {}
@@ -770,11 +571,8 @@ def __str__(self):
# Structure to hold formatting information for values
class c_dcgm_field_output_format_t(_PrintableStructure):
- _fields_ = [
- ("shortName", c_char * SHORTNAME_LENGTH),
- ("unit", c_char * UNIT_LENGTH),
- ("width", c_short),
- ]
+ _fields_ = [('shortName', c_char * SHORTNAME_LENGTH),
+ ('unit', c_char * UNIT_LENGTH), ('width', c_short)]
TAG_LENGTH = 48
@@ -793,24 +591,14 @@ class c_dcgm_field_meta_t(_PrintableStructure):
]
-# Class for maintaining properties for each sampling type like Power,
-# Utilization and Clock.
+# Class for maintaining properties for each sampling type like Power, Utilization and Clock.
class pySamplingProperties:
- """
- The instance of this class is used to hold information related to each
- sampling event type.
- """
+ '''
+ The instance of this class is used to hold information related to each sampling event type.
+ '''
- def __init__(
- self,
- name,
- sampling_type,
- sample_val_type,
- timeIntervalIdle,
- timeIntervalBoost,
- min_value,
- max_value,
- ):
+ def __init__(self, name, sampling_type, sample_val_type, timeIntervalIdle,
+ timeIntervalBoost, min_value, max_value):
self.name = name
self.sampling_type = sampling_type
self.timeIntervalIdle = timeIntervalIdle
@@ -827,19 +615,12 @@ def DcgmFieldsInit():
def DcgmFieldGetById(fieldId):
- """
+ '''
Get metadata for a field, given its fieldId
- Parameters
- ----------
- fieldId :
- Field ID to get metadata for.
-
- Returns
- -------
- c_dcgm_field_meta_t or None
- Returns c_dcgm_field_meta_t on success or None on error.
- """
+ :param fieldId: Field ID to get metadata for
+ :return: c_dcgm_field_meta_t struct on success. None on error.
+ '''
DcgmFieldsInit()
fn = dcgmFP("DcgmFieldGetById")
@@ -854,25 +635,18 @@ def DcgmFieldGetById(fieldId):
def DcgmFieldGetByTag(tag):
- """
+ '''
Get metadata for a field, given its string tag
- Parameters
- ---------
- tag :
- Field tag to get metadata for. Example 'brand'.
-
- Returns
- -------
- c_dcgm_field_meta_t or None
- Returns c_dcgm_field_meta_t on success or None on error.
- """
+ :param tag: Field tag to get metadata for. Example 'brand'
+ :return: c_dcgm_field_meta_t struct on success. None on error.
+ '''
DcgmFieldsInit()
c_dcgm_field_meta_t()
fn = dcgmFP("DcgmFieldGetByTag")
fn.restype = POINTER(c_dcgm_field_meta_t)
- c_field_meta_ptr = fn(c_char_p(tag))
+ c_field_meta_ptr = fn(c_char_p(tag.encode('utf-8')))
if not c_field_meta_ptr:
return None
diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py
new file mode 100644
index 000000000..7a29edc9e
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py
@@ -0,0 +1,671 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from model_analyzer.monitor.dcgm.dcgm_fields import *
+from model_analyzer.monitor.dcgm.dcgm_fields_internal import *
+import sys
+
+
+class CollectdMetadata:
+ '''
+ Constructor
+ @params:
+    name: string identifying the dcgm field (the field_name as opposed to
+          the field_id).
+ kind: collectd type string.
+ used: a bool indicating whether or not the field is to be defined in
+ a collectd types.db file when GenerateCollectdTypesDB() is called
+ (generally if this file is run as a python3 mainline). We enumerate
+ all the dcgm fields, but only generate types.db records for those
+ supported at the current time. Others may or may not have correct
+          collectd type definitions (generally one might be a gauge where it
+ is more correctly a counter). The idea is that an intrepid user may
+ enable generation of additional dcgm fields that they wish to collect
+ but are not officially supported yet.
+ '''
+
+ def __init__(self, name, kind, used=False):
+ self.name = name
+ self.kind = kind
+ self.used = used
+
+
+# collectd metadata definition table.
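+# Each entry maps a DCGM field ID either to None (not exported) or to a
+# CollectdMetadata describing the collectd metric. For example (illustrative,
+# mirroring the entries below), an entry such as
+#   CollectdMetadata("gpu_temp", "value:GAUGE:U:U", True)
+# names the metric "gpu_temp", declares it an unbounded gauge, and marks it for
+# inclusion in the generated types.db.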
+
+CollectdMetadataDict = {
+ DCGM_FI_DRIVER_VERSION:
+ None,
+ DCGM_FI_NVML_VERSION:
+ None,
+ DCGM_FI_PROCESS_NAME:
+ None,
+ DCGM_FI_CUDA_DRIVER_VERSION:
+ CollectdMetadata("cuda_driver_version", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_COUNT:
+ CollectdMetadata("device_count", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NAME:
+ None,
+ DCGM_FI_DEV_BRAND:
+ None,
+ DCGM_FI_DEV_NVML_INDEX:
+ CollectdMetadata("nvml_index", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_SERIAL:
+ None,
+ DCGM_FI_DEV_CPU_AFFINITY_0:
+ CollectdMetadata("cpu_affinity_0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CPU_AFFINITY_1:
+ CollectdMetadata("cpu_affinity_1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CPU_AFFINITY_2:
+ CollectdMetadata("cpu_affinity_2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CPU_AFFINITY_3:
+ CollectdMetadata("cpu_affinity_3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_UUID:
+ None,
+ DCGM_FI_DEV_MINOR_NUMBER:
+ CollectdMetadata("minor_number", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_OEM_INFOROM_VER:
+ None,
+ DCGM_FI_DEV_ECC_INFOROM_VER:
+ None,
+ DCGM_FI_DEV_POWER_INFOROM_VER:
+ None,
+ DCGM_FI_DEV_INFOROM_IMAGE_VER:
+ None,
+ DCGM_FI_DEV_INFOROM_CONFIG_CHECK:
+ CollectdMetadata("inforom_config_checksum", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCI_BUSID:
+ None,
+ DCGM_FI_DEV_PCI_COMBINED_ID:
+ CollectdMetadata("pci_combined_id", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCI_SUBSYS_ID:
+ CollectdMetadata("pci_subsys_id", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCIE_TX_THROUGHPUT:
+ CollectdMetadata("pcie_tx_throughput", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_PCIE_RX_THROUGHPUT:
+ CollectdMetadata("pcie_rx_throughput", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_PCIE_REPLAY_COUNTER:
+ CollectdMetadata("pcie_replay_counter", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_SM_CLOCK:
+ CollectdMetadata("sm_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_MEM_CLOCK:
+ CollectdMetadata("memory_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_VIDEO_CLOCK:
+ CollectdMetadata("video_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_APP_SM_CLOCK:
+ CollectdMetadata("sm_app_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_APP_MEM_CLOCK:
+ CollectdMetadata("mem_app_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_CLOCK_THROTTLE_REASONS:
+ CollectdMetadata("current_clock_throttle_reasons", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_MAX_SM_CLOCK:
+ CollectdMetadata("sm_max_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_MAX_MEM_CLOCK:
+ CollectdMetadata("memory_max_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_MAX_VIDEO_CLOCK:
+ CollectdMetadata("video_max_clock", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_AUTOBOOST:
+ CollectdMetadata("autoboost", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_GPU_TEMP:
+ CollectdMetadata("gpu_temp", "value:GAUGE:U:U", True),
+ DCGM_FI_DEV_MEM_MAX_OP_TEMP:
+ CollectdMetadata("gpu_mem_max_op_temp", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_GPU_MAX_OP_TEMP:
+ CollectdMetadata("gpu_max_op_temp", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_SLOWDOWN_TEMP:
+ CollectdMetadata("slowdown_temp", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_SHUTDOWN_TEMP:
+ CollectdMetadata("shutdown_temp", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_MGMT_LIMIT:
+ CollectdMetadata("power_management_limit", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN:
+ CollectdMetadata("power_management_limit_min", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX:
+ CollectdMetadata("power_management_limit_max", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF:
+ CollectdMetadata("power_management_limit_default", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_USAGE:
+ CollectdMetadata("power_usage", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION:
+ CollectdMetadata("total_energy_consumption", "value:GAUGE:0:U",
+                     True),  # left as gauge since zeroed at driver reload
+ DCGM_FI_DEV_ENFORCED_POWER_LIMIT:
+ CollectdMetadata("enforced_power_limit", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PSTATE:
+ CollectdMetadata("pstate", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_FAN_SPEED:
+ CollectdMetadata("fan_speed", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_COMPUTE_MODE:
+ CollectdMetadata("compute_mode", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PERSISTENCE_MODE:
+ CollectdMetadata("persistance_mode", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_MIG_MODE:
+ CollectdMetadata("mig_mode", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR:
+ None,
+ DCGM_FI_DEV_MIG_MAX_SLICES:
+ CollectdMetadata("mig_max_slices", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_CURRENT:
+ CollectdMetadata("ecc", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_PENDING:
+ CollectdMetadata("ecc_pending", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_VOL_TOTAL:
+ CollectdMetadata("ecc_sbe_volatile_total", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_ECC_DBE_VOL_TOTAL:
+ CollectdMetadata("ecc_dbe_volatile_total", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_ECC_SBE_AGG_TOTAL:
+ CollectdMetadata("ecc_sbe_aggregate_total", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_ECC_DBE_AGG_TOTAL:
+ CollectdMetadata("ecc_dbe_aggregate_total", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_ECC_SBE_VOL_L1:
+ CollectdMetadata("ecc_sbe_volatile_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_VOL_L1:
+ CollectdMetadata("ecc_dbe_volatile_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_VOL_L2:
+ CollectdMetadata("ecc_sbe_volatile_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_VOL_L2:
+ CollectdMetadata("ecc_dbe_volatile_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_VOL_DEV:
+ CollectdMetadata("ecc_sbe_volatile_device", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_VOL_DEV:
+ CollectdMetadata("ecc_dbe_volatile_device", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_VOL_REG:
+ CollectdMetadata("ecc_sbe_volatile_register", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_VOL_REG:
+ CollectdMetadata("ecc_dbe_volatile_register", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_VOL_TEX:
+ CollectdMetadata("ecc_sbe_volatile_texture", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_VOL_TEX:
+ CollectdMetadata("ecc_dbe_volatile_texture", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_AGG_L1:
+ CollectdMetadata("ecc_sbe_aggregate_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_AGG_L1:
+ CollectdMetadata("ecc_dbe_aggregate_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_AGG_L2:
+ CollectdMetadata("ecc_sbe_aggregate_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_AGG_L2:
+ CollectdMetadata("ecc_dbe_aggregate_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_AGG_DEV:
+ CollectdMetadata("ecc_sbe_aggregate_device", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_AGG_DEV:
+ CollectdMetadata("ecc_dbe_aggregate_device", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_AGG_REG:
+ CollectdMetadata("ecc_sbe_aggregate_register", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_AGG_REG:
+ CollectdMetadata("ecc_dbe_aggregate_register", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_SBE_AGG_TEX:
+ CollectdMetadata("ecc_sbe_aggregate_texture", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ECC_DBE_AGG_TEX:
+ CollectdMetadata("ecc_dbe_aggregate_texture", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_GPU_UTIL:
+ CollectdMetadata("gpu_utilization", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_DEV_MEM_COPY_UTIL:
+ CollectdMetadata("mem_copy_utilization", "value:GAUGE:0:100", True),
+ DCGM_FI_DEV_ENC_UTIL:
+ CollectdMetadata("enc_utilization", "value:GAUGE:0:100"),
+ DCGM_FI_DEV_DEC_UTIL:
+ CollectdMetadata("dec_utilization", "value:GAUGE:0:100"),
+ DCGM_FI_DEV_VBIOS_VERSION:
+ None,
+ DCGM_FI_DEV_BAR1_TOTAL:
+ CollectdMetadata("bar1_total", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_BAR1_USED:
+ CollectdMetadata("bar1_used", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_BAR1_FREE:
+ CollectdMetadata("bar1_free", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_FB_TOTAL:
+ CollectdMetadata("fb_total", "value:GAUGE:0.0:U", True),
+ DCGM_FI_DEV_FB_FREE:
+ CollectdMetadata("fb_free", "value:GAUGE:0.0:U", True),
+ DCGM_FI_DEV_FB_USED:
+ CollectdMetadata("fb_used", "value:GAUGE:0.0:U", True),
+ DCGM_FI_DEV_FB_RESERVED:
+ CollectdMetadata("fb_resv", "value:GAUGE:0.0:U", True),
+ DCGM_FI_DEV_VIRTUAL_MODE:
+ CollectdMetadata("virtualization_mode", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_INSTANCE_IDS:
+ None,
+ DCGM_FI_DEV_VGPU_UTILIZATIONS:
+ None,
+ DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION:
+ None,
+ DCGM_FI_DEV_VGPU_VM_ID:
+ None,
+ DCGM_FI_DEV_VGPU_VM_NAME:
+ None,
+ DCGM_FI_DEV_VGPU_TYPE:
+ CollectdMetadata("vgpu_instance_type", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_UUID:
+ None,
+ DCGM_FI_DEV_VGPU_DRIVER_VERSION:
+ None,
+ DCGM_FI_DEV_VGPU_MEMORY_USAGE:
+ CollectdMetadata("vgpu_instance_memory_usage", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE:
+ CollectdMetadata("vgpu_instance_license_state", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_LICENSE_STATUS:
+ CollectdMetadata("vgpu_instance_license_status", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT:
+ CollectdMetadata("vgpu_instance_frame_rate_limit", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_PCI_ID:
+ CollectdMetadata("vgpu_instance_pci_id", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_VGPU_ENC_STATS:
+ None,
+ DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO:
+ None,
+ DCGM_FI_DEV_VGPU_FBC_STATS:
+ None,
+ DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO:
+ None,
+ DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID:
+ None,
+ DCGM_FI_DEV_SUPPORTED_TYPE_INFO:
+ None,
+ DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS:
+ None,
+ DCGM_FI_DEV_VGPU_TYPE_INFO:
+ None,
+ DCGM_FI_DEV_VGPU_TYPE_NAME:
+ None,
+ DCGM_FI_DEV_VGPU_TYPE_CLASS:
+ None,
+ DCGM_FI_DEV_VGPU_TYPE_LICENSE:
+ None,
+ DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS:
+ None,
+ DCGM_FI_DEV_ENC_STATS:
+ None,
+ DCGM_FI_DEV_FBC_STATS:
+ None,
+ DCGM_FI_DEV_FBC_SESSIONS_INFO:
+ None,
+ DCGM_FI_DEV_ACCOUNTING_DATA:
+ None,
+ DCGM_FI_DEV_RETIRED_SBE:
+ CollectdMetadata("retired_pages_sbe", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_RETIRED_DBE:
+ CollectdMetadata("retired_pages_dbe", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_GRAPHICS_PIDS:
+ None,
+ DCGM_FI_DEV_COMPUTE_PIDS:
+ None,
+ DCGM_FI_DEV_SUPPORTED_CLOCKS:
+ None,
+ DCGM_FI_SYNC_BOOST:
+ None,
+ DCGM_FI_DEV_RETIRED_PENDING:
+ CollectdMetadata("retired_pages_pending", "value:GAUGE:0:1",
+ True), # boolean 1 = yes, 0 = no
+ DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS:
+ CollectdMetadata("uncorrectable_remapped_rows", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS:
+ CollectdMetadata("correctable_remapped_rows", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ROW_REMAP_FAILURE:
+ CollectdMetadata("row_remap_failure", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_ROW_REMAP_PENDING:
+ CollectdMetadata("row_remap_pending", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_INFOROM_CONFIG_VALID:
+ CollectdMetadata("inforom_config_valid", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_XID_ERRORS:
+ CollectdMetadata("xid_errors", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_PCIE_MAX_LINK_GEN:
+ CollectdMetadata("pcie_max_link_gen", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH:
+ CollectdMetadata("pcie_max_link_width", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCIE_LINK_GEN:
+ CollectdMetadata("pcie_link_gen", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_PCIE_LINK_WIDTH:
+ CollectdMetadata("pcie_link_width", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_POWER_VIOLATION:
+ CollectdMetadata("power_violation", "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_THERMAL_VIOLATION:
+ CollectdMetadata("thermal_violation", "value:COUNTER:0:U", True),
+ DCGM_FI_GPU_TOPOLOGY_PCI:
+ None,
+ DCGM_FI_GPU_TOPOLOGY_NVLINK:
+ None,
+ DCGM_FI_GPU_TOPOLOGY_AFFINITY:
+ None,
+ DCGM_FI_DEV_SYNC_BOOST_VIOLATION:
+ CollectdMetadata("sync_boost_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_BOARD_LIMIT_VIOLATION:
+ CollectdMetadata("board_limit_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_LOW_UTIL_VIOLATION:
+ CollectdMetadata("low_util_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_RELIABILITY_VIOLATION:
+ CollectdMetadata("reliability_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION:
+ CollectdMetadata("app_clock_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION:
+ CollectdMetadata("base_clock_violation", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES:
+ CollectdMetadata("mem_util_samples", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_GPU_UTIL_SAMPLES:
+ CollectdMetadata("gpu_util_samples", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0:
+ CollectdMetadata("nvlink_flit_crc_error_count_l0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1:
+ CollectdMetadata("nvlink_flit_crc_error_count_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2:
+ CollectdMetadata("nvlink_flit_crc_error_count_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3:
+ CollectdMetadata("nvlink_flit_crc_error_count_l3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4:
+ CollectdMetadata("nvlink_flit_crc_error_count_l4", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5:
+ CollectdMetadata("nvlink_flit_crc_error_count_l5", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL:
+ CollectdMetadata("nvlink_flit_crc_error_count_total",
+ "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0:
+ CollectdMetadata("nvlink_data_crc_error_count_l0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1:
+ CollectdMetadata("nvlink_data_crc_error_count_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2:
+ CollectdMetadata("nvlink_data_crc_error_count_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3:
+ CollectdMetadata("nvlink_data_crc_error_count_l3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4:
+ CollectdMetadata("nvlink_data_crc_error_count_l4", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5:
+ CollectdMetadata("nvlink_data_crc_error_count_l5", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL:
+ CollectdMetadata("nvlink_data_crc_error_count_total",
+ "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0:
+ CollectdMetadata("nvlink_replay_error_count_l0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1:
+ CollectdMetadata("nvlink_replay_error_count_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2:
+ CollectdMetadata("nvlink_replay_error_count_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3:
+ CollectdMetadata("nvlink_replay_error_count_l3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4:
+ CollectdMetadata("nvlink_replay_error_count_l4", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5:
+ CollectdMetadata("nvlink_replay_error_count_l5", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL:
+ CollectdMetadata("nvlink_replay_error_count_total", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0:
+ CollectdMetadata("nvlink_recovery_error_count_l0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1:
+ CollectdMetadata("nvlink_recovery_error_count_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2:
+ CollectdMetadata("nvlink_recovery_error_count_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3:
+ CollectdMetadata("nvlink_recovery_error_count_l3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4:
+ CollectdMetadata("nvlink_recovery_error_count_l4", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5:
+ CollectdMetadata("nvlink_recovery_error_count_l5", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL:
+ CollectdMetadata("nvlink_recovery_error_count_total",
+ "value:COUNTER:0:U", True),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L0:
+ CollectdMetadata("nvlink_bandwidth_l0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L1:
+ CollectdMetadata("nvlink_bandwidth_l1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L2:
+ CollectdMetadata("nvlink_bandwidth_l2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L3:
+ CollectdMetadata("nvlink_bandwidth_l3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L4:
+ CollectdMetadata("nvlink_bandwidth_l4", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L5:
+ CollectdMetadata("nvlink_bandwidth_l5", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL:
+ CollectdMetadata("nvlink_bandwidth_total", "value:GAUGE:0:U", True),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6:
+ CollectdMetadata("nvlink_flit_crc_error_count_l6", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7:
+ CollectdMetadata("nvlink_flit_crc_error_count_l7", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8:
+ CollectdMetadata("nvlink_flit_crc_error_count_l8", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9:
+ CollectdMetadata("nvlink_flit_crc_error_count_l9", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10:
+ CollectdMetadata("nvlink_flit_crc_error_count_l10", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11:
+ CollectdMetadata("nvlink_flit_crc_error_count_l11", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6:
+ CollectdMetadata("nvlink_data_crc_error_count_l6", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7:
+ CollectdMetadata("nvlink_data_crc_error_count_l7", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8:
+ CollectdMetadata("nvlink_data_crc_error_count_l8", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9:
+ CollectdMetadata("nvlink_data_crc_error_count_l9", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10:
+ CollectdMetadata("nvlink_data_crc_error_count_l10", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11:
+ CollectdMetadata("nvlink_data_crc_error_count_l11", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6:
+ CollectdMetadata("nvlink_replay_error_count_l6", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7:
+ CollectdMetadata("nvlink_replay_error_count_l7", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8:
+ CollectdMetadata("nvlink_replay_error_count_l8", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9:
+ CollectdMetadata("nvlink_replay_error_count_l9", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10:
+ CollectdMetadata("nvlink_replay_error_count_l10", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11:
+ CollectdMetadata("nvlink_replay_error_count_l11", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6:
+ CollectdMetadata("nvlink_recovery_error_count_l6", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7:
+ CollectdMetadata("nvlink_recovery_error_count_l7", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8:
+ CollectdMetadata("nvlink_recovery_error_count_l8", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9:
+ CollectdMetadata("nvlink_recovery_error_count_l9", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10:
+ CollectdMetadata("nvlink_recovery_error_count_l10", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11:
+ CollectdMetadata("nvlink_recovery_error_count_l11", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L6:
+ CollectdMetadata("nvlink_bandwidth_l6", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L7:
+ CollectdMetadata("nvlink_bandwidth_l7", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L8:
+ CollectdMetadata("nvlink_bandwidth_l8", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L9:
+ CollectdMetadata("nvlink_bandwidth_l9", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L10:
+ CollectdMetadata("nvlink_bandwidth_l10", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L11:
+ CollectdMetadata("nvlink_bandwidth_l11", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_MEMORY_TEMP:
+ CollectdMetadata("memory_temp", "value:GAUGE:U:U", True),
+ DCGM_FI_DEV_GPU_NVLINK_ERRORS:
+ CollectdMetadata("gpu_nvlink_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX:
+ CollectdMetadata("nvswitch_link_bandwidth_tx", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX:
+ CollectdMetadata("nvswitch_link_bandwidth_rx", "value:GAUGE:U:U"),
+    DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS:
+    CollectdMetadata("nvswitch_link_non_fatal_errors", "value:GAUGE:U:U"),
+    DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS:
+    CollectdMetadata("nvswitch_link_replay_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS:
+ CollectdMetadata("nvswitch_link_recovery_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS:
+ CollectdMetadata("nvswitch_link_flit_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS:
+ CollectdMetadata("nvswitch_link_crc_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS:
+ CollectdMetadata("nvswitch_link_ecc_errors", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0:
+ CollectdMetadata("nvswitch_link_latency_low_vc0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1:
+ CollectdMetadata("nvswitch_link_latency_low_vc1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2:
+ CollectdMetadata("nvswitch_link_latency_low_vc2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3:
+ CollectdMetadata("nvswitch_link_latency_low_vc3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0:
+ CollectdMetadata("nvswitch_link_latency_medium_vc0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1:
+ CollectdMetadata("nvswitch_link_latency_medium_vc1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2:
+ CollectdMetadata("nvswitch_link_latency_medium_vc2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3:
+ CollectdMetadata("nvswitch_link_latency_medium_vc3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0:
+ CollectdMetadata("nvswitch_link_latency_high_vc0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1:
+ CollectdMetadata("nvswitch_link_latency_high_vc1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2:
+ CollectdMetadata("nvswitch_link_latency_high_vc2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3:
+ CollectdMetadata("nvswitch_link_latency_high_vc3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0:
+ CollectdMetadata("nvswitch_link_latency_panic_vc0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1:
+ CollectdMetadata("nvswitch_link_latency_panic_vc1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2:
+ CollectdMetadata("nvswitch_link_latency_panic_vc2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3:
+ CollectdMetadata("nvswitch_link_latency_panic_vc3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0:
+ CollectdMetadata("nvswitch_link_latency_count_vc0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1:
+ CollectdMetadata("nvswitch_link_latency_count_vc1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2:
+ CollectdMetadata("nvswitch_link_latency_count_vc2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3:
+ CollectdMetadata("nvswitch_link_latency_count_vc3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0:
+ CollectdMetadata("nvswitch_link_crc_errors_lane0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1:
+ CollectdMetadata("nvswitch_link_crc_errors_lane1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2:
+ CollectdMetadata("nvswitch_link_crc_errors_lane2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3:
+ CollectdMetadata("nvswitch_link_crc_errors_lane3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0:
+ CollectdMetadata("nvswitch_link_ecc_errors_lane0", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1:
+ CollectdMetadata("nvswitch_link_ecc_errors_lane1", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2:
+ CollectdMetadata("nvswitch_link_ecc_errors_lane2", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3:
+ CollectdMetadata("nvswitch_link_ecc_errors_lane3", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS:
+ CollectdMetadata("nvswitch_fatal_error", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS:
+ CollectdMetadata("nvswitch_non_fatal_error", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT:
+ CollectdMetadata("nvswitch_temperature_current", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN:
+ CollectdMetadata("nvswitch_temperature_limit_slowdown",
+ "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN:
+ CollectdMetadata("nvswitch_temperature_limit_shutdown",
+ "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX:
+ CollectdMetadata("nvswitch_throughput_tx", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX:
+ CollectdMetadata("nvswitch_throughput_rx", "value:GAUGE:U:U"),
+ DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY:
+ CollectdMetadata("cuda_compute_capability", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_GR_ENGINE_ACTIVE:
+ CollectdMetadata("gr_engine_active", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_PROF_SM_ACTIVE:
+ CollectdMetadata("sm_active", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_PROF_SM_OCCUPANCY:
+ CollectdMetadata("sm_occupancy", "value:GAUGE:0:U", True),
+ DCGM_FI_PROF_PIPE_TENSOR_ACTIVE:
+ CollectdMetadata("tensor_active", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_PROF_DRAM_ACTIVE:
+ CollectdMetadata("dram_active", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_PROF_PIPE_FP64_ACTIVE:
+ CollectdMetadata("fp64_active", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_PIPE_FP32_ACTIVE:
+ CollectdMetadata("fp32_active", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_PIPE_FP16_ACTIVE:
+ CollectdMetadata("fp16_active", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_PCIE_TX_BYTES:
+ CollectdMetadata("pcie_tx_bytes", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_PCIE_RX_BYTES:
+ CollectdMetadata("pcie_rx_bytes", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_NVLINK_TX_BYTES:
+ CollectdMetadata("nvlink_tx_bytes", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_NVLINK_RX_BYTES:
+ CollectdMetadata("nvlink_rx_bytes", "value:GAUGE:U:U"),
+ DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE:
+ CollectdMetadata("tensor_imma_active", "value:GAUGE:0.0:1.0", True),
+ DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE:
+ CollectdMetadata("tensor_hmma_active", "value:GAUGE:0.0:1.0", True),
+}
+
+__fieldDict = None
+
+
+def GenerateCollectdTypesDB():
+ length = max(
+ map(lambda x: len(x.name) if x else 0, CollectdMetadataDict.values()))
+
+ fmt = "{0:<" + str(length) + "}"
+ fail = False
+
+ for item in filter(None, CollectdMetadataDict.values()):
+ item_list = item.kind.split(':')
+
+ # Some rudimentary syntax checking.
+
+ if len(item_list) != 4:
+ sys.stderr.write(
+ 'Item ' + item.name +
+ ' has wrong number of collectd type fields - four required.\n')
+ fail = True
+
+ if item_list[1] not in ['GAUGE', 'COUNTER', 'DERIVE', 'ABSOLUTE']:
+ sys.stderr.write(
+ 'Item ' + item.name +
+ ' should be one of GAUGE, COUNTER, DERIVE, ABSOLUTE.\n')
+ fail = True
+
+ # We check this so we can enumerate all dcgm fields for possible
+ # inclusion, even if some are not (yet) formally supported.
+
+ if item.used:
+ print(fmt.format(item.name), item.kind)
+
+ if fail:
+ exit("Failed on db.types table syntax errors.\n")
+
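+# When run as a mainline, GenerateCollectdTypesDB() prints one types.db record
+# per entry marked used=True, e.g. (illustrative output, padding varies):
+#   gpu_temp                              value:GAUGE:U:U
+#   sm_clock                              value:GAUGE:0:U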
+
+def GetFieldByName(name):
+ global __fieldDict
+
+ if name.isnumeric():
+ return int(name)
+
+ if __fieldDict == None:
+ __fieldDict = {}
+
+ for key in CollectdMetadataDict:
+ item = CollectdMetadataDict[key]
+
+ if item != None:
+ __fieldDict[item.name] = key
+
+ if name not in __fieldDict.keys():
+ return -1
+
+ return __fieldDict[name]
+
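+# Usage sketch (illustrative): GetFieldByName("gpu_temp") returns the DCGM
+# field ID registered above for that collectd name, a purely numeric string is
+# returned as int(name), and unknown names return -1.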
+
+if __name__ == '__main__':
+ GenerateCollectdTypesDB()
diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_internal.py b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py
new file mode 100644
index 000000000..9502c959a
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##
+# Python bindings for the internal API of DCGM library (dcgm_fields_internal.hpp)
+##
+
+from ctypes import *
+from ctypes.util import find_library
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+
+# Provides access to functions
+dcgmFP = dcgm_structs._dcgmGetFunctionPointer
+
+#internal-only fields
+DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 #Memory utilization samples
+DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 #SM utilization samples
+DCGM_FI_DEV_GRAPHICS_PIDS = 220 #Graphics processes running on the GPU.
+DCGM_FI_DEV_COMPUTE_PIDS = 221 #Compute processes running on the GPU.
diff --git a/model_analyzer/monitor/dcgm/dcgm_fluentd.py b/model_analyzer/monitor/dcgm/dcgm_fluentd.py
new file mode 100644
index 000000000..24a345100
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_fluentd.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from model_analyzer.monitor.dcgm.common.dcgm_client_main import main
+from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader
+from socket import socket, AF_INET, SOCK_DGRAM
+
+# Displayed to the user
+FLUENTD_NAME = 'Fluentd'
+DEFAULT_FLUENTD_PORT = 24225
+
+# Fluentd Configuration
+# =====================
+# In order to use this client, Fluentd needs to accept json over udp.
+# The default port is 24225
+
+
+class DcgmFluentd(DcgmJsonReader):
+ ###########################################################################
+ def __init__(self, publish_hostname, publish_port, **kwargs):
+ self.m_sock = socket(AF_INET, SOCK_DGRAM)
+ self.m_dest = (publish_hostname, publish_port)
+ super(DcgmFluentd, self).__init__(**kwargs)
+
+ ###########################################################################
+ def SendToFluentd(self, payload):
+ self.m_sock.sendto(payload, self.m_dest)
+
+ ###########################################################################
+ def CustomJsonHandler(self, outJson):
+ self.SendToFluentd(outJson)
+
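+# Minimal local test sketch (illustrative, not part of this module): a plain
+# UDP listener is enough to inspect the JSON payloads DcgmFluentd sends, e.g.
+#   from socket import socket, AF_INET, SOCK_DGRAM
+#   s = socket(AF_INET, SOCK_DGRAM)
+#   s.bind(('127.0.0.1', DEFAULT_FLUENTD_PORT))
+#   data, _ = s.recvfrom(65535)  # each sendto() arrives as one datagram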
+
+if __name__ == '__main__': # pragma: no cover
+ main(DcgmFluentd, FLUENTD_NAME, DEFAULT_FLUENTD_PORT, add_target_host=True)
diff --git a/model_analyzer/monitor/dcgm/dcgm_prometheus.py b/model_analyzer/monitor/dcgm/dcgm_prometheus.py
new file mode 100644
index 000000000..f6f69a613
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_prometheus.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import time
+import logging
+import os
+import argparse
+import sys
+import signal
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
+sys.path.insert(0, parent_dir_path)
+
+from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
+from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli
+
+if 'DCGM_TESTING_FRAMEWORK' in os.environ:
+ try:
+ from prometheus_tester_api import start_http_server, Gauge
+ except:
+ logging.critical(
+ "prometheus_tester_api missing, reinstall test framework.")
+ sys.exit(3)
+else:
+ try:
+ from prometheus_client import start_http_server, Gauge
+    except ImportError:
+        logging.critical(
+            "prometheus_client not installed, please run: \"pip install prometheus_client\""
+        )
+        sys.exit(3)
+
+DEFAULT_FIELDS = [
+ dcgm_fields.DCGM_FI_DEV_PCI_BUSID, #Needed for plugin_instance
+ dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
+ dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
+ dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
+ dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
+ dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_FB_FREE,
+ dcgm_fields.DCGM_FI_DEV_FB_USED,
+ dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
+ dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
+ dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
+]
+
+
+class DcgmPrometheus(DcgmReader):
+ ###########################################################################
+ def __init__(self):
+ #Have DCGM update its watches twice as fast as our update interval so we don't get out of phase by our update interval
+ updateIntervalUsec = int(
+ (1000000 * g_settings['prometheusPublishInterval']) / 2)
+ #Add our PID to our field group name so we can have multiple instances running
+ fieldGroupName = 'dcgm_prometheus_' + str(os.getpid())
+
+ DcgmReader.__init__(self,
+ ignoreList=g_settings['ignoreList'],
+ fieldIds=g_settings['publishFieldIds'],
+ updateFrequency=updateIntervalUsec,
+ fieldGroupName=fieldGroupName,
+ hostname=g_settings['dcgmHostName'])
+ self.m_existingGauge = {}
+
+ ###########################################################################
+ '''
+ This function is implemented from the base class : DcgmReader. It converts each
+ field / value from the fvs dictionary to a gauge and publishes the gauge to the
+ prometheus client server.
+
+ @params:
+ fvs : The fieldvalue dictionary that contains info about the values of field Ids for each gpuId.
+ '''
+
+ def CustomDataHandler(self, fvs):
+ if not self.m_existingGauge:
+ self.SetupGauges()
+
+ for _, fieldIds in self.m_publishFields.items():
+ if fieldIds is None:
+ continue
+
+ for fieldId in fieldIds:
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ g = self.m_existingGauge[fieldId]
+
+ for gpuId in list(fvs.keys()):
+ gpuFv = fvs[gpuId]
+ val = gpuFv[fieldId][-1]
+
+ #Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId
+ if val.isBlank:
+ continue
+
+ gpuUuid = self.m_gpuIdToUUId[gpuId]
+ gpuBusId = self.m_gpuIdToBusId[gpuId]
+ gpuUniqueId = gpuUuid if g_settings['sendUuid'] else gpuBusId
+
+ # pylint doesn't find the labels member for Gauge, but it exists. Ignore the warning
+ g.labels(gpuId, gpuUniqueId).set(val.value) # pylint: disable=no-member
+
+ logging.debug(
+ 'Sent GPU %d %s %s = %s' %
+ (gpuId, gpuUniqueId, self.m_fieldIdToInfo[fieldId].tag,
+ str(val.value)))
+
+ ###############################################################################
+ '''
+ NOTE: even though some fields are monotonically increasing and therefore fit the mold to be
+ counters, all are published as gauges so that DCGM is the sole authority on the state of the
+ system, preventing problems around down times, driver reboots, and the unlikely event of
+ flashing the inforom.
+ For specific information about which fields monotonically increase, see the API guide or
+ dcgm_fields.h
+ '''
+
+ def SetupGauges(self):
+ for _, fieldIds in self.m_publishFields.items():
+ if fieldIds is None:
+ continue
+
+ for fieldId in fieldIds:
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ uniqueIdName = 'GpuUuid' if g_settings[
+ 'sendUuid'] else 'GpuBusID'
+
+ fieldTag = self.m_fieldIdToInfo[fieldId].tag
+ self.m_existingGauge[fieldId] = Gauge("dcgm_" + fieldTag,
+ 'DCGM_PROMETHEUS',
+ ['GpuID', uniqueIdName])
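+
+    # Illustrative note (assumption, not from the DCGM docs): with the naming above,
+    # a field whose tag is "power_usage" would be exported as the Prometheus metric
+    # "dcgm_power_usage" with labels GpuID and GpuUuid or GpuBusID, e.g.
+    #   dcgm_power_usage{GpuID="0", GpuBusID="00000000:3B:00.0"} 62.5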
+
+ ###############################################################################
+ '''
+ Scrape the fieldvalue data and publish. This function calls the process function of
+ the base class DcgmReader.
+ '''
+
+ def Scrape(self, data=None):
+ return self.Process()
+
+ ###############################################################################
+ def LogBasicInformation(self):
+ # Reconnect causes everything to get initialized
+ self.Reconnect()
+
+ logging.info('Started prometheus client')
+
+ fieldTagList = ''
+
+ for _, fieldIds in self.m_publishFields.items():
+ if fieldIds is None:
+ continue
+
+ for fieldId in fieldIds:
+ if fieldId in self.m_dcgmIgnoreFields:
+ continue
+
+ if fieldTagList == '':
+ fieldTagList = self.m_fieldIdToInfo[fieldId].tag
+ else:
+ fieldTagList = fieldTagList + ", %s" % (
+ self.m_fieldIdToInfo[fieldId].tag)
+
+ logging.info("Publishing fields: '%s'" % (fieldTagList))
+
+ ###############################################################################
+ def LogError(self, msg):
+ logging.error(msg)
+
+ ###############################################################################
+ def LogInfo(self, msg):
+ logging.info(msg)
+
+
+###############################################################################
+def exit_handler(signum, frame):
+ g_settings['shouldExit'] = True
+
+
+###############################################################################
+def main_loop(prometheus_obj, publish_interval):
+ try:
+ while True:
+ prometheus_obj.Scrape(prometheus_obj)
+ time.sleep(publish_interval)
+
+            if g_settings['shouldExit']:
+ prometheus_obj.LogInfo('Received a signal...shutting down')
+ break
+ except KeyboardInterrupt:
+ print("Caught CTRL-C. Exiting")
+
+
+###############################################################################
+def initialize_globals():
+ '''
+    Initialize the global settings dictionary used by this module.
+ '''
+ global g_settings
+ g_settings = {}
+
+ g_settings['shouldExit'] = False
+ '''
+ List of the ids that are present in g_settings['publishFieldIds'] but ignored for watch.
+ '''
+ g_settings['ignoreList'] = [
+ dcgm_fields.DCGM_FI_DEV_PCI_BUSID,
+ ]
+ '''
+    These are initialized by the CLI parser; they are listed here only for clarity.
+ '''
+ for key in [
+ 'dcgmHostName',
+ 'prometheusPort',
+ 'prometheusPublishInterval',
+ 'publishFieldIds',
+ ]:
+ g_settings[key] = None
+
+
+###############################################################################
+def parse_command_line():
+ parser = cli.create_parser(
+ name='Prometheus',
+ field_ids=DEFAULT_FIELDS,
+ )
+
+ cli.add_custom_argument(parser,
+ '--send-uuid',
+ dest='send_uuid',
+ default=False,
+ action='store_true',
+ help='Send GPU UUID instead of bus id')
+
+ args = cli.run_parser(parser)
+ field_ids = cli.get_field_ids(args)
+ numeric_log_level = cli.get_log_level(args)
+
+    # The hostname argument defaults to localhost; embedded mode requires None instead
+ if args.embedded:
+ g_settings['dcgmHostName'] = None
+ else:
+ g_settings['dcgmHostName'] = args.hostname
+
+ g_settings['prometheusPort'] = args.publish_port
+
+ g_settings['prometheusPublishInterval'] = args.interval
+
+ logfile = args.logfile
+
+ g_settings['publishFieldIds'] = field_ids
+
+ g_settings['sendUuid'] = args.send_uuid
+
+    if logfile is not None:
+ logging.basicConfig(level=numeric_log_level,
+ filename=logfile,
+ filemode='w+',
+ format='%(asctime)s %(levelname)s: %(message)s')
+ else:
+ logging.basicConfig(level=numeric_log_level,
+ stream=sys.stdout,
+ filemode='w+',
+ format='%(asctime)s %(levelname)s: %(message)s')
+
+
+###############################################################################
+def initialize_signal_handlers():
+ signal.signal(signal.SIGINT, exit_handler)
+ signal.signal(signal.SIGTERM, exit_handler)
+
+
+###############################################################################
+def main():
+ initialize_globals()
+
+ initialize_signal_handlers()
+
+ parse_command_line()
+
+ prometheus_obj = DcgmPrometheus()
+
+ logging.info("Starting Prometheus server on port " +
+ str(g_settings['prometheusPort']))
+
+ #start prometheus client server.
+ start_http_server(g_settings['prometheusPort'])
+
+ prometheus_obj.LogBasicInformation()
+
+ main_loop(prometheus_obj, g_settings['prometheusPublishInterval'])
+
+ prometheus_obj.Shutdown()
+
+
+if __name__ == '__main__':
+ main()
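+
+
+# Behaviour sketch (derived from the code above, for orientation only): running this
+# module directly starts a Prometheus HTTP endpoint on g_settings['prometheusPort'],
+# registers one gauge per published DCGM field, and re-scrapes every
+# g_settings['prometheusPublishInterval'] seconds until SIGINT/SIGTERM is received.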
diff --git a/model_analyzer/monitor/dcgm/dcgm_structs.py b/model_analyzer/monitor/dcgm/dcgm_structs.py
index e401c4181..233d15564 100755
--- a/model_analyzer/monitor/dcgm/dcgm_structs.py
+++ b/model_analyzer/monitor/dcgm/dcgm_structs.py
@@ -1,6 +1,4 @@
-#!/usr/bin/env python3
-
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,49 +11,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+##
+# Python bindings for "dcgm_structs.h"
+##
-import json
-import os
-import platform
-import string
+from ctypes import *
+from ctypes.util import find_library
import sys
+import os
import threading
-from ctypes import (
- CDLL,
- POINTER,
- Array,
- Structure,
- Union,
- c_bool,
- c_byte,
- c_char,
- c_char_p,
- c_double,
- c_int,
- c_int32,
- c_int64,
- c_longlong,
- c_short,
- c_uint,
- c_uint16,
- c_uint32,
- c_uint64,
- c_ulong,
- c_ushort,
- c_void_p,
- sizeof,
-)
-
-import distro
-
-import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue
+import string
+import json
+import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
+import platform
+from inspect import isclass
DCGM_MAX_STR_LENGTH = 256
DCGM_MAX_NUM_DEVICES = 32 # DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16
DCGM_MAX_NUM_SWITCHES = 12
-DCGM_NVLINK_MAX_LINKS_PER_GPU = 12
+DCGM_NVLINK_MAX_LINKS_PER_GPU = 18
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 = 6
-DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 36
+DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 = 12
+DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 = 36 # Max NvLinks per NvSwitch pre-Hopper
+DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 64
+DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK = 4
DCGM_MAX_CLOCKS = 256
DCGM_MAX_NUM_GROUPS = 64
DCGM_MAX_BLOB_LENGTH = 4096
@@ -66,8 +45,7 @@
DCGM_DEVICE_UUID_BUFFER_SIZE = 80
DCGM_MAX_FBC_SESSIONS = 256
-# When more than one value is returned from a query, which order should it be
-# returned in?
+#When more than one value is returned from a query, which order should it be returned in?
DCGM_ORDER_ASCENDING = 1
DCGM_ORDER_DESCENDING = 2
@@ -83,125 +61,72 @@
DCGM_FBC_SESSION_TYPE_VID = 3 # FB capture for a Vid buffer
DCGM_FBC_SESSION_TYPE_HWENC = 4 # FB capture for a NVENC HW buffer
-# C Type mappings #
-# Enums
+## C Type mappings ##
+## Enums
# Return types
_dcgmReturn_t = c_uint
-# Success
-DCGM_ST_OK = 0
-# A bad parameter was passed to a function
-DCGM_ST_BADPARAM = -1
-# A generic, unspecified error
-DCGM_ST_GENERIC_ERROR = -3
-# An out of memory error occurred
-DCGM_ST_MEMORY = -4
-# Setting not configured
-DCGM_ST_NOT_CONFIGURED = -5
-# Feature not supported
-DCGM_ST_NOT_SUPPORTED = -6
-# DCGM Init error
-DCGM_ST_INIT_ERROR = -7
-# When NVML returns error.
-DCGM_ST_NVML_ERROR = -8
-# Object is in pending state of something else
-DCGM_ST_PENDING = -9
-# Object is in undefined state
-DCGM_ST_UNINITIALIZED = -10
-# Requested operation timed out
-DCGM_ST_TIMEOUT = -11
-# Version mismatch between received and understood API
-DCGM_ST_VER_MISMATCH = -12
-# Unknown field id
-DCGM_ST_UNKNOWN_FIELD = -13
-# No data is available
-DCGM_ST_NO_DATA = -14
+DCGM_ST_OK = 0 # Success
+DCGM_ST_BADPARAM = -1 # A bad parameter was passed to a function
+DCGM_ST_GENERIC_ERROR = -3 # A generic, unspecified error
+DCGM_ST_MEMORY = -4  # An out of memory error occurred
+DCGM_ST_NOT_CONFIGURED = -5 # Setting not configured
+DCGM_ST_NOT_SUPPORTED = -6 # Feature not supported
+DCGM_ST_INIT_ERROR = -7 # DCGM Init error
+DCGM_ST_NVML_ERROR = -8 # When NVML returns error.
+DCGM_ST_PENDING = -9 # Object is in pending state of something else
+DCGM_ST_UNINITIALIZED = -10 # Object is in undefined state
+DCGM_ST_TIMEOUT = -11 # Requested operation timed out
+DCGM_ST_VER_MISMATCH = -12 # Version mismatch between received and understood API
+DCGM_ST_UNKNOWN_FIELD = -13 # Unknown field id
+DCGM_ST_NO_DATA = -14 # No data is available
DCGM_ST_STALE_DATA = -15
-# The given field is not being updated by the cache manager
-DCGM_ST_NOT_WATCHED = -16
-# We are not permissioned to perform the desired action
-DCGM_ST_NO_PERMISSION = -17
-# GPU is no longer reachable
-DCGM_ST_GPU_IS_LOST = -18
-# GPU requires a reset
-DCGM_ST_RESET_REQUIRED = -19
-# Unable to find function
-DCGM_ST_FUNCTION_NOT_FOUND = -20
-# Connection to the host engine is not valid any longer
-DCGM_ST_CONNECTION_NOT_VALID = -21
-# This GPU is not supported by DCGM
-DCGM_ST_GPU_NOT_SUPPORTED = -22
-# The GPUs of the provided group are not compatible with each other for the
-# requested operation
-DCGM_ST_GROUP_INCOMPATIBLE = -23
+DCGM_ST_NOT_WATCHED = -16 # The given field is not being updated by the cache manager
+DCGM_ST_NO_PERMISSION = -17 # We are not permissioned to perform the desired action
+DCGM_ST_GPU_IS_LOST = -18 # GPU is no longer reachable
+DCGM_ST_RESET_REQUIRED = -19 # GPU requires a reset
+DCGM_ST_FUNCTION_NOT_FOUND = -20 # Unable to find function
+DCGM_ST_CONNECTION_NOT_VALID = -21 # Connection to the host engine is not valid any longer
+DCGM_ST_GPU_NOT_SUPPORTED = -22 # This GPU is not supported by DCGM
+DCGM_ST_GROUP_INCOMPATIBLE = -23 # The GPUs of the provided group are not compatible with each other for the requested operation
DCGM_ST_MAX_LIMIT = -24
-# DCGM library could not be found
-DCGM_ST_LIBRARY_NOT_FOUND = -25
-# Duplicate key passed to the function
-DCGM_ST_DUPLICATE_KEY = -26
-# GPU is already a part of a sync boost group
-DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
-# GPU is a not a part of sync boost group
-DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
-# This operation cannot be performed when the host engine is running as
-# non-root
-DCGM_ST_REQUIRES_ROOT = -29
-# DCGM GPU Diagnostic was successfully executed, but reported an error.
-DCGM_ST_NVVS_ERROR = -30
-# An input argument is not large enough
-DCGM_ST_INSUFFICIENT_SIZE = -31
-# The given field ID is not supported by the API being called
-DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
-# This request is serviced by a module of DCGM that is not currently loaded
-DCGM_ST_MODULE_NOT_LOADED = -33
-# The requested operation could not be completed because the affected resource
-# is in use
-DCGM_ST_IN_USE = -34
-# The specified group is empty and this operation is not valid with an empty
-# group
-DCGM_ST_GROUP_IS_EMPTY = -35
-# Profiling is not supported for this group of GPUs or GPU
-DCGM_ST_PROFILING_NOT_SUPPORTED = -36
-# The third-party Profiling module returned an unrecoverable error
-DCGM_ST_PROFILING_LIBRARY_ERROR = -37
-# The requested profiling metrics cannot be collected in a single pass
-DCGM_ST_PROFILING_MULTI_PASS = -38
-# A diag instance is already running, cannot run a new diag until the current
-# one finishes.
-DCGM_ST_DIAG_ALREADY_RUNNING = -39
-# The DCGM GPU Diagnostic returned JSON that cannot be parsed
-DCGM_ST_DIAG_BAD_JSON = -40
-# Error while launching the DCGM GPU Diagnostic
-DCGM_ST_DIAG_BAD_LAUNCH = -41
-# There is too much variance while training the diagnostic
-DCGM_ST_DIAG_VARIANCE = -42
-# A field value met or exceeded the error threshold.
-DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
-# The installed driver version is insufficient for this API
-DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
-# The specified GPU instance does not exist
-DCGM_ST_INSTANCE_NOT_FOUND = -45
-# The specified GPU compute instance does not exist
-DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
-# Could not kill a child process within the retries
-DCGM_ST_CHILD_NOT_KILLED = -47
-# Detected an error in a 3rd-party library
-DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
-# Not enough resources available
-DCGM_ST_INSUFFICIENT_RESOURCES = -49
-
-# All the GPUs on the node are added to the group
-DCGM_GROUP_DEFAULT = 0
-# Creates an empty group
-DCGM_GROUP_EMPTY = 1
-# All NvSwitches of the node are added to the group
-DCGM_GROUP_DEFAULT_NVSWITCHES = 2
-# All GPU instances of the node are added to the group
-DCGM_GROUP_DEFAULT_INSTANCES = 3
-# All compute instances of the node are added to the group
-DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4
-# All entities are added to this default group
-DCGM_GROUP_DEFAULT_ENTITIES = 5
+DCGM_ST_LIBRARY_NOT_FOUND = -25 # DCGM library could not be found
+DCGM_ST_DUPLICATE_KEY = -26 #Duplicate key passed to the function
+DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 #GPU is already a part of a sync boost group
+DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 #GPU is a not a part of sync boost group
+DCGM_ST_REQUIRES_ROOT = -29 #This operation cannot be performed when the host engine is running as non-root
+DCGM_ST_NVVS_ERROR = -30 #DCGM GPU Diagnostic was successfully executed, but reported an error.
+DCGM_ST_INSUFFICIENT_SIZE = -31 #An input argument is not large enough
+DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 #The given field ID is not supported by the API being called
+DCGM_ST_MODULE_NOT_LOADED = -33 #This request is serviced by a module of DCGM that is not currently loaded
+DCGM_ST_IN_USE = -34 #The requested operation could not be completed because the affected resource is in use
+DCGM_ST_GROUP_IS_EMPTY = -35 # The specified group is empty and this operation is not valid with an empty group
+DCGM_ST_PROFILING_NOT_SUPPORTED = -36 # Profiling is not supported for this group of GPUs or GPU
+DCGM_ST_PROFILING_LIBRARY_ERROR = -37 # The third-party Profiling module returned an unrecoverable error
+DCGM_ST_PROFILING_MULTI_PASS = -38 # The requested profiling metrics cannot be collected in a single pass
+DCGM_ST_DIAG_ALREADY_RUNNING = -39 # A diag instance is already running, cannot run a new diag until the current one finishes.
+DCGM_ST_DIAG_BAD_JSON = -40 # The DCGM GPU Diagnostic returned JSON that cannot be parsed
+DCGM_ST_DIAG_BAD_LAUNCH = -41 # Error while launching the DCGM GPU Diagnostic
+DCGM_ST_DIAG_UNUSED = -42 # Unused
+DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 # A field value met or exceeded the error threshold.
+DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 # The installed driver version is insufficient for this API
+DCGM_ST_INSTANCE_NOT_FOUND = -45 # The specified GPU instance does not exist
+DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 # The specified GPU compute instance does not exist
+DCGM_ST_CHILD_NOT_KILLED = -47 # Couldn't kill a child process within the retries
+DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 # Detected an error in a 3rd-party library
+DCGM_ST_INSUFFICIENT_RESOURCES = -49 # Not enough resources available
+DCGM_ST_PLUGIN_EXCEPTION = -50 # Exception thrown from a diagnostic plugin
+DCGM_ST_NVVS_ISOLATE_ERROR = -51 # The diagnostic returned an error that indicates the need for isolation
+DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 # The NVVS binary was not found in the specified location
+DCGM_ST_NVVS_KILLED = -53 # The NVVS process was killed by a signal
+DCGM_ST_PAUSED = -54 # The hostengine and all modules are paused
+
+DCGM_GROUP_DEFAULT = 0 # All the GPUs on the node are added to the group
+DCGM_GROUP_EMPTY = 1 # Creates an empty group
+DCGM_GROUP_DEFAULT_NVSWITCHES = 2 # All NvSwitches of the node are added to the group
+DCGM_GROUP_DEFAULT_INSTANCES = 3 # All GPU instances of the node are added to the group
+DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4 # All compute instances of the node are added to the group
+DCGM_GROUP_DEFAULT_ENTITIES = 5 # All entities are added to this default group
DCGM_GROUP_ALL_GPUS = 0x7FFFFFFF
DCGM_GROUP_ALL_NVSWITCHES = 0x7FFFFFFE
@@ -209,26 +134,17 @@
DCGM_GROUP_ALL_COMPUTE_INSTANCES = 0x7FFFFFFC
DCGM_GROUP_ALL_ENTITIES = 0x7FFFFFFB
-# Maximum number of entities per entity group
-DCGM_GROUP_MAX_ENTITIES = 64
+DCGM_GROUP_MAX_ENTITIES = 64 #Maximum number of entities per entity group
-# The target configuration values to be applied
-DCGM_CONFIG_TARGET_STATE = 0
-# The current configuration state
-DCGM_CONFIG_CURRENT_STATE = 1
+DCGM_CONFIG_TARGET_STATE = 0 # The target configuration values to be applied
+DCGM_CONFIG_CURRENT_STATE = 1 # The current configuration state
-# Represents the power cap to be applied for each member of the group
-DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0
-# Represents the power budget for the entire group
-DCGM_CONFIG_POWER_BUDGET_GROUP = 1
+DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0 # Represents the power cap to be applied for each member of the group
+DCGM_CONFIG_POWER_BUDGET_GROUP = 1 # Represents the power budget for the entire group
-# Default compute mode -- multiple contexts per device
-DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0
-# Compute-prohibited mode -- no contexts per device
-DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1
-# Compute-exclusive-process mode -- only one context per device, usable from
-# multiple threads at a time
-DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2
+DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0 # Default compute mode -- multiple contexts per device
+DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1 # Compute-prohibited mode -- no contexts per device
+DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2  # Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
DCGM_TOPOLOGY_BOARD = 0x1
DCGM_TOPOLOGY_SINGLE = 0x2
@@ -249,19 +165,26 @@
DCGM_TOPOLOGY_NVLINK11 = 0x40000
DCGM_TOPOLOGY_NVLINK12 = 0x80000
-# Diagnostic per gpu tests - fixed indices for
-# dcgmDiagResponsePerGpu_t.results[]
+# Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[]
DCGM_MEMORY_INDEX = 0
DCGM_DIAGNOSTIC_INDEX = 1
DCGM_PCI_INDEX = 2
-DCGM_SM_PERF_INDEX = 3
-DCGM_TARGETED_PERF_INDEX = 4
+DCGM_SM_STRESS_INDEX = 3
+DCGM_TARGETED_STRESS_INDEX = 4
DCGM_TARGETED_POWER_INDEX = 5
DCGM_MEMORY_BANDWIDTH_INDEX = 6
-DCGM_PER_GPU_TEST_COUNT = 7
+DCGM_MEMTEST_INDEX = 7
+DCGM_PULSE_TEST_INDEX = 8
+DCGM_EUD_TEST_INDEX = 9
+DCGM_UNUSED2_TEST_INDEX = 10
+DCGM_UNUSED3_TEST_INDEX = 11
+DCGM_UNUSED4_TEST_INDEX = 12
+DCGM_UNUSED5_TEST_INDEX = 13
+DCGM_PER_GPU_TEST_COUNT_V7 = 9
+DCGM_PER_GPU_TEST_COUNT_V8 = 13
# DCGM Diag Level One test indices
-DCGM_SWTEST_BLACKLIST = 0
+DCGM_SWTEST_DENYLIST = 0
DCGM_SWTEST_NVML_LIBRARY = 1
DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2
DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3
@@ -284,70 +207,116 @@ class DCGM_INTROSPECT_STATE(object):
# Lib loading
dcgmLib = None
libLoadLock = threading.Lock()
-# Incremented on each dcgmInit and decremented on dcgmShutdown
-_dcgmLib_refcount = 0
+_dcgmLib_refcount = 0 # Incremented on each dcgmInit and decremented on dcgmShutdown
class DCGMError(Exception):
- """
- Class to return error values for DCGM
- """
-
+ """ Class to return error values for DCGM """
_valClassMapping = dict()
# List of currently known error codes
_error_code_to_string = {
- DCGM_ST_OK: "Success",
- DCGM_ST_BADPARAM: "Bad parameter passed to function",
- DCGM_ST_GENERIC_ERROR: "Generic unspecified error",
- DCGM_ST_MEMORY: "Out of memory error",
- DCGM_ST_NOT_CONFIGURED: "Setting not configured",
- DCGM_ST_NOT_SUPPORTED: "Feature not supported",
- DCGM_ST_INIT_ERROR: "DCGM initialization error",
- DCGM_ST_NVML_ERROR: "NVML error",
- DCGM_ST_PENDING: "Object is in a pending state",
- DCGM_ST_UNINITIALIZED: "Object is in an undefined state",
- DCGM_ST_TIMEOUT: "Timeout",
- DCGM_ST_VER_MISMATCH: "API version mismatch",
- DCGM_ST_UNKNOWN_FIELD: "Unknown field",
- DCGM_ST_NO_DATA: "No data is available",
- DCGM_ST_STALE_DATA: "Data is considered stale",
- DCGM_ST_NOT_WATCHED: "Field is not being updated",
- DCGM_ST_NO_PERMISSION: "Not permissioned",
- DCGM_ST_GPU_IS_LOST: "GPU is unreachable",
- DCGM_ST_RESET_REQUIRED: "GPU requires a reset",
- DCGM_ST_FUNCTION_NOT_FOUND: "Unable to find function",
- DCGM_ST_CONNECTION_NOT_VALID: "The connection to the host engine is not valid any longer",
- DCGM_ST_GPU_NOT_SUPPORTED: "This GPU is not supported by DCGM",
- DCGM_ST_GROUP_INCOMPATIBLE: "GPUs are incompatible with each other for\
- the requested operation",
- DCGM_ST_MAX_LIMIT: "Max limit reached for the object",
- DCGM_ST_LIBRARY_NOT_FOUND: "DCGM library could not be found",
- DCGM_ST_DUPLICATE_KEY: "Duplicate key passed to function",
- DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: "GPU is already a part of a sync boost group",
- DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: "GPU is not a part of the sync boost group",
- DCGM_ST_REQUIRES_ROOT: "This operation is not supported when the host engine\
- is running as non root",
- DCGM_ST_NVVS_ERROR: "DCGM GPU Diagnostic returned an error.",
- DCGM_ST_INSUFFICIENT_SIZE: "An input argument is not large enough",
- DCGM_ST_FIELD_UNSUPPORTED_BY_API: "The given field ID is not supported by the API being called",
- DCGM_ST_MODULE_NOT_LOADED: "This request is serviced by a module of DCGM that\
- is not currently loaded",
- DCGM_ST_IN_USE: "The requested operation could not be completed because\
- the affected resource is in use",
- DCGM_ST_GROUP_IS_EMPTY: "The specified group is empty, and this operation\
- is incompatible with an empty group",
- DCGM_ST_PROFILING_NOT_SUPPORTED: "Profiling is not supported for this group of GPUs or GPU",
- DCGM_ST_PROFILING_LIBRARY_ERROR: "The third-party Profiling module returned an unrecoverable error",
- DCGM_ST_PROFILING_MULTI_PASS: "The requested profiling metrics\
- cannot be collected in a single pass",
- DCGM_ST_DIAG_ALREADY_RUNNING: "A diag instance is already running, cannot\
- run a new diag until the current one finishes",
- DCGM_ST_DIAG_BAD_JSON: "The GPU Diagnostic returned Json that cannot be parsed.",
- DCGM_ST_DIAG_BAD_LAUNCH: "Error while launching the GPU Diagnostic.",
- DCGM_ST_DIAG_VARIANCE: "The results of training DCGM GPU Diagnostic cannot\
- be trusted because they vary too much from run to run",
- DCGM_ST_DIAG_THRESHOLD_EXCEEDED: "A field value met or exceeded the error threshold.",
- DCGM_ST_INSUFFICIENT_DRIVER_VERSION: "The installed driver version is insufficient for this API",
+ DCGM_ST_OK:
+ "Success",
+ DCGM_ST_BADPARAM:
+ "Bad parameter passed to function",
+ DCGM_ST_GENERIC_ERROR:
+ "Generic unspecified error",
+ DCGM_ST_MEMORY:
+ "Out of memory error",
+ DCGM_ST_NOT_CONFIGURED:
+ "Setting not configured",
+ DCGM_ST_NOT_SUPPORTED:
+ "Feature not supported",
+ DCGM_ST_INIT_ERROR:
+ "DCGM initialization error",
+ DCGM_ST_NVML_ERROR:
+ "NVML error",
+ DCGM_ST_PENDING:
+ "Object is in a pending state",
+ DCGM_ST_UNINITIALIZED:
+ "Object is in an undefined state",
+ DCGM_ST_TIMEOUT:
+ "Timeout",
+ DCGM_ST_VER_MISMATCH:
+ "API version mismatch",
+ DCGM_ST_UNKNOWN_FIELD:
+ "Unknown field",
+ DCGM_ST_NO_DATA:
+ "No data is available",
+ DCGM_ST_STALE_DATA:
+ "Data is considered stale",
+ DCGM_ST_NOT_WATCHED:
+ "Field is not being updated",
+ DCGM_ST_NO_PERMISSION:
+ "Not permissioned",
+ DCGM_ST_GPU_IS_LOST:
+ "GPU is unreachable",
+ DCGM_ST_RESET_REQUIRED:
+ "GPU requires a reset",
+ DCGM_ST_FUNCTION_NOT_FOUND:
+ "Unable to find function",
+ DCGM_ST_CONNECTION_NOT_VALID:
+ "The connection to the host engine is not valid any longer",
+ DCGM_ST_GPU_NOT_SUPPORTED:
+ "This GPU is not supported by DCGM",
+ DCGM_ST_GROUP_INCOMPATIBLE:
+ "GPUs are incompatible with each other for the requested operation",
+ DCGM_ST_MAX_LIMIT:
+ "Max limit reached for the object",
+ DCGM_ST_LIBRARY_NOT_FOUND:
+ "DCGM library could not be found",
+ DCGM_ST_DUPLICATE_KEY:
+ "Duplicate key passed to function",
+ DCGM_ST_GPU_IN_SYNC_BOOST_GROUP:
+ "GPU is already a part of a sync boost group",
+ DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP:
+ "GPU is not a part of the sync boost group",
+ DCGM_ST_REQUIRES_ROOT:
+ "This operation is not supported when the host engine is running as non root",
+ DCGM_ST_NVVS_ERROR:
+ "DCGM GPU Diagnostic returned an error.",
+ DCGM_ST_INSUFFICIENT_SIZE:
+ "An input argument is not large enough",
+ DCGM_ST_FIELD_UNSUPPORTED_BY_API:
+ "The given field ID is not supported by the API being called",
+ DCGM_ST_MODULE_NOT_LOADED:
+ "This request is serviced by a module of DCGM that is not currently loaded",
+ DCGM_ST_IN_USE:
+ "The requested operation could not be completed because the affected resource is in use",
+ DCGM_ST_GROUP_IS_EMPTY:
+ "The specified group is empty, and this operation is incompatible with an empty group",
+ DCGM_ST_PROFILING_NOT_SUPPORTED:
+ "Profiling is not supported for this group of GPUs or GPU",
+ DCGM_ST_PROFILING_LIBRARY_ERROR:
+ "The third-party Profiling module returned an unrecoverable error",
+ DCGM_ST_PROFILING_MULTI_PASS:
+ "The requested profiling metrics cannot be collected in a single pass",
+ DCGM_ST_DIAG_ALREADY_RUNNING:
+ "A diag instance is already running, cannot run a new diag until the current one finishes",
+ DCGM_ST_DIAG_BAD_JSON:
+ "The GPU Diagnostic returned Json that cannot be parsed.",
+ DCGM_ST_DIAG_BAD_LAUNCH:
+ "Error while launching the GPU Diagnostic.",
+ DCGM_ST_DIAG_UNUSED:
+ "Unused error code",
+ DCGM_ST_DIAG_THRESHOLD_EXCEEDED:
+ "A field value met or exceeded the error threshold.",
+ DCGM_ST_INSUFFICIENT_DRIVER_VERSION:
+ "The installed driver version is insufficient for this API",
+ DCGM_ST_INSTANCE_NOT_FOUND:
+ "The specified GPU instance does not exist",
+ DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND:
+ "The specified GPU compute instance does not exist",
+ DCGM_ST_CHILD_NOT_KILLED:
+ "Couldn't kill a child process within the retries",
+ DCGM_ST_3RD_PARTY_LIBRARY_ERROR:
+ "Detected an error in a 3rd-party library",
+ DCGM_ST_INSUFFICIENT_RESOURCES:
+ "Not enough resources available",
+ DCGM_ST_PLUGIN_EXCEPTION:
+ "Exception thrown from a diagnostic plugin",
+ DCGM_ST_NVVS_ISOLATE_ERROR:
+ "The diagnostic returned an error that indicates the need for isolation",
}
def __new__(typ, value):
@@ -366,11 +335,9 @@ def __str__(self):
try:
if self.value not in DCGMError._error_code_to_string:
DCGMError._error_code_to_string[self.value] = str(
- _dcgmErrorString(self.value)
- )
+ _dcgmErrorString(self.value))
msg = DCGMError._error_code_to_string[self.value]
- # Ensure we catch all exceptions, otherwise the error code will be
- # hidden in a traceback
+ # Ensure we catch all exceptions, otherwise the error code will be hidden in a traceback
except BaseException:
msg = "DCGM Error with code %d" % self.value
@@ -383,16 +350,16 @@ def __str__(self):
def __eq__(self, other):
return self.value == other.value
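+
+    # Python 3 discards the inherited __hash__ when __eq__ is overridden, so it is
+    # restored explicitly here to keep DCGMError instances hashable (dict keys, sets).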
+ def __hash__(self):
+ return hash(self.value)
+
def SetAdditionalInfo(self, msg):
"""
- Sets msg as additional information returned by the string
- representation of DCGMError and subclasses. Example output for
- DCGMError_Uninitialized subclass, with msg set to 'more info msg
- here' is "DCGMError_Uninitialized: Object is in an undefined state:
- 'more info msg here'".
-
- Ensure that msg is a string or an object for which the __str__()
- method does not throw an error
+ Sets msg as additional information returned by the string representation of DCGMError and subclasses.
+ Example output for DCGMError_Uninitialized subclass, with msg set to 'more info msg here' is
+ "DCGMError_Uninitialized: Object is in an undefined state: 'more info msg here'".
+
+ Ensure that msg is a string or an object for which the __str__() method does not throw an error
"""
self.info = msg
@@ -402,33 +369,34 @@ def dcgmExceptionClass(error_code):
def _extractDCGMErrorsAsClasses():
- """
+ '''
Generates a hierarchy of classes on top of DCGMLError class.
- Each DCGM Error gets a new DCGMError subclass. This way try,except blocks
- can filter appropriate exceptions more easily.
+    Each DCGM Error gets a new DCGMError subclass. This way try/except blocks can filter appropriate
+ exceptions more easily.
DCGMError is a parent class. Each DCGM_ST_* gets it's own subclass.
e.g. DCGM_ST_UNINITIALIZED will be turned into DCGMError_Uninitialized
- """
-
+ '''
this_module = sys.modules[__name__]
dcgmErrorsNames = filter(lambda x: x.startswith("DCGM_ST_"), dir(this_module))
for err_name in dcgmErrorsNames:
# e.g. Turn DCGM_ST_UNINITIALIZED into DCGMError_Uninitialized
class_name = "DCGMError_" + string.capwords(
- err_name.replace("DCGM_ST_", ""), "_"
- ).replace("_", "")
+ err_name.replace("DCGM_ST_", ""), "_").replace("_", "")
err_val = getattr(this_module, err_name)
def gen_new(val):
+
def new(typ):
+ # pylint: disable=E1121
obj = DCGMError.__new__(typ, val)
return obj
return new
- new_error_class = type(class_name, (DCGMError,), {"__new__": gen_new(err_val)})
+ new_error_class = type(class_name, (DCGMError,),
+ {'__new__': gen_new(err_val)})
new_error_class.__module__ = __name__
setattr(this_module, class_name, new_error_class)
DCGMError._valClassMapping[err_val] = new_error_class
@@ -445,7 +413,118 @@ class struct_c_dcgmUnit_t(Structure):
_dcgmUnit_t = POINTER(struct_c_dcgmUnit_t)
-class _PrintableStructure(Structure):
+class _WrappedStructure():
+
+ def __init__(self, obj):
+ self.__dict__["_obj"] = obj
+
+ def __getattr__(self, key):
+ value = getattr(self._obj, key)
+ if isinstance(value, bytes):
+ return value.decode('utf-8')
+ if isclass(value):
+ return _WrappedStructure(value)
+ return value
+
+ def __getitem__(self, key):
+ value = self._obj[key]
+ if isinstance(value, bytes):
+ return value.decode('utf-8')
+ if isclass(value):
+ return _WrappedStructure(value)
+ return value
+
+ def __setattr__(self, key, raw_value):
+
+ def find_field_type(fields, key):
+ field = (f[1] for f in fields if f[0] == key)
+ try:
+ return next(field)
+ except StopIteration:
+ return None
+
+ if (key == '_obj'):
+ raise RuntimeError("Cannot set _obj")
+
+ value = raw_value
+ fieldtype = find_field_type(self._obj._fields_, key)
+
+ if fieldtype == c_uint and not isinstance(value, c_uint32):
+ value = int(value)
+ elif fieldtype == c_int and not isinstance(value, c_int32):
+ value = int(value)
+ elif isinstance(raw_value, str):
+ value = raw_value.encode('utf-8')
+
+ self._obj[key] = value
+ return value
+
+
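+# Behaviour sketch (illustrative, not upstream documentation): structures built on
+# these wrappers decode c_char fields to str on read and encode str back to bytes
+# on assignment, e.g.
+#   ident = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId).identifiers
+#   ident.deviceName   # already a str, no .decode("utf-8") required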
+class _DcgmStructure(Structure):
+
+ def __getattribute__(self, key):
+ value = super().__getattribute__(key)
+ if isinstance(value, bytes):
+ return value.decode('utf-8')
+ if isclass(value):
+ return _WrappedStructure(value)
+ return value
+
+ def __setattr__(self, key, raw_value):
+
+ def find_field_type(fields, key):
+ field = (f[1] for f in fields if f[0] == key)
+ try:
+ return next(field)
+ except StopIteration:
+ return None
+
+ value = raw_value
+ fieldtype = find_field_type(self._fields_, key)
+
+ if fieldtype == c_uint and not isinstance(value, c_uint32):
+ value = int(value)
+ elif fieldtype == c_int and not isinstance(value, c_int32):
+ value = int(value)
+ elif isinstance(raw_value, str):
+ value = raw_value.encode('utf-8')
+
+ return super().__setattr__(key, value)
+
+
+class DcgmUnion(Union):
+
+ def __getattribute__(self, key):
+ value = super().__getattribute__(key)
+ if isinstance(value, bytes):
+ return value.decode('utf-8')
+ if isclass(value):
+ return _WrappedStructure(value)
+ return value
+
+ def __setattr__(self, key, raw_value):
+
+ def find_field_type(fields, key):
+ field = (f[1] for f in fields if f[0] == key)
+ try:
+ return next(field)
+ except StopIteration:
+ return None
+
+ value = raw_value
+ fieldtype = find_field_type(self._fields_, key)
+
+ if fieldtype == c_uint and not isinstance(value, c_uint32):
+ value = int(value)
+ elif fieldtype == c_int and not isinstance(value, c_int32):
+ value = int(value)
+ elif isinstance(raw_value, str):
+ value = raw_value.encode('utf-8')
+
+ return super().__setattr__(key, value)
+
+
+class _PrintableStructure(_DcgmStructure):
"""
Abstract class that produces nicer __str__ output than ctypes.Structure.
e.g. instead of:
@@ -458,15 +537,12 @@ class _PrintableStructure(Structure):
e.g. class that has _field_ 'hex_value', c_uint could be formatted with
_fmt_ = {"hex_value" : "%08X"}
to produce nicer output.
- Default formatting string for all fields can be set with key ""
- like:
+    Default formatting string for all fields can be set with key "" like:
_fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz.
If not set it's assumed to be just "%s"
- Exact format of returned str from this class is subject to change in the
- future.
+ Exact format of returned str from this class is subject to change in the future.
"""
-
_fmt_ = {}
def __str__(self):
@@ -480,7 +556,7 @@ def __str__(self):
elif "" in self._fmt_:
fmt = self._fmt_[""]
result.append(("%s: " + fmt) % (key, value))
- return self.__class__.__name__ + "(" + string.join(result, ", ") + ")"
+ return self.__class__.__name__ + "(" + ", ".join(result) + ")"
def FieldsSizeof(self):
size = 0
@@ -489,10 +565,8 @@ def FieldsSizeof(self):
return size
+#JSON serializer for DCGM structures
class DcgmJSONEncoder(json.JSONEncoder):
- """
- JSON serializer for DCGM structures
- """
def default(self, o): # pylint: disable=method-hidden
if isinstance(o, _PrintableStructure):
@@ -519,21 +593,18 @@ def default(self, o): # pylint: disable=method-hidden
retVal.append(subVal)
return retVal
- # Let the parent class handle this/fail
+ #Let the parent class handle this/fail
return json.JSONEncoder.default(self, o)
+# Creates a unique version number for each struct
def make_dcgm_version(struct, ver):
- """
- Creates a unique version number for each struct
- """
-
return sizeof(struct) | (ver << 24)
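+
+
+# Worked example (sizes are illustrative): a 16-byte struct at version 2 yields
+# 16 | (2 << 24) == 0x02000010, so the top byte carries the version and the
+# lower 24 bits carry the struct size.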
-# Function access
-# function pointers are cached to prevent unnecessary libLoadLock locking
-_dcgmGetFunctionPointer_cache = dict()
+# Function access ##
+_dcgmGetFunctionPointer_cache = dict(
+) # function pointers are cached to prevent unnecessary libLoadLock locking
def _dcgmGetFunctionPointer(name):
@@ -557,18 +628,14 @@ def _dcgmGetFunctionPointer(name):
libLoadLock.release()
-#
-# C function wrappers
-#
+# C function wrappers ##
def _LoadDcgmLibrary(libDcgmPath=None):
"""
Load the library if it isn't loaded already
- :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use
- system defaults if not specified.
+ :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use system defaults if not specified.
:type libDcgmPath: str
:return: None
"""
-
global dcgmLib
if dcgmLib is None:
@@ -580,30 +647,24 @@ def _LoadDcgmLibrary(libDcgmPath=None):
if dcgmLib is None:
try:
if sys.platform[:3] == "win":
- # cdecl calling convention load nvml.dll from
- # %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll
+ # cdecl calling convention
+                    # load dcgm.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/dcgm.dll
dcgmLib = CDLL(
os.path.join(
os.getenv("ProgramFiles", "C:/Program Files"),
- "NVIDIA Corporation/NVSMI/dcgm.dll",
- )
- )
+ "NVIDIA Corporation/NVSMI/dcgm.dll"))
else:
- if not libDcgmPath:
- (
- dist_name,
- dist_version,
- dist_id,
- ) = distro.linux_distribution(full_distribution_name=0)
- dist_name = dist_name.lower()
- if dist_name in {"ubuntu", "debian"}:
- libDcgmPath = "/usr/lib/{}-linux-gnu".format(
- platform.machine()
- )
- elif dist_name in {"fedora", "redhat", "centos", "suse"}:
- libDcgmPath = "/usr/lib64"
-
- dcgmLib = CDLL(os.path.join(libDcgmPath, "libdcgm.so.2"))
+ if libDcgmPath:
+ lib_file = os.path.join(libDcgmPath, "libdcgm.so.3")
+ else:
+ # Try Debian-based distros
+ lib_file = '/usr/lib/{}-linux-gnu/libdcgm.so.3'.format(
+ platform.machine())
+ if not os.path.isfile(lib_file):
+ # Presume Redhat-based distros
+ lib_file = '/usr/lib64/libdcgm.so.3'
+
+ dcgmLib = CDLL(lib_file)
except OSError as ose:
_dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND)
@@ -652,20 +713,23 @@ def _dcgmErrorString(result):
return str
+# Represents a link object. type should be one of DCGM_FE_GPU or
+# DCGM_FE_SWITCH. gpuId or switchId is the associated GPU or switch.
+#
+class c_dcgm_link_t(_PrintableStructure):
+    _fields_ = [('type', c_uint8), ('index', c_uint8), ('id', c_uint16)]
+
+
class c_dcgmConnectV2Params_v1(_PrintableStructure):
- _fields_ = [("version", c_uint), ("persistAfterDisconnect", c_uint)]
+ _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint)]
c_dcgmConnectV2Params_version1 = make_dcgm_version(c_dcgmConnectV2Params_v1, 1)
class c_dcgmConnectV2Params_v2(_PrintableStructure):
- _fields_ = [
- ("version", c_uint),
- ("persistAfterDisconnect", c_uint),
- ("timeoutMs", c_uint),
- ("addressIsUnixSocket", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint),
+ ('timeoutMs', c_uint), ('addressIsUnixSocket', c_uint)]
c_dcgmConnectV2Params_version2 = make_dcgm_version(c_dcgmConnectV2Params_v2, 2)
@@ -683,22 +747,22 @@ class c_dcgmHostengineHealth_v1(_PrintableStructure):
dcgmHostengineHealth_version = dcgmHostengineHealth_version1
-# Represents memory and proc clocks for a device
+#Represents memory and proc clocks for a device
class c_dcgmClockSet_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("memClock", c_uint), # Memory Clock
- ("smClock", c_uint), # SM Clock
+ ('version', c_uint),
+        ('memClock', c_uint),  # Memory Clock
+        ('smClock', c_uint)  # SM Clock
]
-# Represents a entityGroupId + entityId pair to uniquely identify a given
-# entityId inside a group of entities
+# Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside
+# a group of entities
# Added in DCGM 1.5.0
class c_dcgmGroupEntityPair_t(_PrintableStructure):
_fields_ = [
- ("entityGroupId", c_uint32), # Entity Group ID entity belongs to
- ("entityId", c_uint32), # Entity ID of the entity
+ ('entityGroupId', c_uint32), #Entity Group ID entity belongs to
+ ('entityId', c_uint32) #Entity ID of the entity
]
@@ -707,12 +771,10 @@ class c_dcgmGroupEntityPair_t(_PrintableStructure):
# * Added in DCGM 1.5.0
# */
class c_dcgmGroupInfo_v2(_PrintableStructure):
- _fields_ = [
- ("version", c_uint),
- ("count", c_uint),
- ("groupName", c_char * DCGM_MAX_STR_LENGTH),
- ("entityList", c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES),
- ]
+ _fields_ = [('version', c_uint), ('count', c_uint),
+ ('groupName', c_char * DCGM_MAX_STR_LENGTH),
+ ('entityList',
+ c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES)]
c_dcgmGroupInfo_version2 = make_dcgm_version(c_dcgmGroupInfo_v2, 2)
@@ -723,19 +785,25 @@ class c_dcgmGroupInfo_v2(_PrintableStructure):
DcgmMigProfileGpuInstanceSlice3 = 3 # GPU instance slice 3
DcgmMigProfileGpuInstanceSlice4 = 4 # GPU instance slice 4
DcgmMigProfileGpuInstanceSlice7 = 5 # GPU instance slice 7
+DcgmMigProfileGpuInstanceSlice8 = 6 # GPU instance slice 8
+DcgmMigProfileGpuInstanceSlice6 = 7 # GPU instance slice 6
+DcgmMigProfileGpuInstanceSlice1Rev1 = 8 # GPU instance slice 1 revision 1
+DcgmMigProfileGpuInstanceSlice2Rev1 = 9 # GPU instance slice 2 revision 1
+DcgmMigProfileGpuInstanceSlice1Rev2 = 10 # GPU instance slice 1 revision 2
DcgmMigProfileComputeInstanceSlice1 = 30 # compute instance slice 1
DcgmMigProfileComputeInstanceSlice2 = 31 # compute instance slice 2
DcgmMigProfileComputeInstanceSlice3 = 32 # compute instance slice 3
DcgmMigProfileComputeInstanceSlice4 = 33 # compute instance slice 4
DcgmMigProfileComputeInstanceSlice7 = 34 # compute instance slice 7
+DcgmMigProfileComputeInstanceSlice8 = 35 # compute instance slice 8
+DcgmMigProfileComputeInstanceSlice6 = 36 # compute instance slice 6
+DcgmMigProfileComputeInstanceSlice1Rev1 = 37 # compute instance slice 1 revision 1
+# /**
+# * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy.
+# */
class c_dcgmMigHierarchyInfo_t(_PrintableStructure):
- """
- Represents a pair of entity pairings to uniquely identify an entity and
- its place in the hierarchy.
- """
-
_fields_ = [
("entity", c_dcgmGroupEntityPair_t),
("parent", c_dcgmGroupEntityPair_t),
@@ -743,36 +811,50 @@ class c_dcgmMigHierarchyInfo_t(_PrintableStructure):
]
-DCGM_MAX_INSTANCES_PER_GPU = 7
-# There can never be more compute instances per GPU than instances per GPU
-# because a compute instance is part of an instance
+class c_dcgmMigEntityInfo_t(_PrintableStructure):
+ _fields_ = [
+ ('gpuUuid', c_char * 128), # GPU UUID
+ ('nvmlGpuIndex', c_uint), # GPU index from NVML
+ ('nvmlInstanceId', c_uint), # GPU instance index within GPU
+ ('nvmlComputeInstanceId',
+ c_uint), # GPU Compute instance index within GPU instance
+ ('nvmlMigProfileId',
+ c_uint), # Unique profile ID for GPU or Compute instances
+ ('nvmlProfileSlices', c_uint), # Number of slices in the MIG profile
+ ]
+
+
+class c_dcgmMigHierarchyInfo_v2(_PrintableStructure):
+ _fields_ = [
+ ('entity', c_dcgmGroupEntityPair_t),
+ ('parent', c_dcgmGroupEntityPair_t),
+ ('info', c_dcgmMigEntityInfo_t),
+ ]
+
+
+DCGM_MAX_INSTANCES_PER_GPU = 8
+# There can never be more compute instances per GPU than instances per GPU because a compute instance
+# is part of an instance
DCGM_MAX_COMPUTE_INSTANCES_PER_GPU = DCGM_MAX_INSTANCES_PER_GPU
-# Currently, there cannot be more than 14 instances + compute instances. There
-# are always 7 compute instances and never more than 7 instances
+# Currently, there cannot be more than 14 instances + compute instances. There are always 7 compute instances
+# and never more than 7 instances
DCGM_MAX_TOTAL_INSTANCES = 14
DCGM_MAX_HIERARCHY_INFO = DCGM_MAX_NUM_DEVICES * DCGM_MAX_TOTAL_INSTANCES
DCGM_MAX_INSTANCES = DCGM_MAX_NUM_DEVICES * DCGM_MAX_INSTANCES_PER_GPU
-# The maximum compute instances are always the same as the maximum instances
-# because each compute instances is part of an instance
+# The maximum compute instances are always the same as the maximum instances because each compute instances
+# is part of an instance
DCGM_MAX_COMPUTE_INSTANCES = DCGM_MAX_INSTANCES
-# Ask the hostengine to wait to process reconfiguring the GPUs
-DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1
+DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1 # Ask the hostengine to wait to process reconfiguring the GPUs
-class c_dcgmMigHierarchy_v1(_PrintableStructure):
- """
- Structure to store the GPU hierarchy for a system
- """
-
- _fields_ = [
- ("version", c_uint),
- ("count", c_uint),
- ("entityList", c_dcgmMigHierarchyInfo_t * DCGM_MAX_HIERARCHY_INFO),
- ]
+class c_dcgmMigHierarchy_v2(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('count', c_uint),
+ ('entityList',
+ c_dcgmMigHierarchyInfo_v2 * DCGM_MAX_HIERARCHY_INFO)]
-c_dcgmMigHierarchy_version1 = make_dcgm_version(c_dcgmMigHierarchy_v1, 1)
+c_dcgmMigHierarchy_version2 = make_dcgm_version(c_dcgmMigHierarchy_v2, 2)
class c_dcgmDeleteMigEntity_v1(_PrintableStructure):
@@ -786,13 +868,11 @@ class c_dcgmDeleteMigEntity_v1(_PrintableStructure):
c_dcgmDeleteMigEntity_version1 = make_dcgm_version(c_dcgmDeleteMigEntity_v1, 1)
-#
-# Enum values for the kinds of MIG creations
-#
-# Create a GPU instance
-DcgmMigCreateGpuInstance = 0
-# Create a compute instance
-DcgmMigCreateComputeInstance = 1
+# /**
+# * Enum values for the kinds of MIG creations
+# */
+DcgmMigCreateGpuInstance = 0 # Create a GPU instance
+DcgmMigCreateComputeInstance = 1 # Create a compute instance
class c_dcgmCreateMigEntity_v1(_PrintableStructure):
@@ -808,331 +888,391 @@ class c_dcgmCreateMigEntity_v1(_PrintableStructure):
c_dcgmCreateMigEntity_version1 = make_dcgm_version(c_dcgmCreateMigEntity_v1, 1)
+# /**
+# * Structure to represent error attributes
+# */
class c_dcgmErrorInfo_v1(_PrintableStructure):
- """
- Structure to represent error attributes
- """
-
- _fields_ = [("gpuId", c_uint), ("fieldId", c_ushort), ("status", c_int)]
+ _fields_ = [('gpuId', c_uint), ('fieldId', c_ushort), ('status', c_int)]
+# /**
+# * Represents list of supported clocks for a device
+# */
class c_dcgmDeviceSupportedClockSets_v1(_PrintableStructure):
- """
- Represents list of supported clocks for a device
- """
-
- _fields_ = [
- ("version", c_uint),
- ("count", c_uint),
- ("clockSet", c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS),
- ]
+ _fields_ = [('version', c_uint), ('count', c_uint),
+ ('clockSet', c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS)]
+# /**
+# * Represents accounting information for a device and pid
+# */
class c_dcgmDevicePidAccountingStats_v1(_PrintableStructure):
- """
- epresents accounting information for a device and pid
- """
-
- _fields_ = [
- ("version", c_uint32),
- ("pid", c_uint32),
- ("gpuUtilization", c_uint32),
- ("memoryUtilization", c_uint32),
- ("maxMemoryUsage", c_uint64),
- ("startTimestamp", c_uint64),
- ("activeTimeUsec", c_uint64),
- ]
+ _fields_ = [('version', c_uint32), ('pid', c_uint32),
+ ('gpuUtilization', c_uint32), ('memoryUtilization', c_uint32),
+ ('maxMemoryUsage', c_uint64), ('startTimestamp', c_uint64),
+ ('activeTimeUsec', c_uint64)]
+# /**
+# * Represents thermal information
+# */
class c_dcgmDeviceThermals_v1(_PrintableStructure):
- """
- Represents thermal information
- """
-
- _fields_ = [("version", c_uint), ("slowdownTemp", c_uint), ("shutdownTemp", c_uint)]
+ _fields_ = [('version', c_uint), ('slowdownTemp', c_uint),
+ ('shutdownTemp', c_uint)]
+# /**
+# * Represents various power limits
+# */
class c_dcgmDevicePowerLimits_v1(_PrintableStructure):
- """
- Represents various power limits
- """
-
- _fields_ = [
- ("version", c_uint),
- ("curPowerLimit", c_uint),
- ("defaultPowerLimit", c_uint),
- ("enforcedPowerLimit", c_uint),
- ("minPowerLimit", c_uint),
- ("maxPowerLimit", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('curPowerLimit', c_uint),
+ ('defaultPowerLimit', c_uint), ('enforcedPowerLimit', c_uint),
+ ('minPowerLimit', c_uint), ('maxPowerLimit', c_uint)]
+# /**
+# * Represents device identifiers
+# */
class c_dcgmDeviceIdentifiers_v1(_PrintableStructure):
- """
- Represents device identifiers
- """
-
- _fields_ = [
- ("version", c_uint),
- ("brandName", c_char * DCGM_MAX_STR_LENGTH),
- ("deviceName", c_char * DCGM_MAX_STR_LENGTH),
- ("pciBusId", c_char * DCGM_MAX_STR_LENGTH),
- ("serial", c_char * DCGM_MAX_STR_LENGTH),
- ("uuid", c_char * DCGM_MAX_STR_LENGTH),
- ("vbios", c_char * DCGM_MAX_STR_LENGTH),
- ("inforomImageVersion", c_char * DCGM_MAX_STR_LENGTH),
- ("pciDeviceId", c_uint32),
- ("pciSubSystemId", c_uint32),
- ("driverVersion", c_char * DCGM_MAX_STR_LENGTH),
- ("virtualizationMode", c_uint32),
- ]
+ _fields_ = [('version', c_uint),
+ ('brandName', c_char * DCGM_MAX_STR_LENGTH),
+ ('deviceName', c_char * DCGM_MAX_STR_LENGTH),
+ ('pciBusId', c_char * DCGM_MAX_STR_LENGTH),
+ ('serial', c_char * DCGM_MAX_STR_LENGTH),
+ ('uuid', c_char * DCGM_MAX_STR_LENGTH),
+ ('vbios', c_char * DCGM_MAX_STR_LENGTH),
+ ('inforomImageVersion', c_char * DCGM_MAX_STR_LENGTH),
+ ('pciDeviceId', c_uint32), ('pciSubSystemId', c_uint32),
+ ('driverVersion', c_char * DCGM_MAX_STR_LENGTH),
+ ('virtualizationMode', c_uint32)]
+# /**
+# * Represents memory utilization
+# */
class c_dcgmDeviceMemoryUsage_v1(_PrintableStructure):
- """
- Represents memory utilization
- """
-
- _fields_ = [
- ("version", c_uint),
- ("bar1Total", c_uint),
- ("fbTotal", c_uint),
- ("fbUsed", c_uint),
- ("fbFree", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('bar1Total', c_uint), ('fbTotal', c_uint),
+ ('fbUsed', c_uint), ('fbFree', c_uint)]
+# /**
+# * Represents utilization values of vGPUs running on the device
+# */
class c_dcgmDeviceVgpuUtilInfo_v1(_PrintableStructure):
- """
- Represents utilization values of vGPUs running on the device
- """
-
- _fields_ = [
- ("version", c_uint),
- ("vgpuId", c_uint),
- ("smUtil", c_uint),
- ("memUtil", c_uint),
- ("encUtil", c_uint),
- ("decUtil", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('smUtil', c_uint),
+ ('memUtil', c_uint), ('encUtil', c_uint), ('decUtil', c_uint)]
# /**
# * Utilization values for processes running within vGPU VMs using the device
# */
class c_dcgmDeviceVgpuProcessUtilInfo_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint),
- ("vgpuId", c_uint),
- ("pid", c_uint),
- ("processName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
- ("smUtil", c_uint),
- ("memUtil", c_uint),
- ("encUtil", c_uint),
- ("decUtil", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('pid', c_uint),
+ ('processName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
+ ('smUtil', c_uint), ('memUtil', c_uint), ('encUtil', c_uint),
+ ('decUtil', c_uint)]
# /**
# * Represents current encoder statistics for the given device/vGPU instance
# */
class c_dcgmDeviceEncStats_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint),
- ("sessionCount", c_uint),
- ("averageFps", c_uint),
- ("averageLatency", c_uint),
- ]
+ _fields_ = [('version', c_uint), ('sessionCount', c_uint),
+ ('averageFps', c_uint), ('averageLatency', c_uint)]
+# /**
+# * Represents information about active encoder sessions on the given vGPU instance
+# */
class c_dcgmDeviceVgpuEncSessions_v1(_PrintableStructure):
- """
- Represents information about active encoder sessions on the given vGPU
- instance
- """
+ _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('sessionId', c_uint),
+ ('pid', c_uint), ('codecType', c_uint), ('hResolution', c_uint),
+ ('vResolution', c_uint), ('averageFps', c_uint),
+ ('averageLatency', c_uint)]
+
+
+# /**
+# * Represents current frame buffer capture sessions statistics for the given device/vGPU instance
+# */
+class c_dcgmDeviceFbcStats_v1(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('sessionCount', c_uint),
+ ('averageFps', c_uint), ('averageLatency', c_uint)]
+
+# /**
+# * Represents information about active FBC session on the given device/vGPU instance
+# */
+class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('sessionId', c_uint), ('pid', c_uint),
+ ('vgpuId', c_uint), ('displayOrdinal', c_uint),
+ ('sessionType', c_uint), ('sessionFlags', c_uint),
+ ('hMaxResolution', c_uint), ('vMaxResolution', c_uint),
+ ('hResolution', c_uint), ('vResolution', c_uint),
+ ('averageFps', c_uint), ('averageLatency', c_uint)]
+
+
+# /**
+# * Represents all the active FBC sessions on the given device/vGPU instance
+# */
+class c_dcgmDeviceFbcSessions_v1(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('sessionCount', c_uint),
+ ('sessionInfo',
+ c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS)]
+
+
+# /**
+# * Represents static info related to vGPU types supported on the device
+# */
+class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint),
+ ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
+ ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
+ ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
+ ('deviceId', c_uint), ('subsystemId', c_uint),
+ ('numDisplayHeads', c_uint), ('maxInstances', c_uint),
+ ('frameRateLimit', c_uint), ('maxResolutionX', c_uint),
+ ('maxResolutionY', c_uint), ('fbTotal', c_uint)]
+
+
+class c_dcgmDeviceVgpuTypeInfo_v2(_PrintableStructure):
+ _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint),
+ ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
+ ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
+ ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
+ ('deviceId', c_uint), ('subsystemId', c_uint),
+ ('numDisplayHeads', c_uint), ('maxInstances', c_uint),
+ ('frameRateLimit', c_uint), ('maxResolutionX', c_uint),
+ ('maxResolutionY', c_uint), ('fbTotal', c_uint),
+ ('gpuInstanceProfileId', c_uint)]
+
+
+dcgmDeviceVgpuTypeInfo_version2 = make_dcgm_version(c_dcgmDeviceVgpuTypeInfo_v2,
+ 2)
+
+
+class c_dcgmDeviceSettings_v2(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("vgpuId", c_uint),
- ("sessionId", c_uint),
- ("pid", c_uint),
- ("codecType", c_uint),
- ("hResolution", c_uint),
- ("vResolution", c_uint),
- ("averageFps", c_uint),
- ("averageLatency", c_uint),
+ ('version', c_uint),
+ ('persistenceModeEnabled', c_uint),
+ ('migModeEnabled', c_uint),
+ ('confidentialComputeMode', c_uint),
]
-class c_dcgmDeviceFbcStats_v1(_PrintableStructure):
- """
- Represents current frame buffer capture sessions statistics for the given
- device/vGPU instance
- """
+# /**
+# * Represents attributes corresponding to a device
+# */
+class c_dcgmDeviceAttributes_deprecated_v1(_PrintableStructure):
+ _fields_ = [('version', c_uint),
+ ('clockSets', c_dcgmDeviceSupportedClockSets_v1),
+ ('thermalSettings', c_dcgmDeviceThermals_v1),
+ ('powerLimits', c_dcgmDevicePowerLimits_v1),
+ ('identifiers', c_dcgmDeviceIdentifiers_v1),
+ ('memoryUsage', c_dcgmDeviceMemoryUsage_v1),
+ ('unused', c_char * 208)]
+
+
+dcgmDeviceAttributes_deprecated_version1 = make_dcgm_version(
+ c_dcgmDeviceAttributes_deprecated_v1, 1)
+
+# /**
+# * Represents attributes corresponding to a device
+# */
+class c_dcgmDeviceAttributes_v3(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("sessionCount", c_uint),
- ("averageFps", c_uint),
- ("averageLatency", c_uint),
+ ('version', c_uint),
+ ('clockSets', c_dcgmDeviceSupportedClockSets_v1),
+ ('thermalSettings', c_dcgmDeviceThermals_v1),
+ ('powerLimits', c_dcgmDevicePowerLimits_v1),
+ ('identifiers', c_dcgmDeviceIdentifiers_v1),
+ ('memoryUsage', c_dcgmDeviceMemoryUsage_v1),
+ ('settings', c_dcgmDeviceSettings_v2),
]
-class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure):
- """
- Represents information about active FBC session on the given device/vGPU
- instance
- """
+dcgmDeviceAttributes_version3 = make_dcgm_version(c_dcgmDeviceAttributes_v3, 3)
+
+# /**
+# * Represents attributes info for a MIG device
+# */
+class c_dcgmDeviceMigAttributesInfo_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("sessionId", c_uint),
- ("pid", c_uint),
- ("vgpuId", c_uint),
- ("displayOrdinal", c_uint),
- ("sessionType", c_uint),
- ("sessionFlags", c_uint),
- ("hMaxResolution", c_uint),
- ("vMaxResolution", c_uint),
- ("hResolution", c_uint),
- ("vResolution", c_uint),
- ("averageFps", c_uint),
- ("averageLatency", c_uint),
+ ('version', c_uint),
+ ('gpuInstanceId', c_uint),
+ ('computeInstanceId', c_uint),
+ ('multiprocessorCount', c_uint),
+ ('sharedCopyEngineCount', c_uint),
+ ('sharedDecoderCount', c_uint),
+ ('sharedEncoderCount', c_uint),
+ ('sharedJpegCount', c_uint),
+ ('sharedOfaCount', c_uint),
+ ('gpuInstanceSliceCount', c_uint),
+ ('computeInstanceSliceCount', c_uint),
+ ('memorySizeMB', c_uint64),
]
-class c_dcgmDeviceFbcSessions_v1(_PrintableStructure):
- """
- Represents all the active FBC sessions on the given device/vGPU instance
- """
+dcgmDeviceMigAttributesInfo_version1 = make_dcgm_version(
+ c_dcgmDeviceMigAttributesInfo_v1, 1)
+
+# /**
+# * Represents attributes for a MIG device
+# */
+class c_dcgmDeviceMigAttributes_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("sessionCount", c_uint),
- ("sessionInfo", c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS),
+ ('version', c_uint),
+ ('migDevicesCount', c_uint),
+ ('migAttributesInfo', c_dcgmDeviceMigAttributesInfo_v1),
]
-class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure):
- """
- Represents static info related to vGPU types supported on the device
- """
+dcgmDeviceMigAttributes_version1 = make_dcgm_version(
+ c_dcgmDeviceMigAttributes_v1, 1)
+
+# /**
+# * Represents GPU instance profile information
+# */
+class c_dcgmGpuInstanceProfileInfo_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("vgpuTypeId", c_uint),
- ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
- ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
- ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
- ("deviceId", c_uint),
- ("subsystemId", c_uint),
- ("numDisplayHeads", c_uint),
- ("maxInstances", c_uint),
- ("frameRateLimit", c_uint),
- ("maxResolutionX", c_uint),
- ("maxResolutionY", c_uint),
- ("fbTotal", c_uint),
+ ('version', c_uint),
+ ('id', c_uint),
+ ('isP2pSupported', c_uint),
+ ('sliceCount', c_uint),
+ ('instanceCount', c_uint),
+ ('multiprocessorCount', c_uint),
+ ('copyEngineCount', c_uint),
+ ('decoderCount', c_uint),
+ ('encoderCount', c_uint),
+ ('jpegCount', c_uint),
+ ('ofaCount', c_uint),
+ ('memorySizeMB', c_uint64),
]
-class c_dcgmDeviceSettings_v1(_PrintableStructure):
+dcgmGpuInstanceProfileInfo_version1 = make_dcgm_version(
+ c_dcgmGpuInstanceProfileInfo_v1, 1)
+
+
+# /**
+# * Represents GPU instance profiles
+# */
+class c_dcgmGpuInstanceProfiles_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("persistenceModeEnabled", c_uint),
- ("migModeEnabled", c_uint),
+ ('version', c_uint),
+ ('profileCount', c_uint),
+ ('profileInfo', c_dcgmGpuInstanceProfileInfo_v1),
]
-class c_dcgmDeviceAttributes_v1(_PrintableStructure):
- """
- Represents attributes corresponding to a device
- """
+dcgmGpuInstanceProfiles_version1 = make_dcgm_version(
+ c_dcgmGpuInstanceProfiles_v1, 1)
+
+# /**
+# * Represents Compute instance profile information
+# */
+class c_dcgmComputeInstanceProfileInfo_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
- ("thermalSettings", c_dcgmDeviceThermals_v1),
- ("powerLimits", c_dcgmDevicePowerLimits_v1),
- ("identifiers", c_dcgmDeviceIdentifiers_v1),
- ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
- ("unused", c_char * 208),
+ ('version', c_uint),
+ ('gpuInstanceId', c_uint),
+ ('id', c_uint),
+ ('sliceCount', c_uint),
+ ('instanceCount', c_uint),
+ ('multiprocessorCount', c_uint),
+ ('sharedCopyEngineCount', c_uint),
+ ('sharedDecoderCount', c_uint),
+ ('sharedEncoderCount', c_uint),
+ ('sharedJpegCount', c_uint),
+ ('sharedOfaCount', c_uint),
]
-dcgmDeviceAttributes_version1 = make_dcgm_version(c_dcgmDeviceAttributes_v1, 1)
-
+dcgmComputeInstanceProfileInfo_version1 = make_dcgm_version(
+ c_dcgmComputeInstanceProfileInfo_v1, 1)
-class c_dcgmDeviceAttributes_v2(_PrintableStructure):
- """
- Represents attributes corresponding to a device
- """
+# /**
+# * Represents Compute instance profiles
+# */
+class c_dcgmComputeInstanceProfiles_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
- ("thermalSettings", c_dcgmDeviceThermals_v1),
- ("powerLimits", c_dcgmDevicePowerLimits_v1),
- ("identifiers", c_dcgmDeviceIdentifiers_v1),
- ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
- ("settings", c_dcgmDeviceSettings_v1),
+ ('version', c_uint),
+ ('profileCount', c_uint),
+ ('profileInfo', c_dcgmComputeInstanceProfileInfo_v1),
]
-dcgmDeviceAttributes_version2 = make_dcgm_version(c_dcgmDeviceAttributes_v2, 2)
+dcgmComputeInstanceProfiles_version1 = make_dcgm_version(
+ c_dcgmComputeInstanceProfiles_v1, 1)
+# /**
+# * Represents vGPU attributes corresponding to a device
+# */
class c_dcgmVgpuDeviceAttributes_v6(_PrintableStructure):
- """
- Represents vGPU attributes corresponding to a device
- """
-
_fields_ = [
- ("version", c_uint),
- ("activeVgpuInstanceCount", c_uint),
- ("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
- ("creatableVgpuTypeCount", c_uint),
- ("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
- ("supportedVgpuTypeCount", c_uint),
- (
- "supportedVgpuTypeInfo",
- c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU,
- ),
- ("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
- ("gpuUtil", c_uint),
- ("memCopyUtil", c_uint),
- ("encUtil", c_uint),
- ("decUtil", c_uint),
+ ('version', c_uint), ('activeVgpuInstanceCount', c_uint),
+ ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
+ ('creatableVgpuTypeCount', c_uint),
+ ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('supportedVgpuTypeCount', c_uint),
+ ('supportedVgpuTypeInfo',
+ c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('vgpuUtilInfo',
+ c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint),
+ ('decUtil', c_uint)
]
-dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v6, 1)
+dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(
+ c_dcgmVgpuDeviceAttributes_v6, 1)
-class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure):
- """
- Represents attributes specific to vGPU instance
- """
-
+class c_dcgmVgpuDeviceAttributes_v7(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("vmId", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
- ("vmName", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
- ("vgpuTypeId", c_uint),
- ("vgpuUuid", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
- ("vgpuDriverVersion", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
- ("fbUsage", c_uint),
- ("licenseStatus", c_uint),
- ("frameRateLimit", c_uint),
+ ('version', c_uint), ('activeVgpuInstanceCount', c_uint),
+ ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
+ ('creatableVgpuTypeCount', c_uint),
+ ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('supportedVgpuTypeCount', c_uint),
+ ('supportedVgpuTypeInfo',
+ c_dcgmDeviceVgpuTypeInfo_v2 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('vgpuUtilInfo',
+ c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
+ ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint),
+ ('decUtil', c_uint)
]
+dcgmVgpuDeviceAttributes_version7 = make_dcgm_version(
+ c_dcgmVgpuDeviceAttributes_v7, 7)
+
+
+# /**
+# * Represents attributes specific to vGPU instance
+# */
+class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure):
+ _fields_ = [('version', c_uint),
+ ('vmId', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
+ ('vmName', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
+ ('vgpuTypeId', c_uint),
+ ('vgpuUuid', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
+ ('vgpuDriverVersion', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
+ ('fbUsage', c_uint), ('licenseStatus', c_uint),
+ ('frameRateLimit', c_uint)]
+
+
dcgmVgpuInstanceAttributes_version1 = make_dcgm_version(
- c_dcgmVgpuInstanceAttributes_v1, 1
-)
+ c_dcgmVgpuInstanceAttributes_v1, 1)
class c_dcgmConfigPowerLimit(_PrintableStructure):
- _fields_ = [("type", c_uint), ("val", c_uint)]
+ _fields_ = [('type', c_uint), ('val', c_uint)]
class c_dcgmConfigPerfStateSettings_t(_PrintableStructure):
@@ -1146,12 +1286,12 @@ class c_dcgmConfigPerfStateSettings_t(_PrintableStructure):
class c_dcgmDeviceConfig_v1(_PrintableStructure):
_fields_ = [
# version must always be first
- ("version", c_uint),
- ("gpuId", c_uint),
- ("mEccMode", c_uint),
- ("mComputeMode", c_uint),
- ("mPerfState", c_dcgmConfigPerfStateSettings_t),
- ("mPowerLimit", c_dcgmConfigPowerLimit),
+ ('version', c_uint),
+ ('gpuId', c_uint),
+ ('mEccMode', c_uint),
+ ('mComputeMode', c_uint),
+ ('mPerfState', c_dcgmConfigPerfStateSettings_t),
+ ('mPowerLimit', c_dcgmConfigPowerLimit)
]
@@ -1162,16 +1302,16 @@ class c_dcgmDeviceConfig_v1(_PrintableStructure):
class c_dcgmDeviceVgpuConfig_v1(_PrintableStructure):
_fields_ = [
# version must always be first
- ("version", c_uint),
- ("gpuId", c_uint),
- ("mEccMode", c_uint),
- ("mComputeMode", c_uint),
- ("mPerfState", c_dcgmConfigPerfStateSettings_t),
- ("mPowerLimit", c_dcgmConfigPowerLimit),
+ ('version', c_uint),
+ ('gpuId', c_uint),
+ ('mEccMode', c_uint),
+ ('mComputeMode', c_uint),
+ ('mPerfState', c_dcgmConfigPerfStateSettings_t),
+ ('mPowerLimit', c_dcgmConfigPowerLimit)
]
def SetBlank(self):
- # Does not set version or gpuId
+ #Does not set version or gpuId
self.mEccMode = dcgmvalue.DCGM_INT32_BLANK
self.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
self.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
@@ -1224,7 +1364,8 @@ class c_dcgmUnwatchFieldValue_v1(_PrintableStructure):
_fields_ = []
-dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, 1)
+dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1,
+ 1)
class c_dcgmUpdateAllFields_v1(_PrintableStructure):
@@ -1233,9 +1374,19 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure):
dcgmUpdateAllFields_version1 = make_dcgm_version(c_dcgmUpdateAllFields_v1, 1)
-dcgmGetMultipleValuesForField_version1 = 1
+dcgmGetMultipleValuesForFieldResponse_version1 = 1
+
+# policy enums (and table indices)
+DCGM_POLICY_COND_IDX_DBE = 0
+DCGM_POLICY_COND_IDX_PCI = 1
+DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED = 2
+DCGM_POLICY_COND_IDX_THERMAL = 3
+DCGM_POLICY_COND_IDX_POWER = 4
+DCGM_POLICY_COND_IDX_NVLINK = 5
+DCGM_POLICY_COND_IDX_XID = 6
+DCGM_POLICY_COND_IDX_MAX = 7
-# policy enums
+# policy enum bitmasks
DCGM_POLICY_COND_DBE = 0x1
DCGM_POLICY_COND_PCI = 0x2
DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4
@@ -1251,12 +1402,13 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure):
DCGM_POLICY_ISOLATION_NONE = 0
DCGM_POLICY_ACTION_NONE = 0
-DCGM_POLICY_ACTION_GPURESET = 1 # Deprecated
+DCGM_POLICY_ACTION_GPURESET = 1 #Deprecated
DCGM_POLICY_VALID_NONE = 0
DCGM_POLICY_VALID_SV_SHORT = 1
DCGM_POLICY_VALID_SV_MED = 2
DCGM_POLICY_VALID_SV_LONG = 3
+DCGM_POLICY_VALID_SV_XLONG = 4
DCGM_POLICY_FAILURE_NONE = 0
@@ -1264,6 +1416,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure):
DCGM_DIAG_LVL_SHORT = 10
DCGM_DIAG_LVL_MED = 20
DCGM_DIAG_LVL_LONG = 30
+DCGM_DIAG_LVL_XLONG = 40
DCGM_DIAG_RESULT_PASS = 0
DCGM_DIAG_RESULT_SKIP = 1
@@ -1272,7 +1425,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure):
DCGM_DIAG_RESULT_NOT_RUN = 4
-class c_dcgmPolicyConditionParmTypes_t(Union):
+class c_dcgmPolicyConditionParmTypes_t(DcgmUnion):
_fields_ = [
("boolean", c_bool),
("llval", c_longlong),
@@ -1280,7 +1433,7 @@ class c_dcgmPolicyConditionParmTypes_t(Union):
class c_dcgmPolicyConditionParms_t(_PrintableStructure):
- _fields_ = [("tag", c_uint), ("val", c_dcgmPolicyConditionParmTypes_t)]
+ _fields_ = [('tag', c_uint), ('val', c_dcgmPolicyConditionParmTypes_t)]
class c_dcgmPolicy_v1(_PrintableStructure):
@@ -1303,39 +1456,39 @@ class c_dcgmPolicy_v1(_PrintableStructure):
class c_dcgmPolicyConditionPci_t(_PrintableStructure):
_fields_ = [
("timestamp", c_longlong), # timestamp of the error
- ("counter", c_uint), # value of the PCIe replay counter
+ ("counter", c_uint) # value of the PCIe replay counter
]
class c_dcgmPolicyConditionDbe_t(_PrintableStructure):
- LOCATIONS = {"L1": 0, "L2": 1, "DEVICE": 2, "REGISTER": 3, "TEXTURE": 4}
+ LOCATIONS = {'L1': 0, 'L2': 1, 'DEVICE': 2, 'REGISTER': 3, 'TEXTURE': 4}
_fields_ = [
- ("timestamp", c_longlong), # timestamp of the error
+ ("timestamp", c_longlong), # timestamp of the error
("location", c_int), # location of the error (one of self.LOCATIONS)
- ("numerrors", c_uint), # number of errors
+ ("numerrors", c_uint) # number of errors
]
class c_dcgmPolicyConditionMpr_t(_PrintableStructure):
_fields_ = [
- ("timestamp", c_longlong), # timestamp of the error
- ("sbepages", c_uint), # number of pending pages due to SBE
- ("dbepages", c_uint), # number of pending pages due to DBE
+ ("timestamp", c_longlong), # timestamp of the error
+ ("sbepages", c_uint), # number of pending pages due to SBE
+ ("dbepages", c_uint) # number of pending pages due to DBE
]
class c_dcgmPolicyConditionThermal_t(_PrintableStructure):
_fields_ = [
("timestamp", c_longlong), # timestamp of the error
- ("thermalViolation", c_uint), # Temperature reached that violated policy
+ ("thermalViolation", c_uint) # Temperature reached that violated policy
]
class c_dcgmPolicyConditionPower_t(_PrintableStructure):
_fields_ = [
("timestamp", c_longlong), # timestamp of the error
- ("powerViolation", c_uint), # Power value reached that violated policyy
+        ("powerViolation", c_uint)  # Power value reached that violated policy
]
@@ -1343,59 +1496,49 @@ class c_dcgmPolicyConditionNvlink_t(_PrintableStructure):
_fields_ = [
("timestamp", c_longlong), # timestamp of the error
("fieldId", c_ushort), # FieldId of the nvlink error counter
- ("counter", c_uint), # Error value reached that violated policyy
+        ("counter", c_uint)  # Error value reached that violated policy
]
class c_dcgmPolicyConditionXID_t(_PrintableStructure):
_fields_ = [
("timestamp", c_longlong), # timestamp of the error
- ("errnum", c_uint), # XID error number
+ ("errnum", c_uint) # XID error number
]
class c_dcgmPolicyCallbackResponse_v1(_PrintableStructure):
- class Value(Union):
+
+ class Value(DcgmUnion):
# implement more of the fields when a test requires them
_fields_ = [
- ("dbe", c_dcgmPolicyConditionDbe_t), # ECC DBE return structure
- ("pci", c_dcgmPolicyConditionPci_t), # PCI replay error return structure
- (
- "mpr",
- c_dcgmPolicyConditionMpr_t,
- ), # Max retired pages limit return structure
- (
- "thermal",
- c_dcgmPolicyConditionThermal_t,
- ), # Thermal policy violations return structure
- (
- "power",
- c_dcgmPolicyConditionPower_t,
- ), # Power policy violations return structure
- (
- "nvlink",
- c_dcgmPolicyConditionNvlink_t,
+ ("dbe", c_dcgmPolicyConditionDbe_t
+ ), # ECC DBE return structure
+ ("pci", c_dcgmPolicyConditionPci_t
+ ), # PCI replay error return structure
+ ("mpr", c_dcgmPolicyConditionMpr_t
+ ), # Max retired pages limit return structure
+ ("thermal", c_dcgmPolicyConditionThermal_t
+ ), # Thermal policy violations return structure
+ ("power", c_dcgmPolicyConditionPower_t
+ ), # Power policy violations return structure
+ ("nvlink", c_dcgmPolicyConditionNvlink_t
), # Nvlink policy violations return structure..
- (
- "xid",
- c_dcgmPolicyConditionXID_t,
- ), # XID policy violations return structure
+ ("xid", c_dcgmPolicyConditionXID_t
+ ) # XID policy violations return structure
]
_fields_ = [
("version", c_uint),
("condition", c_int), # an OR'ed list of DCGM_POLICY_COND_*
- ("val", Value),
+ ("val", Value)
]
-class c_dcgmFieldValue_v1_value(Union):
- _fields_ = [
- ("i64", c_int64),
- ("dbl", c_double),
- ("str", c_char * DCGM_MAX_STR_LENGTH),
- ("blob", c_byte * DCGM_MAX_BLOB_LENGTH),
- ]
+class c_dcgmFieldValue_v1_value(DcgmUnion):
+ _fields_ = [('i64', c_int64), ('dbl', c_double),
+ ('str', c_char * DCGM_MAX_STR_LENGTH),
+ ('blob', c_byte * DCGM_MAX_BLOB_LENGTH)]
# This structure is used to represent value for the field to be queried.
@@ -1414,8 +1557,7 @@ class c_dcgmFieldValue_v1(_PrintableStructure):
dcgmFieldValue_version1 = make_dcgm_version(c_dcgmFieldValue_v1, 1)
-# This structure is used to represent value for the field to be queried
-# (version 2)
+# This structure is used to represent value for the field to be queried (version 2)
class c_dcgmFieldValue_v2(_PrintableStructure):
_fields_ = [
# version must always be first
@@ -1433,7 +1575,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure):
dcgmFieldValue_version2 = make_dcgm_version(c_dcgmFieldValue_v2, 2)
-# Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues()
+#Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues()
DCGM_FV_FLAG_LIVE_DATA = 0x00000001
DCGM_HEALTH_WATCH_PCIE = 0x1
@@ -1458,7 +1600,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure):
class c_dcgmDiagErrorDetail_t(_PrintableStructure):
- _fields_ = [("msg", c_char * 1024), ("code", c_uint)]
+ _fields_ = [('msg', c_char * 1024), ('code', c_uint)]
DCGM_HEALTH_WATCH_MAX_INCIDENTS = DCGM_GROUP_MAX_ENTITIES
@@ -1486,37 +1628,36 @@ class c_dcgmHealthResponse_v4(_PrintableStructure):
class c_dcgmHealthSetParams_v2(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("groupId", c_void_p),
- ("systems", c_uint32),
- ("updateInterval", c_int64),
- ("maxKeepAge", c_double),
- ]
+ _fields_ = [('version', c_uint32), ('groupId', c_void_p),
+ ('systems', c_uint32), ('updateInterval', c_int64),
+ ('maxKeepAge', c_double)]
dcgmHealthSetParams_version2 = make_dcgm_version(c_dcgmHealthSetParams_v2, 2)
-# Pid info structs
+#Pid info structs
class c_dcgmStatSummaryInt64_t(_PrintableStructure):
- _fields_ = [("minValue", c_int64), ("maxValue", c_int64), ("average", c_int64)]
+ _fields_ = [('minValue', c_int64), ('maxValue', c_int64),
+ ('average', c_int64)]
class c_dcgmStatSummaryInt32_t(_PrintableStructure):
- _fields_ = [("minValue", c_int32), ("maxValue", c_int32), ("average", c_int32)]
+ _fields_ = [('minValue', c_int32), ('maxValue', c_int32),
+ ('average', c_int32)]
class c_dcgmStatSummaryFp64_t(_PrintableStructure):
- _fields_ = [("minValue", c_double), ("maxValue", c_double), ("average", c_double)]
+ _fields_ = [('minValue', c_double), ('maxValue', c_double),
+ ('average', c_double)]
class c_dcgmProcessUtilInfo_t(_PrintableStructure):
- _fields_ = [("pid", c_uint), ("smUtil", c_double), ("memUtil", c_double)]
+ _fields_ = [('pid', c_uint), ('smUtil', c_double), ('memUtil', c_double)]
class c_dcgmHealthResponseInfo_t(_PrintableStructure):
- _fields_ = [("system", c_uint), ("health", c_uint)]
+ _fields_ = [('system', c_uint), ('health', c_uint)]
DCGM_MAX_PID_INFO_NUM = 16
@@ -1524,167 +1665,153 @@ class c_dcgmHealthResponseInfo_t(_PrintableStructure):
class c_dcgmPidSingleInfo_t(_PrintableStructure):
_fields_ = [
- ("gpuId", c_uint32),
- ("energyConsumed", c_int64),
- ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
- ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
- ("pcieReplays", c_int64),
- ("startTime", c_int64),
- ("endTime", c_int64),
- ("processUtilization", c_dcgmProcessUtilInfo_t),
- ("smUtilization", c_dcgmStatSummaryInt32_t),
- ("memoryUtilization", c_dcgmStatSummaryInt32_t),
- ("eccSingleBit", c_uint32), # Deprecated
- ("eccDoubleBit", c_uint32),
- ("memoryClock", c_dcgmStatSummaryInt32_t),
- ("smClock", c_dcgmStatSummaryInt32_t),
- ("numXidCriticalErrors", c_int32),
- ("xidCriticalErrorsTs", c_int64 * 10),
- ("numOtherComputePids", c_int32),
- ("otherComputePids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
- ("numOtherGraphicsPids", c_int32),
- ("otherGraphicsPids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
- ("maxGpuMemoryUsed", c_int64),
- ("powerViolationTime", c_int64),
- ("thermalViolationTime", c_int64),
- ("reliabilityViolationTime", c_int64),
- ("boardLimitViolationTime", c_int64),
- ("lowUtilizationTime", c_int64),
- ("syncBoostTime", c_int64),
- ("overallHealth", c_uint),
- ("incidentCount", c_uint),
- ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
+ ('gpuId', c_uint32),
+ ('energyConsumed', c_int64),
+ ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t),
+ ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t),
+ ('pcieReplays', c_int64),
+ ('startTime', c_int64),
+ ('endTime', c_int64),
+ ('processUtilization', c_dcgmProcessUtilInfo_t),
+ ('smUtilization', c_dcgmStatSummaryInt32_t),
+ ('memoryUtilization', c_dcgmStatSummaryInt32_t),
+ ('eccSingleBit', c_uint32), #Deprecated
+ ('eccDoubleBit', c_uint32),
+ ('memoryClock', c_dcgmStatSummaryInt32_t),
+ ('smClock', c_dcgmStatSummaryInt32_t),
+ ('numXidCriticalErrors', c_int32),
+ ('xidCriticalErrorsTs', c_int64 * 10),
+ ('numOtherComputePids', c_int32),
+ ('otherComputePids', c_uint32 * DCGM_MAX_PID_INFO_NUM),
+ ('numOtherGraphicsPids', c_int32),
+ ('otherGraphicsPids', c_uint32 * DCGM_MAX_PID_INFO_NUM),
+ ('maxGpuMemoryUsed', c_int64),
+ ('powerViolationTime', c_int64),
+ ('thermalViolationTime', c_int64),
+ ('reliabilityViolationTime', c_int64),
+ ('boardLimitViolationTime', c_int64),
+ ('lowUtilizationTime', c_int64),
+ ('syncBoostTime', c_int64),
+ ('overallHealth', c_uint),
+ ('incidentCount', c_uint),
+ ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1)
]
class c_dcgmPidInfo_v2(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("pid", c_uint32),
- ("unused", c_uint32),
- ("numGpus", c_int32),
- ("summary", c_dcgmPidSingleInfo_t),
- ("gpus", c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES),
- ]
+ _fields_ = [('version', c_uint32), ('pid', c_uint32), ('unused', c_uint32),
+ ('numGpus', c_int32), ('summary', c_dcgmPidSingleInfo_t),
+ ('gpus', c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES)]
dcgmPidInfo_version2 = make_dcgm_version(c_dcgmPidInfo_v2, 2)
class c_dcgmRunningProcess_v1(_PrintableStructure):
- _fields_ = [("version", c_uint32), ("pid", c_uint32), ("memoryUsed", c_uint64)]
+ _fields_ = [('version', c_uint32), ('pid', c_uint32),
+ ('memoryUsed', c_uint64)]
dcgmRunningProcess_version1 = make_dcgm_version(c_dcgmRunningProcess_v1, 1)
+c_dcgmRunningProcess_t = c_dcgmRunningProcess_v1
+
class c_dcgmGpuUsageInfo_t(_PrintableStructure):
_fields_ = [
- ("gpuId", c_uint32),
- ("energyConsumed", c_int64),
- ("powerUsage", c_dcgmStatSummaryFp64_t),
- ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
- ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
- ("pcieReplays", c_int64),
- ("startTime", c_int64),
- ("endTime", c_int64),
- ("smUtilization", c_dcgmStatSummaryInt32_t),
- ("memoryUtilization", c_dcgmStatSummaryInt32_t),
- ("eccSingleBit", c_uint32), # Deprecated
- ("eccDoubleBit", c_uint32),
- ("memoryClock", c_dcgmStatSummaryInt32_t),
- ("smClock", c_dcgmStatSummaryInt32_t),
- ("numXidCriticalErrors", c_int32),
- ("xidCriticalErrorsTs", c_int64 * 10),
- ("numComputePids", c_int32),
- ("computePids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
- ("numGraphicsPids", c_int32),
- ("graphicsPids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
- ("maxGpuMemoryUsed", c_int64),
- ("powerViolationTime", c_int64),
- ("thermalViolationTime", c_int64),
- ("reliabilityViolationTime", c_int64),
- ("boardLimitViolationTime", c_int64),
- ("lowUtilizationTime", c_int64),
- ("syncBoostTime", c_int64),
- ("overallHealth", c_uint),
- ("incidentCount", c_uint),
- ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
+ ('gpuId', c_uint32),
+ ('energyConsumed', c_int64),
+ ('powerUsage', c_dcgmStatSummaryFp64_t),
+ ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t),
+ ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t),
+ ('pcieReplays', c_int64),
+ ('startTime', c_int64),
+ ('endTime', c_int64),
+ ('smUtilization', c_dcgmStatSummaryInt32_t),
+ ('memoryUtilization', c_dcgmStatSummaryInt32_t),
+ ('eccSingleBit', c_uint32), #Deprecated
+ ('eccDoubleBit', c_uint32),
+ ('memoryClock', c_dcgmStatSummaryInt32_t),
+ ('smClock', c_dcgmStatSummaryInt32_t),
+ ('numXidCriticalErrors', c_int32),
+ ('xidCriticalErrorsTs', c_int64 * 10),
+ ('numComputePids', c_int32),
+ ('computePids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
+ ('numGraphicsPids', c_int32),
+ ('graphicsPids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
+ ('maxGpuMemoryUsed', c_int64),
+ ('powerViolationTime', c_int64),
+ ('thermalViolationTime', c_int64),
+ ('reliabilityViolationTime', c_int64),
+ ('boardLimitViolationTime', c_int64),
+ ('lowUtilizationTime', c_int64),
+ ('syncBoostTime', c_int64),
+ ('overallHealth', c_uint),
+ ('incidentCount', c_uint),
+ ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1)
]
class c_dcgmJobInfo_v3(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("numGpus", c_int32),
- ("summary", c_dcgmGpuUsageInfo_t),
- ("gpus", c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES),
- ]
+ _fields_ = [('version', c_uint32), ('numGpus', c_int32),
+ ('summary', c_dcgmGpuUsageInfo_t),
+ ('gpus', c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES)]
dcgmJobInfo_version3 = make_dcgm_version(c_dcgmJobInfo_v3, 3)
class c_dcgmDiagTestResult_v2(_PrintableStructure):
- _fields_ = [
- ("result", c_uint),
- ("error", c_dcgmDiagErrorDetail_t),
- ("info", c_char * 1024),
- ]
+ _fields_ = [('result', c_uint), ('error', c_dcgmDiagErrorDetail_t),
+ ('info', c_char * 1024)]
-class c_dcgmDiagResponsePerGpu_v2(_PrintableStructure):
- _fields_ = [
- ("gpuId", c_uint),
- ("hwDiagnosticReturn", c_uint),
- ("results", c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT),
- ]
+class c_dcgmDiagResponsePerGpu_v4(_PrintableStructure):
+ _fields_ = [('gpuId', c_uint), ('hwDiagnosticReturn', c_uint),
+ ('results',
+ c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT_V8)]
DCGM_SWTEST_COUNT = 10
LEVEL_ONE_MAX_RESULTS = 16
-class c_dcgmDiagResponse_v6(_PrintableStructure):
+class c_dcgmDiagResponse_v8(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("gpuCount", c_uint),
- ("levelOneTestCount", c_uint),
- ("levelOneResults", c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS),
- ("perGpuResponses", c_dcgmDiagResponsePerGpu_v2 * DCGM_MAX_NUM_DEVICES),
- ("systemError", c_dcgmDiagErrorDetail_t),
- ("trainingMsg", c_char * 1024),
+ ('version', c_uint), ('gpuCount', c_uint),
+ ('levelOneTestCount', c_uint),
+ ('levelOneResults', c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS),
+ ('perGpuResponses', c_dcgmDiagResponsePerGpu_v4 * DCGM_MAX_NUM_DEVICES),
+ ('systemError', c_dcgmDiagErrorDetail_t), ('_unused', c_char * 1024)
]
-dcgmDiagResponse_version6 = make_dcgm_version(c_dcgmDiagResponse_v6, 6)
+dcgmDiagResponse_version8 = make_dcgm_version(c_dcgmDiagResponse_v8, 8)
DCGM_AFFINITY_BITMASK_ARRAY_SIZE = 8
class c_dcgmDeviceTopologyPath_t(_PrintableStructure):
- _fields_ = [("gpuId", c_uint32), ("path", c_uint32), ("localNvLinkIds", c_uint32)]
+ _fields_ = [('gpuId', c_uint32), ('path', c_uint32),
+ ('localNvLinkIds', c_uint32)]
class c_dcgmDeviceTopology_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("cpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
- ("numGpus", c_uint32),
- ("gpuPaths", c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1)),
- ]
+ _fields_ = [('version', c_uint32),
+ ('cpuAffinityMask', c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
+ ('numGpus', c_uint32),
+ ('gpuPaths',
+ c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1))]
dcgmDeviceTopology_version1 = make_dcgm_version(c_dcgmDeviceTopology_v1, 1)
class c_dcgmGroupTopology_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("groupCpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
- ("numaOptimalFlag", c_uint32),
- ("slowestPath", c_uint32),
- ]
+ _fields_ = [('version', c_uint32),
+ ('groupCpuAffinityMask',
+ c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
+ ('numaOptimalFlag', c_uint32), ('slowestPath', c_uint32)]
dcgmGroupTopology_version1 = make_dcgm_version(c_dcgmGroupTopology_v1, 1)
@@ -1697,202 +1824,50 @@ class c_dcgmGroupTopology_v1(_PrintableStructure):
class c_dcgmFieldGroupInfo_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("numFieldIds", c_uint32),
- ("fieldGroupId", c_void_p),
- ("fieldGroupName", c_char * DCGM_MAX_STR_LENGTH),
- ("fieldIds", c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP),
- ]
+ _fields_ = [('version', c_uint32), ('numFieldIds', c_uint32),
+ ('fieldGroupId', c_void_p),
+ ('fieldGroupName', c_char * DCGM_MAX_STR_LENGTH),
+ ('fieldIds', c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP)]
dcgmFieldGroupInfo_version1 = make_dcgm_version(c_dcgmFieldGroupInfo_v1, 1)
class c_dcgmAllFieldGroup_v1(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("numFieldGroups", c_uint32),
- ("fieldGroups", c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS),
- ]
+ _fields_ = [('version', c_uint32), ('numFieldGroups', c_uint32),
+ ('fieldGroups',
+ c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS)]
dcgmAllFieldGroup_version1 = make_dcgm_version(c_dcgmAllFieldGroup_v1, 1)
-class DCGM_INTROSPECT_LVL(object):
- """
- Identifies a level to retrieve field introspection info for
- """
-
- INVALID = 0
- FIELD = 1
- FIELD_GROUP = 2
- ALL_FIELDS = 3
-
-
-class c_dcgmIntrospectContext_v1(_PrintableStructure):
- """
- Identifies the retrieval context for introspection API calls.
- """
-
- _fields_ = [
- ("version", c_uint32),
- # one of DCGM_INTROSPECT_LVL_?
- ("introspectLvl", c_int),
- # Only needed if \ref introspectLvl is FIELD_GROUP
- ("fieldGroupId", c_void_p),
- ]
-
-
-dcgmIntrospectContext_version1 = make_dcgm_version(c_dcgmIntrospectContext_v1, 1)
-
-
class c_dcgmIntrospectMemory_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint32),
- (
- # The total number of bytes being used to store all of the fields
- # being watched
- "bytesUsed",
- c_longlong,
- ),
+ ('version', c_uint32),
+ ('bytesUsed', c_longlong
+ ) # The total number of bytes being used to store all of the fields being watched
]
dcgmIntrospectMemory_version1 = make_dcgm_version(c_dcgmIntrospectMemory_v1, 1)
-class c_dcgmIntrospectFieldsExecTime_v1(_PrintableStructure):
- _fields_ = [
- (
- # version number (dcgmIntrospectFieldsExecTime_version)
- "version",
- c_uint32,
- ),
- (
- # the mean update frequency of all fields
- "meanUpdateFreqUsec",
- c_longlong,
- ),
- (
- # the sum of every field's most recent execution time after they
- # have been normalized to \ref meanUpdateFreqUsec.
- # This is roughly how long it takes to update fields every \ref
- # meanUpdateFreqUsec
- "recentUpdateUsec",
- c_double,
- ),
- (
- # The total amount of time, ever, that has been spent updating all
- # the fields
- "totalEverUpdateUsec",
- c_longlong,
- ),
- ]
-
-
-dcgmIntrospectFieldsExecTime_version1 = make_dcgm_version(
- c_dcgmIntrospectFieldsExecTime_v1, 1
-)
-
-
-class c_dcgmIntrospectFullFieldsExecTime_v2(_PrintableStructure):
- """
- Full introspection info for field execution time
- """
-
- _fields_ = [
- ("version", c_uint32),
- (
- "aggregateInfo",
- c_dcgmIntrospectFieldsExecTime_v1,
- ), # info that includes global and device scope
- (
- "hasGlobalInfo",
- c_int,
- ), # 0 means \ref globalInfo is populated, !0 means it's not
- (
- "globalInfo",
- c_dcgmIntrospectFieldsExecTime_v1,
- ), # info that only includes global field scope
- (
- "gpuInfoCount",
- c_uint,
- ), # count of how many entries in \ref gpuInfo are populated
- (
- "gpuIdsForGpuInfo",
- c_uint * DCGM_MAX_NUM_DEVICES,
- ), # the GPU ID at a given index identifies which gpu
- # the corresponding entry in \ref gpuInfo is from
- (
- "gpuInfo",
- c_dcgmIntrospectFieldsExecTime_v1 * DCGM_MAX_NUM_DEVICES,
- ), # info that is separated by the
- # GPU ID that the watches were for
- ]
-
-
-dcgmIntrospectFullFieldsExecTime_version2 = make_dcgm_version(
- c_dcgmIntrospectFullFieldsExecTime_v2, 2
-)
-
-
-class c_dcgmIntrospectFullMemory_v1(_PrintableStructure):
- """
- Full introspection info for field memory
- """
-
- _fields_ = [
- ("version", c_uint32),
- (
- "aggregateInfo",
- c_dcgmIntrospectMemory_v1,
- ), # info that includes global and device scope
- (
- "hasGlobalInfo",
- c_int,
- ), # 0 means \ref globalInfo is populated, !0 means it's not
- (
- "globalInfo",
- c_dcgmIntrospectMemory_v1,
- ), # info that only includes global field scope
- (
- "gpuInfoCount",
- c_uint,
- ), # count of how many entries in \ref gpuInfo are populated
- (
- "gpuIdsForGpuInfo",
- c_uint * DCGM_MAX_NUM_DEVICES,
- ), # the GPU ID at a given index identifies which gpu
- # the corresponding entry in \ref gpuInfo is from
- (
- "gpuInfo",
- c_dcgmIntrospectMemory_v1 * DCGM_MAX_NUM_DEVICES,
- ), # info that is separated by the
- # GPU ID that the watches were for
- ]
-
-
-dcgmIntrospectFullMemory_version1 = make_dcgm_version(c_dcgmIntrospectFullMemory_v1, 1)
-
-
class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint32), # version number (dcgmIntrospectCpuUtil_version)
- ("total", c_double), # fraction of device's CPU resources that were used
- (
- "kernel",
- c_double,
- ), # fraction of device's CPU resources that were used in kernel mode
- (
- "user",
- c_double,
- ), # fraction of device's CPU resources that were used in user mode
+ ('version', c_uint32
+ ), #!< version number (dcgmIntrospectCpuUtil_version)
+ ('total', c_double
+ ), #!< fraction of device's CPU resources that were used
+ ('kernel', c_double
+ ), #!< fraction of device's CPU resources that were used in kernel mode
+ ('user', c_double
+ ), #!< fraction of device's CPU resources that were used in user mode
]
-dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, 1)
+dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1,
+ 1)
DCGM_MAX_CONFIG_FILE_LEN = 10000
DCGM_MAX_TEST_NAMES = 20
@@ -1907,229 +1882,173 @@ class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure):
# Flags options for running the GPU diagnostic
DCGM_RUN_FLAGS_VERBOSE = 0x0001
DCGM_RUN_FLAGS_STATSONFAIL = 0x0002
+# UNUSED
DCGM_RUN_FLAGS_TRAIN = 0x0004
+# UNUSED
DCGM_RUN_FLAGS_FORCE_TRAIN = 0x0008
-# Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress,
-# and Diagnostic tests
-DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010
+DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010 # Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests
-class c_dcgmRunDiag_v6(_PrintableStructure):
+class c_dcgmRunDiag_v7(_PrintableStructure):
_fields_ = [
- ("version", c_uint), # version of this message
- (
- # flags specifying binary options for running it. Currently verbose
- # and stats on fail
- "flags",
- c_uint,
- ),
- (
- "debugLevel",
- c_uint,
+ ('version', c_uint), # version of this message
+ ('flags', c_uint
+ ), # flags specifying binary options for running it. Currently verbose and stats on fail
+ ('debugLevel', c_uint
), # 0-5 for the debug level the GPU diagnostic will use for logging
- (
- # group of GPUs to verify. Cannot be specified together with
- # gpuList.
- "groupId",
- c_void_p,
- ),
- ("validate", c_uint), # 0-3 for which tests to run. Optional.
- (
- "testNames",
- c_char * DCGM_MAX_TEST_NAMES * DCGM_MAX_TEST_NAMES_LEN,
- ), # Specified list of test names. Optional.
- (
- # Parameters to set for specified tests in the format:
- # testName.parameterName=parameterValue. Optional.
- "testParms",
- c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN,
- ),
- (
- # Comma-separated list of gpus. Cannot be specified with the
- # groupId.
- "gpuList",
- c_char * DCGM_GPU_LIST_LEN,
- ),
- (
- "debugLogFile",
- c_char * DCGM_PATH_LEN,
+ ('groupId', c_void_p
+ ), # group of GPUs to verify. Cannot be specified together with gpuList.
+ ('validate', c_uint), # 0-3 for which tests to run. Optional.
+ ('testNames', c_char * DCGM_MAX_TEST_NAMES *
+         DCGM_MAX_TEST_NAMES_LEN),  # Specified list of test names. Optional.
+ ('testParms', c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN
+ ), # Parameters to set for specified tests in the format: testName.parameterName=parameterValue. Optional.
+ ('fakeGpuList', c_char * DCGM_GPU_LIST_LEN
+ ), # Comma-separated list of fake gpus. Cannot be specified with the groupId or gpuList.
+ ('gpuList', c_char * DCGM_GPU_LIST_LEN
+ ), # Comma-separated list of gpus. Cannot be specified with the groupId.
+ ('debugLogFile', c_char * DCGM_PATH_LEN
), # Alternate name for the debug log file that should be used
- (
- "statsPath",
- c_char * DCGM_PATH_LEN,
+ ('statsPath', c_char * DCGM_PATH_LEN
), # Path that the plugin's statistics files should be written to
- (
- "configFileContents",
- c_char * DCGM_MAX_CONFIG_FILE_LEN,
+ ('configFileContents', c_char * DCGM_MAX_CONFIG_FILE_LEN
), # Contents of nvvs config file (likely yaml)
- (
- # Throttle reasons to ignore as either integer mask or csv list of
- # reasons
- "throttleMask",
- c_char * DCGM_THROTTLE_MASK_LEN,
- ),
- ("pluginPath", c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins
- ("trainingValues", c_uint), # Number of iterations for training.
- (
- # Acceptable training variance as a percentage of the value.
- # (0-100)
- "trainingVariance",
- c_uint,
- ),
- (
- # Acceptable training tolerance as a percentage of the value.
- # (0-100)
- "trainingTolerance",
- c_uint,
- ),
- (
- "goldenValuesFile",
- c_char * DCGM_PATH_LEN,
- ), # The path where the golden values should be recorded
- (
- # How often the fail early checks should occur when
- # DCGM_RUN_FLAGS_FAIL_EARLY is set.
- "failCheckInterval",
- c_uint,
- ),
+ ('throttleMask', c_char * DCGM_THROTTLE_MASK_LEN
+ ), # Throttle reasons to ignore as either integer mask or csv list of reasons
+ ('pluginPath',
+ c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins
+ ('_unusedInt1', c_uint), # Unused
+ ('_unusedInt2', c_uint), # Unused
+ ('_unusedInt3', c_uint), # Unused
+ ('_unusedBuf', c_char * DCGM_PATH_LEN), # Unused
+ ('failCheckInterval', c_uint
+ ), # How often the fail early checks should occur when DCGM_RUN_FLAGS_FAIL_EARLY is set.
]
-dcgmRunDiag_version6 = make_dcgm_version(c_dcgmRunDiag_v6, 6)
+dcgmRunDiag_version7 = make_dcgm_version(c_dcgmRunDiag_v7, 7)
# Latest c_dcgmRunDiag class
-c_dcgmRunDiag_t = c_dcgmRunDiag_v6
+c_dcgmRunDiag_t = c_dcgmRunDiag_v7
# Latest version for dcgmRunDiag_t
-dcgmRunDiag_version = dcgmRunDiag_version6
+dcgmRunDiag_version = dcgmRunDiag_version7
-# Flags for dcgmGetEntityGroupEntities's flags parameter
-# Only return entities that are supported by DCGM.
-DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001
+#Flags for dcgmGetEntityGroupEntities's flags parameter
+DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001 #Only return entities that are supported by DCGM.
-# Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS
-# NVLink link recovery error occurred
-DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1
-# NVLink link fatal error occurred
-DCGM_GPU_NVLINK_ERROR_FATAL = 2
+#Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS
+DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 # NVLink link recovery error occurred
+DCGM_GPU_NVLINK_ERROR_FATAL = 2 # NVLink link fatal error occurred
# Topology hints for dcgmSelectGpusByTopology()
DCGM_TOPO_HINT_F_NONE = 0x00000000 # No hints specified
-# Ignore the health of the GPUs when picking GPUs for job execution.
-DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001
+DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001 # Ignore the health of the GPUs when picking GPUs for job execution.
# By default, only healthy GPUs are considered.
class c_dcgmTopoSchedHint_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint), # version of this message
- ("inputGpuIds", c_uint64), # bitmask of the GPU ids to choose from
- ("numGpus", c_uint32), # the number of GPUs that DCGM should choose
- (
- "hintFlags",
- c_uint64,
- ), # Hints to ignore certain factors for the scheduling hint
+ ('version', c_uint), # version of this message
+ ('inputGpuIds', c_uint64), # bitmask of the GPU ids to choose from
+        ('numGpus', c_uint32),  # the number of GPUs that DCGM should choose
+ ('hintFlags',
+ c_uint64), # Hints to ignore certain factors for the scheduling hint
]
dcgmTopoSchedHint_version1 = make_dcgm_version(c_dcgmTopoSchedHint_v1, 1)
-# DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and
-# c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field
-# NvLink is unsupported by this GPU (Default for GPUs)
-DcgmNvLinkLinkStateNotSupported = 0
-# NvLink is supported for this link but this link is disabled (Default for
-# NvSwitches)
-DcgmNvLinkLinkStateDisabled = 1
-# This NvLink link is down (inactive)
-DcgmNvLinkLinkStateDown = 2
-# This NvLink link is up (active)
-DcgmNvLinkLinkStateUp = 3
+#DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field
+DcgmNvLinkLinkStateNotSupported = 0 # NvLink is unsupported by this GPU (Default for GPUs)
+DcgmNvLinkLinkStateDisabled = 1 # NvLink is supported for this link but this link is disabled (Default for NvSwitches)
+DcgmNvLinkLinkStateDown = 2 # This NvLink link is down (inactive)
+DcgmNvLinkLinkStateUp = 3 # This NvLink link is up (active)
# State of NvLink links for a GPU
class c_dcgmNvLinkGpuLinkStatus_v1(_PrintableStructure):
_fields_ = [
- ("entityId", c_uint32), # Entity ID of the GPU (gpuId)
- (
- "linkState",
- c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1,
- ), # Link state of each link of this GPU
+ ('entityId', c_uint32), # Entity ID of the GPU (gpuId)
+ ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1
+ ), #Link state of each link of this GPU
]
# State of NvLink links for a GPU
class c_dcgmNvLinkGpuLinkStatus_v2(_PrintableStructure):
_fields_ = [
- ("entityId", c_uint32), # Entity ID of the GPU (gpuId)
- (
- "linkState",
- c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU,
- ), # Link state of each link of this GPU
+ ('entityId', c_uint32), # Entity ID of the GPU (gpuId)
+ ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2
+ ), #Link state of each link of this GPU
+ ]
+
+
+class c_dcgmNvLinkGpuLinkStatus_v3(_PrintableStructure):
+ _fields_ = [
+ ('entityId', c_uint32), # Entity ID of the GPU (gpuId)
+ ('linkState', c_uint32 *
+ DCGM_NVLINK_MAX_LINKS_PER_GPU), #Link state of each link of this GPU
]
-# State of NvLink links for a NvSwitch
-class c_dcgmNvLinkNvSwitchLinkStatus_t(_PrintableStructure):
+#State of NvLink links for a NvSwitch
+class c_dcgmNvLinkNvSwitchLinkStatus_v1(_PrintableStructure):
_fields_ = [
- ("entityId", c_uint32), # Entity ID of the NvSwitch (physicalId)
- (
- "linkState",
- c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH,
- ), # Link state of each link of this NvSwitch
+ ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId)
+ ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1
+ ) #Link state of each link of this NvSwitch
]
-class c_dcgmNvLinkStatus_v1(_PrintableStructure):
+class c_dcgmNvLinkStatus_v2(_PrintableStructure):
"""
NvSwitch link status for all GPUs and NvSwitches in the system
"""
_fields_ = [
- (
- "version",
- c_uint32,
+ ('version', c_uint32
), # version of this message. Should be dcgmNvLinkStatus_version1
- ("numGpus", c_uint32), # Number of GPUs populated in gpus[]
- (
- "gpus",
- c_dcgmNvLinkGpuLinkStatus_v1 * DCGM_MAX_NUM_DEVICES,
- ), # Per-GPU NvLink link statuses
- ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[]
- (
- "nvSwitches",
- c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES,
- ), # Per-NvSwitch NvLink link statuses
+ ('numGpus', c_uint32), # Number of GPUs populated in gpus[]
+ ('gpus', c_dcgmNvLinkGpuLinkStatus_v2 *
+ DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses
+ ('numNvSwitches',
+ c_uint32), # Number of NvSwitches populated in nvSwitches[]
+ ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v1 * DCGM_MAX_NUM_SWITCHES
+ ) #Per-NvSwitch NvLink link statuses
]
-dcgmNvLinkStatus_version1 = make_dcgm_version(c_dcgmNvLinkStatus_v1, 1)
+dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2)
-class c_dcgmNvLinkStatus_v2(_PrintableStructure):
- """
- NvSwitch link status for all GPUs and NvSwitches in the system
- """
+#State of NvLink links for a NvSwitch
+class c_dcgmNvLinkNvSwitchLinkStatus_v2(_PrintableStructure):
+ _fields_ = [
+ ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId)
+ ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH
+ ) #Link state of each link of this NvSwitch
+ ]
+
+class c_dcgmNvLinkStatus_v3(_PrintableStructure):
+ '''
+ NvSwitch link status for all GPUs and NvSwitches in the system
+ '''
_fields_ = [
- (
- "version",
- c_uint32,
+ ('version', c_uint32
), # version of this message. Should be dcgmNvLinkStatus_version1
- ("numGpus", c_uint32), # Number of GPUs populated in gpus[]
- (
- "gpus",
- c_dcgmNvLinkGpuLinkStatus_v2 * DCGM_MAX_NUM_DEVICES,
- ), # Per-GPU NvLink link statuses
- ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[]
- (
- "nvSwitches",
- c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES,
- ), # Per-NvSwitch NvLink link statuses
+ ('numGpus', c_uint32), # Number of GPUs populated in gpus[]
+ ('gpus', c_dcgmNvLinkGpuLinkStatus_v3 *
+ DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses
+ ('numNvSwitches',
+ c_uint32), # Number of NvSwitches populated in nvSwitches[]
+ ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v2 * DCGM_MAX_NUM_SWITCHES
+ ) #Per-NvSwitch NvLink link statuses
]
-dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2)
+dcgmNvLinkStatus_version3 = make_dcgm_version(c_dcgmNvLinkStatus_v3, 3)
# Bitmask values for dcgmGetFieldIdSummary
DCGM_SUMMARY_MIN = 0x00000001
@@ -2143,7 +2062,8 @@ class c_dcgmNvLinkStatus_v2(_PrintableStructure):
class c_dcgmSummaryResponse_t(_PrintableStructure):
- class ResponseValue(Union):
+
+ class ResponseValue(DcgmUnion):
_fields_ = [
("i64", c_int64),
("dbl", c_double),
@@ -2169,7 +2089,8 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure):
]
-dcgmFieldSummaryRequest_version1 = make_dcgm_version(c_dcgmFieldSummaryRequest_v1, 1)
+dcgmFieldSummaryRequest_version1 = make_dcgm_version(
+ c_dcgmFieldSummaryRequest_v1, 1)
# Module IDs
DcgmModuleIdCore = 0 # Core DCGM
@@ -2184,90 +2105,61 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure):
DcgmModuleIdCount = 9 # 1 greater than largest ID above
# Module Status
-# Module has not been loaded yet
-DcgmModuleStatusNotLoaded = 0
-# Module has been blacklisted from being loaded
-DcgmModuleStatusBlacklisted = 1
-# Loading the module failed
-DcgmModuleStatusFailed = 2
-# Module has been loaded
-DcgmModuleStatusLoaded = 3
+DcgmModuleStatusNotLoaded = 0 # Module has not been loaded yet
+DcgmModuleStatusDenylisted = 1 # Module has been added to the denylist so it can't be loaded
+DcgmModuleStatusFailed = 2 # Loading the module failed
+DcgmModuleStatusLoaded = 3 # Module has been loaded
+DcgmModuleStatusUnloaded = 4 # Module has been unloaded
+DcgmModuleStatusPaused = 5 # Module has been paused. Implies it's been loaded
DCGM_MODULE_STATUSES_CAPACITY = 16
class c_dcgmModuleGetStatusesModule_t(_PrintableStructure):
_fields_ = [
- ("id", c_uint32), # One of DcgmModuleId*
- ("status", c_uint32), # One of DcgmModuleStatus*
+ ('id', c_uint32), #One of DcgmModuleId*
+ ('status', c_uint32), #One of DcgmModuleStatus*
]
class c_dcgmModuleGetStatuses_v1(_PrintableStructure):
_fields_ = [
- ("version", c_uint),
- ("numStatuses", c_uint32),
- ("statuses", c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY),
- ]
-
-
-dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, 1)
-
-# Maximum number of metric ID groups that can exist in DCGM
-DCGM_PROF_MAX_NUM_GROUPS = 10
-# Maximum number of field IDs that can be in a single DCGM profiling metric
-# group
-DCGM_PROF_MAX_FIELD_IDS_PER_GROUP = 8
-
-
-class c_dcgmProfMetricGroupInfo_t(_PrintableStructure):
- _fields_ = [
- ("majorId", c_ushort),
- ("minorId", c_ushort),
- ("numFieldIds", c_uint32),
- ("fieldIds", c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP),
+ ('version', c_uint),
+ ('numStatuses', c_uint32),
+ ('statuses',
+ c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY),
]
-class c_dcgmProfGetMetricGroups_v2(_PrintableStructure):
- _fields_ = [
- ("version", c_uint32),
- ("unused", c_uint32),
- ("groupId", c_void_p),
- ("numMetricGroups", c_uint32),
- ("unused1", c_uint32),
- ("metricGroups", c_dcgmProfMetricGroupInfo_t * DCGM_PROF_MAX_NUM_GROUPS),
- ]
+dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1,
+ 1)
+DCGM_PROF_MAX_NUM_GROUPS_V2 = 10 # Maximum number of metric ID groups that can exist in DCGM
+DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 = 64 # Maximum number of field IDs that can be in a single DCGM profiling metric group
-dcgmProfGetMetricGroups_version1 = make_dcgm_version(c_dcgmProfGetMetricGroups_v2, 2)
-
-class c_dcgmProfWatchFields_v1(_PrintableStructure):
+class c_dcgmProfMetricGroupInfo_v2(_PrintableStructure):
_fields_ = [
- ("version", c_uint32),
- ("groupId", c_void_p),
- ("numFieldIds", c_uint32),
- ("fieldIds", c_ushort * 16),
- ("updateFreq", c_int64),
- ("maxKeepAge", c_double),
- ("maxKeepSamples", c_int32),
- ("flags", c_uint32),
+ ('majorId', c_ushort),
+ ('minorId', c_ushort),
+ ('numFieldIds', c_uint32),
+ ('fieldIds', c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2),
]
-dcgmProfWatchFields_version1 = make_dcgm_version(c_dcgmProfWatchFields_v1, 1)
-
-
-class c_dcgmProfUnwatchFields_v1(_PrintableStructure):
+class c_dcgmProfGetMetricGroups_v3(_PrintableStructure):
_fields_ = [
- ("version", c_uint32),
- ("groupId", c_void_p),
- ("flags", c_uint32),
+ ('version', c_uint32),
+ ('unused', c_uint32),
+ ('gpuId', c_uint32),
+ ('numMetricGroups', c_uint32),
+ ('metricGroups',
+ c_dcgmProfMetricGroupInfo_v2 * DCGM_PROF_MAX_NUM_GROUPS_V2),
]
-dcgmProfUnwatchFields_version1 = make_dcgm_version(c_dcgmProfUnwatchFields_v1, 1)
+dcgmProfGetMetricGroups_version3 = make_dcgm_version(
+ c_dcgmProfGetMetricGroups_v3, 3)
class c_dcgmVersionInfo_v2(_PrintableStructure):
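
The struct churn above replaces several request/response layouts and bumps their version constants (dcgmRunDiag_version7, dcgmDiagResponse_version8, dcgmNvLinkStatus_version3, dcgmProfGetMetricGroups_version3). Callers that build these ctypes structs by hand have to set the version field to the constant that matches the new layout. A minimal sketch for the v7 diag request defined above; the commented-out dcgm_agent.dcgmActionValidate_v2 call is an assumption about the agent-side entry point, not something shown in this diff:

import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent

runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7  # must match the struct layout above
runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
runDiagInfo.validate = 1                                 # level-1 ("short") validation
runDiagInfo.gpuList = b"0"                               # c_char arrays take bytes in Python 3
# responseV8 = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)  # assumed entry point; needs a live DCGM handle
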
diff --git a/model_analyzer/monitor/dcgm/dcgm_telegraf.py b/model_analyzer/monitor/dcgm/dcgm_telegraf.py
new file mode 100644
index 000000000..63563662e
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgm_telegraf.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from model_analyzer.monitor.dcgm.common.dcgm_client_main import main
+from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader
+from socket import socket, AF_INET, SOCK_DGRAM
+
+# Displayed to the user
+TELEGRAF_NAME = 'Telegraf'
+DEFAULT_TELEGRAF_PORT = 8094
+
+# Telegraf Configuration
+# ======================
+#
+# In order for Telegraf to understand the format of the data sent by this
+# module, it needs to be configured with the input plugin below
+#
+# If you modify the list of published fields, you will need to add non-numeric
+# ones as tag_keys for Telegraf to store them
+#
+# [[inputs.socket_listener]]
+# name_override = "dcgm"
+# service_address = "udp://:8094"
+# data_format = "json"
+# tag_keys = [
+# "compute_pids",
+# "driver_version",
+# "gpu_uuid",
+# "nvml_version",
+# "process_name",
+# "xid_errors"
+# ]
+
+
+class DcgmTelegraf(DcgmJsonReader):
+ ###########################################################################
+ def __init__(self, publish_hostname, publish_port, **kwargs):
+ self.m_sock = socket(AF_INET, SOCK_DGRAM)
+ self.m_dest = (publish_hostname, publish_port)
+ super(DcgmTelegraf, self).__init__(**kwargs)
+
+ ###########################################################################
+ def SendToTelegraf(self, payload):
+ self.m_sock.sendto(payload, self.m_dest)
+
+ ###########################################################################
+ def CustomJsonHandler(self, outJson):
+ self.SendToTelegraf(outJson)
+
+
+if __name__ == '__main__': # pragma: no cover
+ main(DcgmTelegraf,
+ TELEGRAF_NAME,
+ DEFAULT_TELEGRAF_PORT,
+ add_target_host=True)
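
The socket_listener block in the comment above is the only Telegraf-side configuration this reader needs. A quick way to confirm Telegraf is listening is to push a single JSON datagram at the same UDP port by hand, which mirrors what SendToTelegraf() does; the field names in the payload below are illustrative, not a fixed schema:

import json
from socket import socket, AF_INET, SOCK_DGRAM

payload = json.dumps({"gpu_uuid": "GPU-00000000-0000-0000-0000-000000000000",
                      "power_usage": 55.0})
sock = socket(AF_INET, SOCK_DGRAM)
# socket.sendto() takes bytes in Python 3, so encode the JSON string first
sock.sendto(payload.encode("utf-8"), ("localhost", 8094))  # 8094 == DEFAULT_TELEGRAF_PORT above
sock.close()

If the datagram shows up as a "dcgm" measurement (the name_override in the config), the listener and tag_keys mapping are wired correctly.
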
diff --git a/model_analyzer/monitor/dcgm/dcgmvalue.py b/model_analyzer/monitor/dcgm/dcgmvalue.py
new file mode 100644
index 000000000..d26625d50
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/dcgmvalue.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base value for integer blank. can be used as an unspecified blank
+DCGM_INT32_BLANK = 0x7ffffff0
+DCGM_INT64_BLANK = 0x7ffffffffffffff0
+
+# Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa,
+#so 47 bits can still increment by 1 and represent each value from 0-15
+DCGM_FP64_BLANK = 140737488355328.0
+
+DCGM_STR_BLANK = "<<>>"
+
+# Represents an error where data was not found
+DCGM_INT32_NOT_FOUND = (DCGM_INT32_BLANK + 1)
+DCGM_INT64_NOT_FOUND = (DCGM_INT64_BLANK + 1)
+DCGM_FP64_NOT_FOUND = (DCGM_FP64_BLANK + 1.0)
+DCGM_STR_NOT_FOUND = "<<>>"
+
+# Represents an error where fetching the value is not supported
+DCGM_INT32_NOT_SUPPORTED = (DCGM_INT32_BLANK + 2)
+DCGM_INT64_NOT_SUPPORTED = (DCGM_INT64_BLANK + 2)
+DCGM_FP64_NOT_SUPPORTED = (DCGM_FP64_BLANK + 2.0)
+DCGM_STR_NOT_SUPPORTED = "<<>>"
+
+# Represents an error where fetching the value is not allowed with our current credentials
+DCGM_INT32_NOT_PERMISSIONED = (DCGM_INT32_BLANK + 3)
+DCGM_INT64_NOT_PERMISSIONED = (DCGM_INT64_BLANK + 3)
+DCGM_FP64_NOT_PERMISSIONED = (DCGM_FP64_BLANK + 3.0)
+DCGM_STR_NOT_PERMISSIONED = "<<>>"
+
+
+###############################################################################
+# Functions to check if a value is blank or not
+def DCGM_INT32_IS_BLANK(val):
+ if val >= DCGM_INT32_BLANK:
+ return True
+ else:
+ return False
+
+
+def DCGM_INT64_IS_BLANK(val):
+ if val >= DCGM_INT64_BLANK:
+ return True
+ else:
+ return False
+
+
+def DCGM_FP64_IS_BLANK(val):
+ if val >= DCGM_FP64_BLANK:
+ return True
+ else:
+ return False
+
+
+#Looks for <<< at first position and >>> inside string
+def DCGM_STR_IS_BLANK(val):
+ if 0 != val.find("<<<"):
+ return False
+ elif 0 > val.find(">>>"):
+ return False
+ return True
+
+
+###############################################################################
+class DcgmValue:
+
+ def __init__(self, value):
+ self.value = value #Contains either an integer (int64), string, or double of the actual value
+
+ ###########################################################################
+ def SetFromInt32(self, i32Value):
+ '''
+ Handle the special case where our source data was an int32 but is currently
+ stored in a python int (int64), dealing with blanks
+ '''
+ value = int(i32Value)
+
+ if not DCGM_INT32_IS_BLANK(i32Value):
+ self.value = value
+ return
+
+ if value == DCGM_INT32_NOT_FOUND:
+ self.value = DCGM_INT64_NOT_FOUND
+ elif value == DCGM_INT32_NOT_SUPPORTED:
+ self.value = DCGM_INT64_NOT_SUPPORTED
+ elif value == DCGM_INT32_NOT_PERMISSIONED:
+ self.value = DCGM_INT64_NOT_PERMISSIONED
+ else:
+ self.value = DCGM_INT64_BLANK
+
+ ###########################################################################
+ def IsBlank(self):
+ '''
+ Returns True if the currently-stored value is a blank value. False if not
+ '''
+ if self.value is None:
+ return True
+        elif type(self.value) == int:
+ return DCGM_INT64_IS_BLANK(self.value)
+ elif type(self.value) == float:
+ return DCGM_FP64_IS_BLANK(self.value)
+ elif type(self.value) == str:
+ return DCGM_STR_IS_BLANK(self.value)
+ else:
+            raise Exception("Unknown type: %s" % str(type(self.value)))
+
+ ###########################################################################
+ def __str__(self):
+ return str(self.value)
+
+ ###########################################################################
+
+
+###############################################################################
+def self_test():
+
+ v = DcgmValue(1.0)
+ assert (not v.IsBlank())
+ assert (v.value == 1.0)
+
+ v = DcgmValue(100)
+ assert (not v.IsBlank())
+ assert (v.value == 100)
+
+ v = DcgmValue(DCGM_INT64_NOT_FOUND)
+ assert (v.IsBlank())
+
+ v = DcgmValue(DCGM_FP64_NOT_FOUND)
+ assert (v.IsBlank())
+
+ v.SetFromInt32(DCGM_INT32_NOT_SUPPORTED)
+ assert (v.IsBlank())
+ assert (v.value == DCGM_INT64_NOT_SUPPORTED)
+
+ print("Tests passed")
+ return
+
+
+###############################################################################
+if __name__ == "__main__":
+ self_test()
+
+###############################################################################
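
The blank constants above are sentinels stored in-band with real samples, so a consumer is expected to check a value with the DCGM_*_IS_BLANK helpers (or wrap it in DcgmValue) before treating it as a measurement. A short sketch of that guard, assuming the import path this repository uses for the module:

import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue

raw = dcgmvalue.DCGM_FP64_NOT_SUPPORTED  # e.g. a field this GPU does not expose
sample = dcgmvalue.DcgmValue(raw)
if sample.IsBlank():
    print("field unavailable, sentinel value:", sample)
else:
    print("power draw: %.1f W" % sample.value)
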
diff --git a/model_analyzer/monitor/dcgm/denylist_recommendations.py b/model_analyzer/monitor/dcgm/denylist_recommendations.py
new file mode 100644
index 000000000..38dafc624
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/denylist_recommendations.py
@@ -0,0 +1,573 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import sys
+import logging
+import json
+import os
+
+try:
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+ import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+ import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem
+except:
+ # If we don't find the bindings, add the default path and try again
+ if 'PYTHONPATH' in os.environ:
+ os.environ['PYTHONPATH'] = os.environ[
+ 'PYTHONPATH'] + ":/usr/local/dcgm/bindings"
+ else:
+ os.environ['PYTHONPATH'] = '/usr/local/dcgm/bindings'
+
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+    import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors
+    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+ import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem
+
+BR_ST_HEALTHY = 0x0000
+BR_ST_NOT_DETECTED = 0x0001
+BR_ST_FAILED_PASSIVE_HEALTH = 0x0002
+BR_ST_FAILED_ACTIVE_HEALTH = 0x0004
+
+BR_HEALTH_WATCH_BITMAP = dcgm_structs.DCGM_HEALTH_WATCH_ALL
+
+DIAG_SM_STRESS_DURATION = 90.0
+DIAG_CONSTANT_POWER_DURATION = 120.0
+DIAG_CONSTANT_STRESS_DURATION = 120.0
+DIAG_DIAGNOSTIC_DURATION = 300.0
+
+global g_gpus
+global g_switches
+g_gpus = []
+g_switches = []
+
+
+class Entity(object):
+
+ def __init__(self,
+ entityId,
+ entityType=dcgm_fields.DCGM_FE_GPU,
+ uuid=None,
+ bdf=None):
+ self.health = BR_ST_HEALTHY
+ self.entityType = entityType
+ self.entityId = entityId
+ self.reasonsUnhealthy = []
+ if uuid:
+ self.uuid = uuid
+ if bdf:
+ self.bdf = bdf
+
+ def IsHealthy(self):
+ return self.health == BR_ST_HEALTHY
+
+ def MarkUnhealthy(self, failCondition, reason):
+ self.health = self.health | failCondition
+ self.reasonsUnhealthy.append(reason)
+
+ def WhyUnhealthy(self):
+ return self.reasonsUnhealthy
+
+ def SetEntityId(self, entityId):
+ self.entityId = entityId
+
+ def GetEntityId(self):
+ return self.entityId
+
+ def GetUUID(self):
+ return self.uuid
+
+ def GetBDF(self):
+ return self.bdf
+
+
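+# Mark the entity with the given id as unhealthy; returns True if the id was found in the list.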
+def mark_entity_unhealthy(entities, entityId, code, reason):
+ found = False
+ for entity in entities:
+ if entityId == entity.GetEntityId():
+ entity.MarkUnhealthy(code, reason)
+ found = True
+
+ return found
+
+
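+# Copy a parameter string character by character into the fixed-size testParms buffer at paramIndex.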
+def addParamString(runDiagInfo, paramIndex, paramStr):
+ strIndex = 0
+ for c in paramStr:
+ runDiagInfo.testParms[paramIndex][strIndex] = c
+ strIndex = strIndex + 1
+
+
+def setTestDurations(runDiagInfo, timePercentage):
+    # We only reduce the test time for the default case (validate level 3)
+ if runDiagInfo.validate != 3:
+ return
+
+ stressDuration = int(DIAG_SM_STRESS_DURATION * timePercentage)
+ powerDuration = int(DIAG_CONSTANT_POWER_DURATION * timePercentage)
+ constantStressDuration = int(DIAG_CONSTANT_STRESS_DURATION * timePercentage)
+ diagDuration = int(DIAG_DIAGNOSTIC_DURATION * timePercentage)
+
+ smParam = "sm stress.test_duration=%d" % (stressDuration)
+ powerParam = "targeted power.test_duration=%d" % (powerDuration)
+ constantStressParam = "targeted stress.test_duration=%d" % (
+ constantStressDuration)
+ diagParam = "diagnostic.test_duration=%d" % (diagDuration)
+
+ addParamString(runDiagInfo, 0, diagParam)
+ addParamString(runDiagInfo, 1, smParam)
+ addParamString(runDiagInfo, 2, constantStressParam)
+ addParamString(runDiagInfo, 3, powerParam)
+
+
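+# Build the c_dcgmRunDiag_v7 request from the settings and return it along with the ids of the healthy GPUs it will run on.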
+def initialize_run_diag_info(settings):
+ runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+ runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
+ runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
+ testNamesStr = settings['testNames']
+ if testNamesStr == '1':
+ runDiagInfo.validate = 1
+ elif testNamesStr == '2':
+ runDiagInfo.validate = 2
+ elif testNamesStr == '3':
+ runDiagInfo.validate = 3
+ else:
+        # Make sure no number other than 1-3 was submitted
+ if testNamesStr.isdigit():
+ raise ValueError("'%s' is not a valid test name" % testNamesStr)
+
+ # Copy to the testNames portion of the object
+ names = testNamesStr.split(',')
+ testIndex = 0
+ if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
+ err = 'Aborting DCGM Diag because %d test names were specified exceeding the limit of %d' %\
+ (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
+ raise ValueError(err)
+
+ for testName in names:
+ testNameIndex = 0
+ if len(testName) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
+ err = 'Aborting DCGM Diag because test name %s exceeds max length %d' % \
+ (testName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
+ raise ValueError(err)
+
+ for c in testName:
+ runDiagInfo.testNames[testIndex][testNameIndex] = c
+ testNameIndex = testNameIndex + 1
+
+ testIndex = testIndex + 1
+
+ if 'timePercentage' in settings:
+ setTestDurations(runDiagInfo, settings['timePercentage'])
+
+ activeGpuIds = []
+
+ first = True
+ for gpuObj in g_gpus:
+ if gpuObj.IsHealthy():
+ activeGpuIds.append(gpuObj.GetEntityId())
+ if first:
+ runDiagInfo.gpuList = str(gpuObj.GetEntityId())
+ first = False
+ else:
+ to_append = ',%s' % (str(gpuObj.GetEntityId()))
+ runDiagInfo.gpuList = runDiagInfo.gpuList + to_append
+
+ return runDiagInfo, activeGpuIds
+
+
+def mark_all_unhealthy(activeGpuIds, reason):
+ for gpuId in activeGpuIds:
+ mark_entity_unhealthy(g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH, reason)
+
+
+def result_to_str(result):
+ if result == dcgm_structs.DCGM_DIAG_RESULT_PASS:
+ return 'PASS'
+ elif result == dcgm_structs.DCGM_DIAG_RESULT_SKIP:
+ return 'SKIP'
+ elif result == dcgm_structs.DCGM_DIAG_RESULT_WARN:
+ return 'WARN'
+ elif result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
+ return 'FAIL'
+ else:
+ return 'NOT RUN'
+
+
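+# Returns True (and marks all active GPUs unhealthy) if any level-one software test failed.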
+def check_passive_health_checks(response, activeGpuIds):
+ unhealthy = False
+ for i in range(0, dcgm_structs.DCGM_SWTEST_COUNT):
+ if response.levelOneResults[
+ i].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
+ mark_all_unhealthy(activeGpuIds,
+ response.levelOneResults[i].error.msg)
+ unhealthy = True
+ break
+
+ return unhealthy
+
+
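+# Run the active DCGM diagnostic on the healthy GPUs and mark any GPU that fails a per-GPU test.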
+def check_gpu_diagnostic(handleObj, settings):
+ runDiagInfo, activeGpuIds = initialize_run_diag_info(settings)
+ if len(activeGpuIds) == 0:
+ return
+
+ response = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo)
+
+ sysError = response.systemError
+    if sysError.code != dcgm_errors.DCGM_FR_OK:
+ raise ValueError(sysError)
+
+    if not check_passive_health_checks(response, activeGpuIds):
+ for gpuIndex in range(response.gpuCount):
+ for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT_V8):
+ if response.perGpuResponses[gpuIndex].results[
+ testIndex].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
+ gpuId = response.perGpuResponses[gpuIndex].gpuId
+ mark_entity_unhealthy(
+ g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH,
+                        response.perGpuResponses[gpuIndex].results[
+                            testIndex].error.msg)
+
+ # NVVS marks all subsequent tests as failed so there's no point in continuing
+ break
+
+
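+# Make sure the desired health watches are set on the default group, then return the passive health check results.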
+def query_passive_health(handleObj, desired_watches):
+ dcgmGroup = handleObj.GetSystem().GetDefaultGroup()
+ watches = dcgmGroup.health.Get()
+
+ # Check for the correct watches to be set and set them if necessary
+ if watches != desired_watches:
+ dcgmGroup.health.Set(desired_watches)
+
+ return dcgmGroup.health.Check()
+
+
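+# Mark every GPU or NvSwitch that reported a failing incident in the passive health check response.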
+def denylist_from_passive_health_check(response):
+ for incidentIndex in range(response.incidentCount):
+ if response.incidents[
+ incidentIndex].health != dcgm_structs.DCGM_HEALTH_RESULT_FAIL:
+ # Only add to the denylist for failures; ignore warnings
+ continue
+
+ entityId = response.incidents[incidentIndex].entityInfo.entityId
+ entityGroupId = response.incidents[
+ incidentIndex].entityInfo.entityGroupId
+ errorString = response.incidents[incidentIndex].error.msg
+
+ if entityGroupId == dcgm_fields.DCGM_FE_GPU:
+ mark_entity_unhealthy(g_gpus, entityId, BR_ST_FAILED_PASSIVE_HEALTH,
+ errorString)
+ else:
+ mark_entity_unhealthy(g_switches, entityId,
+ BR_ST_FAILED_PASSIVE_HEALTH, errorString)
+
+
+def check_passive_health(handleObj, watches):
+ response = query_passive_health(handleObj, watches)
+
+ if response.overallHealth != dcgm_structs.DCGM_HEALTH_RESULT_PASS:
+ denylist_from_passive_health_check(response)
+
+
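+# Populate the global GPU and NvSwitch lists from the entities DCGM reports for this host.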
+def initialize_devices(handle, flags):
+ gpuIds = dcgm_agent.dcgmGetEntityGroupEntities(handle,
+ dcgm_fields.DCGM_FE_GPU,
+ flags)
+ switchIds = dcgm_agent.dcgmGetEntityGroupEntities(
+ handle, dcgm_fields.DCGM_FE_SWITCH, flags)
+
+    for gpuId in gpuIds:
+        attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)
+        gpuObj = Entity(gpuId,
+                        entityType=dcgm_fields.DCGM_FE_GPU,
+                        uuid=attributes.identifiers.uuid,
+                        bdf=attributes.identifiers.pciBusId)
+        g_gpus.append(gpuObj)
+
+    for switchId in switchIds:
+        switchObj = Entity(switchId, entityType=dcgm_fields.DCGM_FE_SWITCH)
+        g_switches.append(switchObj)
+
+
+# Process command line arguments
+def __process_command_line__(settings):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-g',
+ '--num-gpus',
+ dest='num_gpus',
+ type=int,
+ help='The expected number of GPUs.')
+ parser.add_argument('-s',
+ '--num-switches',
+ dest='num_switches',
+ type=int,
+ help='The expected number of NvSwitches.')
+ parser.add_argument(
+ '-n',
+ '--hostname',
+ dest='hostname',
+ type=str,
+ help='The hostname of the nv-hostengine we want to query.')
+ parser.add_argument(
+ '-d',
+ '--detect',
+ dest='detect',
+ action='store_true',
+ help='Run on whatever GPUs can be detected. Do not check counts.')
+ parser.add_argument(
+ '-l',
+ '--log-file',
+ dest='logfileName',
+ type=str,
+ help=
+ 'The name of the log file where details should be stored. Default is stdout'
+ )
+ parser.add_argument(
+ '-u',
+ '--unsupported-too',
+ dest='unsupported',
+ action='store_true',
+ help='Get unsupported devices in addition to the ones DCGM supports')
+ parser.add_argument('-f',
+ '--full-report',
+ dest='fullReport',
+ action='store_true',
+ help='Print a health status for each GPU')
+ parser.add_argument(
+ '-c',
+ '--csv',
+ dest='outfmtCSV',
+ action='store_true',
+ help='Write output in csv format. By default, output is in json format.'
+ )
+ parser.add_argument(
+ '-w',
+ '--watches',
+ dest='watches',
+ type=str,
+ help=
+ 'Specify which health watches to monitor. By default, all are watched. Any list of the following may be specified:\n\ta = All watches\n\tp = PCIE\n\tm = Memory\n\ti = Inforom\n\tt = Thermal and Power\n\tn = NVLINK'
+ )
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument(
+ '-r',
+ '--specified-test',
+ dest='testNames',
+ type=str,
+ help='Option to specify what tests are run in dcgmi diag.')
+ group.add_argument(
+ '-i',
+ '--instantaneous',
+ dest='instant',
+ action='store_true',
+ help='Specify to skip the longer tests and run instantaneously')
+ group.add_argument(
+ '-t',
+ '--time-limit',
+ dest='timeLimit',
+ type=int,
+ help=
+ 'The time limit in seconds that all the tests should not exceed. Diagnostics will be reduced in their time to meet this boundary.'
+ )
+
+ parser.set_defaults(instant=False, detect=False, fullReport=False)
+ args = parser.parse_args()
+
+ if args.num_gpus is not None and args.num_switches is not None:
+ settings['numGpus'] = args.num_gpus
+ settings['numSwitches'] = args.num_switches
+    elif not args.detect:
+ raise ValueError(
+ 'Must specify either a number of gpus and switches with -g and -s or auto-detect with -d'
+ )
+
+ if args.hostname:
+ settings['hostname'] = args.hostname
+ else:
+ settings['hostname'] = 'localhost'
+
+ if args.unsupported:
+ settings['entity_get_flags'] = 0
+ else:
+ settings[
+ 'entity_get_flags'] = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED
+
+ settings['instant'] = args.instant
+ settings['fullReport'] = args.fullReport
+
+ if args.testNames:
+ settings['testNames'] = args.testNames
+ else:
+ settings['testNames'] = '3'
+
+ if args.timeLimit:
+ settings['timePercentage'] = float(args.timeLimit) / 840.0
+
+ if args.logfileName:
+ logging.basicConfig(filename=args.logfileName)
+
+ if args.outfmtCSV:
+ settings['outfmtCSV'] = 1
+
+ if args.watches:
+ health_watches = 0
+ for c in args.watches:
+ if c == 'p':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_PCIE
+ elif c == 'm':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_MEM
+ elif c == 'i':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_INFOROM
+ elif c == 't':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_THERMAL
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_POWER
+ elif c == 'n':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_NVLINK
+ elif c == 'a':
+ health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_ALL
+ else:
+                print("Unrecognized character %s found in watch string '%s'" %
+                      (c, args.watches))
+ sys.exit(-1)
+ settings['watches'] = health_watches
+ else:
+ settings['watches'] = BR_HEALTH_WATCH_BITMAP
+
+
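+# Return a comma-separated string of the entity ids in the given list.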
+def get_entity_id_list(entities):
+ ids = ""
+ first = True
+ for entity in entities:
+ if first:
+ ids = str(entity.GetEntityId())
+ else:
+ ids += ",%d" % (entity.GetEntityId())
+ first = False
+
+ return ids
+
+
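+# Verify the expected device counts, then run the passive health check and, unless --instantaneous was given, the active diagnostic.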
+def check_health(handleObj, settings, error_list):
+ initialize_devices(handleObj.handle, settings['entity_get_flags'])
+
+ if 'numGpus' in settings:
+ if len(g_gpus) != settings['numGpus']:
+ error_list.append(
+ "%d GPUs were specified but only %d were detected with ids '%s'"
+ %
+ (settings['numGpus'], len(g_gpus), get_entity_id_list(g_gpus)))
+
+ if 'numSwitches' in settings:
+ if len(g_switches) != settings['numSwitches']:
+ error_list.append(
+ "%d switches were specified but only %d were detected with ids '%s'"
+ % (settings['numSwitches'], len(g_switches),
+ get_entity_id_list(g_switches)))
+
+ check_passive_health(handleObj, settings['watches']) # quick check
+
+    if not settings['instant']:
+ check_gpu_diagnostic(handleObj, settings)
+
+
+def process_command_line(settings):
+ try:
+ __process_command_line__(settings)
+ except ValueError as e:
+ return str(e)
+
+
+def main():
+ # Parse the command line
+ settings = {}
+ error_list = []
+
+ exitCode = 0
+ jsonTop = {}
+
+ error = process_command_line(settings)
+ if error:
+ # If we had an error processing the command line, don't attempt to check anything
+ error_list.append(error)
+ else:
+ try:
+ handleObj = pydcgm.DcgmHandle(None, settings['hostname'],
+ dcgm_structs.DCGM_OPERATION_MODE_AUTO)
+
+ check_health(handleObj, settings, error_list)
+ except dcgm_structs.DCGMError as e:
+ # Catch any exceptions from DCGM and add them to the error_list so they'll be printed as JSON
+ error_list.append(str(e))
+ except ValueError as e:
+ error_list.append(str(e))
+
+    if 'outfmtCSV' in settings:  # show all healthy GPUs, then all unhealthy ones
+        for gpuObj in g_gpus:
+            if gpuObj.IsHealthy():
+                print("healthy,%s,%s" % (gpuObj.GetBDF(), gpuObj.GetUUID()))
+        for gpuObj in g_gpus:
+            if not gpuObj.IsHealthy():
+ print("unhealthy,%s,%s,\"%s\"" %
+ (gpuObj.GetBDF(), gpuObj.GetUUID(),
+ gpuObj.WhyUnhealthy()))
+
+ else: # build obj that can be output in json
+ denylistGpus = {}
+ healthyGpus = {}
+ for gpuObj in g_gpus:
+            if not gpuObj.IsHealthy():
+ details = {}
+ details['UUID'] = gpuObj.GetUUID()
+ details['BDF'] = gpuObj.GetBDF()
+ details['Failure Explanation'] = gpuObj.WhyUnhealthy()
+ denylistGpus[gpuObj.GetEntityId()] = details
+ elif settings['fullReport']:
+ details = {}
+ details['UUID'] = gpuObj.GetUUID()
+ details['BDF'] = gpuObj.GetBDF()
+ healthyGpus[gpuObj.GetEntityId()] = details
+
+ jsonTop['denylistedGpus'] = denylistGpus
+ if settings['fullReport']:
+ jsonTop['Healthy GPUs'] = healthyGpus
+
+    if len(error_list):  # any accumulated errors mean a non-zero exit code
+        exitCode = 1
+    if 'outfmtCSV' in settings:  # csv output
+        if len(error_list):
+            for errObj in error_list:
+                print("errors,\"%s\"" % (errObj))
+    else:  # json output
+        jsonTop['errors'] = error_list
+
+    if 'outfmtCSV' not in settings:  # csv results were already printed above
+        print(json.dumps(jsonTop, indent=4, separators=(',', ': ')))
+
+ sys.exit(exitCode)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/model_analyzer/monitor/dcgm/pydcgm.py b/model_analyzer/monitor/dcgm/pydcgm.py
new file mode 100644
index 000000000..da6157471
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/pydcgm.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _python_version_check():
+    import sys
+    # Check the interpreter's numeric version info rather than comparing version strings
+    if sys.version_info[0] < 3:
+        print(
+            '[ERROR] Detected Python version {}. These bindings are for Python 3.5+. Please load the Python 2 bindings found at /usr/local/dcgm/bindings'
+            .format(sys.version.split(None, 1)[0]))
+        sys.exit(1)
+
+
+_python_version_check()
+
+# Bring classes into this namespace
+from model_analyzer.monitor.dcgm.DcgmHandle import *
+from model_analyzer.monitor.dcgm.DcgmGroup import *
+from model_analyzer.monitor.dcgm.DcgmStatus import *
+from model_analyzer.monitor.dcgm.DcgmSystem import *
+from model_analyzer.monitor.dcgm.DcgmFieldGroup import *
+
+import os
+if os.environ.get('__DCGM_TESTING_FRAMEWORK_ACTIVE') == '1':
+ import utils
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+ dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
+
+
+class DcgmException(Exception):
+    '''
+    A distinct exception type raised by these bindings so that callers can
+    distinguish DCGM exceptions from standard Python ones.
+    '''
+    pass