diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ab70f9450..d334e556f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,6 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +exclude: monitor/dcgm/ repos: - repo: https://github.com/timothycrosley/isort rev: 5.12.0 diff --git a/Dockerfile b/Dockerfile index 93fff3753..802cb93b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,7 +27,7 @@ ARG BASE_IMAGE ARG TRITONSDK_BASE_IMAGE # DCGM version to install for Model Analyzer -ENV DCGM_VERSION=2.4.7 +ENV DCGM_VERSION=3.2.6 # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive diff --git a/model_analyzer/device/gpu_device_factory.py b/model_analyzer/device/gpu_device_factory.py index f28e36b3e..03f76115f 100755 --- a/model_analyzer/device/gpu_device_factory.py +++ b/model_analyzer/device/gpu_device_factory.py @@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None): device_atrributes = dcgm_agent.dcgmGetDeviceAttributes( dcgm_handle, device_id ).identifiers - pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper() - device_uuid = str(device_atrributes.uuid, encoding="utf-8") - device_name = str(device_atrributes.deviceName, encoding="utf-8") + pci_bus_id = device_atrributes.pciBusId + device_uuid = device_atrributes.uuid + device_name = device_atrributes.deviceName + gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid) self._devices.append(gpu_device) diff --git a/model_analyzer/monitor/dcgm/DcgmDiag.py b/model_analyzer/monitor/dcgm/DcgmDiag.py new file mode 100644 index 000000000..e9178895c --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmDiag.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class DcgmDiag:
    """Builder for a DCGM diagnostic run request.

    Wraps a ``dcgm_structs`` run-diag structure (``self.runDiagInfo``) and
    offers setters that validate their input before writing it into the
    structure. Call ``Execute(handle)`` to submit the request.
    """

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    # testNamesStr values that select a numbered diagnostic level instead of
    # naming individual tests. The empty string defaults to a level 1 test.
    _validateLevels = {'': 1, '1': 1, '2': 2, '3': 3, '4': 4}

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        """Populate the run-diag structure.

        gpuIds       - optional iterable of GPU ids; stored comma-separated
                       in runDiagInfo.gpuList
        testNamesStr - '' or '1'..'4' to pick a diagnostic level, otherwise a
                       comma-separated list of test names
        paramsStr    - optional ';'-separated list of test parameters
        verbose      - passed to SetVerbose()
        version      - run-diag version code; must be a key of _versionMap

        Raises ValueError for an unknown version, a numeric test name other
        than 1-4, or too many / too long names or parameters.
        """
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr in DcgmDiag._validateLevels:
            self.runDiagInfo.validate = DcgmDiag._validateLevels[testNamesStr]
        else:
            # Make sure no number other that 1-4 were submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                      (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                      (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            # Single assignment instead of re-reading and re-writing the
            # structure field once per GPU.
            self.runDiagInfo.gpuList = ",".join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        """Set (val == True) or clear the verbose run flag."""
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        """Copy the current gpuList into fakeGpuList."""
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        """Return the underlying run-diag structure."""
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        """Append one test parameter string to runDiagInfo.testParms.

        Raises ValueError if the string does not fit in
        DCGM_MAX_TEST_PARMS_LEN.
        """
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                  (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        # The structure stores raw character codes, hence ord().
        for index, c in enumerate(parameterStr):
            self.runDiagInfo.testParms[self.numParams][index] = ord(c)

        self.numParams += 1

    def AddTest(self, testNameStr):
        """Append one test name to runDiagInfo.testNames.

        Raises ValueError if the name does not fit in
        DCGM_MAX_TEST_NAMES_LEN.
        """
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                  (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        # The structure stores raw character codes, hence ord().
        for index, c in enumerate(testNameStr):
            self.runDiagInfo.testNames[self.numTests][index] = ord(c)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        """Set (val == True) or clear the stats-on-fail run flag.

        Any other value now clears the flag, matching SetVerbose() and
        SetFailEarly(); previously it was silently ignored, so the flag could
        never be turned off again.
        """
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        """Store str(value) as the throttle mask.

        Raises ValueError if the diag version is below 3 or a string value
        does not fit in DCGM_THROTTLE_MASK_LEN.
        """
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        """Enable or disable the fail-early run flag.

        checkInterval is written to runDiagInfo.failCheckInterval only when
        enabling. Raises ValueError if the diag version is below 5 or
        checkInterval is not an int.
        """
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            raise ValueError("Invalid checkInterval value: %s" % checkInterval)

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        """Submit the request via dcgm_agent.dcgmActionValidate_v2 and return its result."""
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        """Set runDiagInfo.statsPath; raises ValueError if it does not fit in DCGM_PATH_LEN."""
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                  (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        """Set runDiagInfo.configFileContents; raises ValueError if it does not fit in DCGM_MAX_CONFIG_FILE_LEN."""
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                  % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        """Set runDiagInfo.debugLogFile; raises ValueError if it does not fit in DCGM_FILE_LEN."""
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        """Set runDiagInfo.debugLevel; raises ValueError unless 0 <= debugLevel <= 5."""
        if debugLevel < 0 or debugLevel > 5:
            # The message used to be raised with its %d placeholder
            # unformatted; interpolate the offending value.
            raise ValueError(
                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
                % debugLevel)

        self.runDiagInfo.debugLevel = debugLevel
This must be unique + fieldIds - Fields that are part of this group + fieldGroupId - If provided, this is used to initialize the object from an existing field group ID + ''' + + def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None): + fieldIds = fieldIds or [] + self.name = name + self.fieldIds = fieldIds + self._dcgmHandle = dcgmHandle + self.wasCreated = False + + #If the user passed in an ID, the field group already exists. Fetch live info + if fieldGroupId is not None: + self.fieldGroupId = fieldGroupId + fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo( + self._dcgmHandle.handle, self.fieldGroupId) + self.name = fieldGroupInfo.fieldGroupName + self.fieldIds = fieldGroupInfo.fieldIds + else: + self.fieldGroupId = None #Assign here so the destructor doesn't fail if the call below fails + self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate( + self._dcgmHandle.handle, fieldIds, name) + self.wasCreated = True + + ''' + Remove this field group from DCGM. This object can no longer be passed to other APIs after this call. + ''' + + def Delete(self): + if self.wasCreated and self.fieldGroupId is not None: + try: + try: + dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle, + self.fieldGroupId) + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_NO_DATA): + # someone may have deleted the group under us. That's ok. + pass + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + # We lost our connection, but we're destructing this object anyway. 
+ pass + except AttributeError as ae: + # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll + # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'" Ignore this + pass + except TypeError as te: + # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we might + # get a TypeError: "'NoneType' object is not callable'" Ignore this + pass + self.fieldGroupId = None + self._dcgmHandle = None + + #Destructor + def __del__(self): + self.Delete() diff --git a/model_analyzer/monitor/dcgm/DcgmGroup.py b/model_analyzer/monitor/dcgm/DcgmGroup.py new file mode 100644 index 000000000..834e102db --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmGroup.py @@ -0,0 +1,815 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers +from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle + + +class DcgmGroupConfig: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Set configuration for this group + + config should be an instance of dcgm_structs.c_dcgmDeviceConfig_v1 + + Will throw an exception on error + ''' + + def Set(self, config): + status = pydcgm.DcgmStatus() + ret = dcgm_structs.DCGM_ST_OK + + try: + ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle, + self._groupId, config, status.handle) + except dcgm_structs.DCGMError as e: + pass + + #Throw specific errors before return error + status.ThrowExceptionOnErrors() + #Throw an appropriate exception on error + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get configuration for this group + + configType is a DCGM_CONFIG_? 
class DcgmGroupSamples:
    """Field-sampling operations (watch / unwatch / fetch values) for one DCGM group."""

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """Store the handle wrapper, the group id and the owning group object."""
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def WatchFields(self, fieldGroup, updateFreq, maxKeepAge, maxKeepSamples):
        """Tell DCGM to start recording samples for the given field group.

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
        updateFreq: How often to update these fields in usec
        maxKeepAge: How long to keep data for these fields in seconds
        maxKeepSamples: Maximum number of samples to keep per field. 0=no limit

        Once the field collection is watched, it will update whenever the next
        update loop occurs. If you want to query these values immediately, use
        handle.UpdateAllFields(True) to make sure that the fields have updated
        at least once.
        """
        ret = dcgm_agent.dcgmWatchFields(self._dcgmHandle.handle, self._groupId,
                                         fieldGroup.fieldGroupId, updateFreq,
                                         maxKeepAge, maxKeepSamples)
        dcgm_structs._dcgmCheckReturn(ret)

    def UnwatchFields(self, fieldGroup):
        """Tell DCGM to stop recording samples for a given field group.

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to unwatch.
        """
        ret = dcgm_agent.dcgmUnwatchFields(self._dcgmHandle.handle,
                                           self._groupId,
                                           fieldGroup.fieldGroupId)
        dcgm_structs._dcgmCheckReturn(ret)

    def GetLatest(self, fieldGroup):
        """Get the most recent values for each field in a field collection.

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its
        .values[gpuId][fieldId][0].value to access values
        """
        dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetLatest_v2(self, fieldGroup):
        """Get the most recent values for each field in a field collection.

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its
        .values[entityGroupId][entityId][fieldId][0].value to access values
        """
        dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvec.GetLatestValues(fieldGroup)
        return dfvec

    def GetAllSinceLastCall(self, dfvc, fieldGroup):
        """Get the new values for each field since the last collection.

        dfvc: DcgmFieldValueCollection() instance. Pass None for the first
              call to get one for subsequent calls. On subsequent calls, pass
              what was returned.
        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its
        .values[gpuId][fieldId][*].value to access values
        """
        if dfvc is None:
            dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
                self._dcgmHandle.handle, self._groupId)
            dfvc.GetLatestValues(fieldGroup)
        else:
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            dfvc.GetAllSinceLastCall(fieldGroup)
            if len(dfvc.values) == 0:
                dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetAllSinceLastCall_v2(self, dvfec, fieldGroup):
        """Get more values for each field in a field entity collection.

        dvfec: DcgmFieldValueEntityCollection() instance. Pass None for the
               first call to get one for subsequent calls. On subsequent
               calls, pass what was returned.
        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its
        .values[entityGroupId][entityId][fieldId][*].value to access values
        """
        # The parameter keeps its (misspelled) public name so keyword callers
        # are unaffected. The body used to read a differently spelled local,
        # which raised UnboundLocalError on every call.
        dfvec = dvfec
        if dfvec is None:
            dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
                self._dcgmHandle.handle, self._groupId)
            # Was spelled "GetLastestValues_v2"; now matches the call in the
            # else-branch below.
            # NOTE(review): confirm the collection class exposes the _v2
            # method names (GetLatest_v2 above calls GetLatestValues).
            dfvec.GetLatestValues_v2(fieldGroup)
        else:
            dfvec.GetAllSinceLastCall_v2(fieldGroup)
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            if len(dfvec.values) == 0:
                dfvec.GetLatestValues_v2(fieldGroup)

        return dfvec

    def UpdateAllFields(self, waitForUpdate):
        """Convenience alias for DcgmHandle.UpdateAllFields().

        All fields on the system will be updated, not just this group's.
        """
        self._dcgmHandle.UpdateAllFields(waitForUpdate)
class DcgmGroupPolicy:
    """Violation-policy operations (get / set / register / trigger) for one DCGM group."""

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """Store the handle wrapper, the group id and the owning group object."""
        self._dcgmGroup = dcgmGroup
        self._groupId = groupId
        self._dcgmHandle = dcgmHandle

    def Get(self, statusHandle=None):
        """Get the current violation policy inside the policy manager.

        statusHandle: pydcgm.DcgmStatus for the resulting status of the
                      operation. Pass None if the detailed error information
                      is not needed (default).

        Returns a list of dcgm_structs.c_dcgmPolicy_v1 with the same length
        as the number of GPUs in the group. The index of an entry corresponds
        to a given GPU ID in the group. Throws an exception on error, and a
        pydcgm.DcgmException when the group has no GPUs.
        """
        # Unwrap the status object to its raw handle; falsy values pass through.
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        gpuCount = len(self._dcgmGroup.GetGpuIds())
        if gpuCount <= 0:
            raise pydcgm.DcgmException(
                "This group has no GPUs, cannot retrieve policies")
        return dcgm_agent.dcgmPolicyGet(self._dcgmHandle.handle, self._groupId,
                                        gpuCount, rawStatus)

    def Set(self, policy, statusHandle=None):
        """Set the current violation policy inside the policy manager.

        Given the conditions within "policy", if a violation has occurred,
        subsequent action(s) may be performed to either report or contain the
        failure.

        This API is only supported on Tesla GPUs and will throw
        DCGMError_NotSupported if called on non-Tesla GPUs.

        policy: dcgm_structs.c_dcgmPolicy_v1 that will be applied to all GPUs
                in the group
        statusHandle: pydcgm.DcgmStatus for the resulting status for the
                      operation. Pass None if the detailed error information
                      is not needed (default).

        Returns Nothing. Throws an exception on error
        """
        # Unwrap the status object to its raw handle; falsy values pass through.
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        dcgm_agent.dcgmPolicySet(self._dcgmHandle.handle, self._groupId, policy,
                                 rawStatus)

    def Register(self, condition, beginCallback=None, finishCallback=None):
        """Register callbacks for a policy condition violation.

        The callback(s) will be called automatically when in
        DCGM_OPERATION_MODE_AUTO mode and only after DcgmPolicy.Trigger when
        in DCGM_OPERATION_MODE_MANUAL mode. All callbacks are made within a
        separate thread.

        This API is only supported on Tesla GPUs and will throw
        DCGMError_NotSupported if called on non-Tesla GPUs.

        condition: The set of conditions specified as an OR'd list
                   (see dcgm_structs.DCGM_POLICY_COND_*) for which to
                   register a callback function
        beginCallback: Called should a violation occur, prior to any actions
                       specified by the policy being taken.
        finishCallback: Called should a violation occur, after any action
                        specified by the policy is completed.

        At least one callback must be provided that is not None.

        Returns Nothing. Throws an exception on error.
        """
        if all(cb is None for cb in (beginCallback, finishCallback)):
            raise pydcgm.DcgmException(
                "At least 1 callback must be provided to register that is not None"
            )
        dcgm_agent.dcgmPolicyRegister(self._dcgmHandle.handle, self._groupId,
                                      condition, beginCallback, finishCallback)

    def Unregister(self, condition):
        """Unregister all callbacks for a given policy condition.

        condition: The set of conditions specified as an OR'd list
                   (see dcgm_structs.DCGM_POLICY_COND_*) for which to
                   unregister a callback function

        Returns Nothing. Throws an exception on error.
        """
        dcgm_agent.dcgmPolicyUnregister(self._dcgmHandle.handle, self._groupId,
                                        condition)

    def Trigger(self):
        """Ask the policy manager loop to perform an iteration.

        Triggers the callbacks of any registered functions. Callback
        functions will be called from a separate thread as the calling
        function.

        Note: The GPU monitoring and management agent must call this method
        periodically if the operation mode is set to manual mode
        (DCGM_OPERATION_MODE_MANUAL) during initialization
        (see DcgmHandle.__init__).

        Returns Nothing. Throws an exception if there is a generic error that
        the policy manager was unable to perform another iteration.
        """
        dcgm_agent.dcgmPolicyTrigger(self._dcgmHandle.handle)
If you want to query these values immediately, use + handle.UpdateAllFields(True) to make sure that the fields have updated at least once. + ''' + + def WatchPidFields(self, updateFreq, maxKeepAge, maxKeepSamples): + ret = dcgm_agent.dcgmWatchPidFields(self._dcgmHandle.handle, + self._groupId, updateFreq, + maxKeepAge, maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get process stats for a given PID on this GPU group + + You must call WatchPidFields() before this query for this method to return any results + + Returns a dcgm_structs.c_dcgmPidInfo_v2 structure + ''' + + def GetPidInfo(self, pid): + return dcgm_agent.dcgmGetPidInfo(self._dcgmHandle.handle, self._groupId, + pid) + + ''' + Tell DCGM to start recording samples for fields returned from GetJobStats() + + updateFreq: How often to update these fields in usec + maxKeepAge: How long to keep data for these fields in seconds + maxKeepSamples: Maximum number of samples to keep per field. 0=no limit + + Once the fields are watched, they will update whenever the next update + loop occurs. If you want to query these values immediately, use + handle.UpdateAllFields(True) to make sure that the fields have updated at least once. + ''' + + def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples): + ret = dcgm_agent.dcgmWatchJobFields(self._dcgmHandle.handle, + self._groupId, updateFreq, + maxKeepAge, maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Start collecting stats for a named job for this GPU group + + Calling this will tell DCGM to start tracking stats for the given jobId. Stats tracking + will end when StopJobStats() is called + + You must call WatchJobFields() before this call to tell DCGM to start sampling the fields + that are returned from GetJobStats(). + + jobId is a unique string identifier for this job. 
An exception will be thrown if this is not unique + + Returns Nothing (Will throw exception on error) + ''' + + def StartJobStats(self, jobId): + ret = dcgm_agent.dcgmJobStartStats(self._dcgmHandle.handle, + self._groupId, jobId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Stop collecting stats for a named job + + Calling this will tell DCGM to stop collecting stats for a job that was previously started + with StartJobStats(). + + jobId is the unique string that was passed as jobId to StartJobStats. + + Returns Nothing (Will throw exception on error) + ''' + + def StopJobStats(self, jobId): + ret = dcgm_agent.dcgmJobStopStats(self._dcgmHandle.handle, jobId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get stats for a job that was started with StartJobStats. If StopJobStats has not been called yet, + this will get stats from when the job started until now. If StopJob was called prior to + this, the returned Stats will go from when StartJobStats was called to when StopJobStats was called. + + jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats + + Returns a dcgm_structs.c_dcgmJobInfo_v3 structure. Throws an exception on error + ''' + + def GetJobStats(self, jobId): + ret = dcgm_agent.dcgmJobGetStats(self._dcgmHandle.handle, jobId) + return ret + + ''' + This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer + be able to call GetJobStats() on this jobId. However, you will be able to reuse jobId after + this call. + + jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats + + Returns Nothing (Will throw exception on error) + ''' + + def RemoveJob(self, jobId): + ret = dcgm_agent.dcgmJobRemove(self._dcgmHandle.handle, jobId) + return ret + + ''' + This API tells DCGM to stop tracking all jobs. After this call, you will no longer + be able to call dcgmJobGetStats() any jobs until you call StartJobStats() again. 
class DcgmGroupAction:
    """Validation and diagnostic operations for one DCGM group."""

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """Store the handle wrapper, the group id and the owning group object."""
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def Validate(self, validate):
        """Perform a manual validation of this group of GPUs.

        validate is what sort of validation to do. See
        dcgm_structs.DCGM_POLICY_VALID_* defines.

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
        """
        runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
        runDiagInfo.validate = validate
        runDiagInfo.groupId = self._groupId

        ret = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                               runDiagInfo)
        return ret

    def RunDiagnostic(self, diagLevel):
        """Run a diagnostic on this group of GPUs.

        diagLevel is the level of diagnostic desired. See
        dcgm_structs.DCGM_DIAG_LVL_* constants.

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
        """
        ret = dcgm_agent.dcgmRunDiagnostic(self._dcgmHandle.handle,
                                           self._groupId, diagLevel)
        return ret

    def RunSpecificTest(self, testName):
        """Run a specific diagnostic test on this group of GPUs.

        testName is the name of the specific test that should be invoked
        (str or bytes).

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance.
        Raises ValueError if testName does not fit in
        DCGM_MAX_TEST_NAMES_LEN (same limit DcgmDiag.AddTest enforces).
        """
        if len(testName) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            raise ValueError(
                "Cannot run test '%s' because its name exceeds max length %d." %
                (testName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN))

        runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
        for i, ch in enumerate(testName):
            # A ctypes char element accepts an int or a 1-byte bytes object
            # but not a 1-character str, so convert str characters with
            # ord(). Iterating bytes already yields ints.
            runDiagInfo.testNames[0][i] = ch if isinstance(ch, int) else ord(ch)
        runDiagInfo.groupId = self._groupId
        runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_NONE
        response = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                                    runDiagInfo)
        return response
def __init__(self,
             dcgmHandle,
             groupId=None,
             groupName=None,
             groupType=dcgm_structs.DCGM_GROUP_EMPTY):
    '''
    Constructor.

    Either groupId OR groupName must be provided as a parameter.

    dcgmHandle is the object whose .handle is passed to dcgm_agent
    groupId is an existing group id to bind to. When it is given it is
            adopted as-is and no group is created.
    groupName is the name of the group to create via
              dcgm_agent.dcgmGroupCreate() when groupId is None.
    groupType is the type of group to create. See dcgm_structs.DCGM_GROUP_? constants.

    Raises pydcgm.DcgmException when neither groupId nor groupName is given.
    '''
    self._dcgmHandle = dcgmHandle

    if groupId is None:
        if groupName is None:
            raise pydcgm.DcgmException(
                "Either groupId or groupName is required")
        groupId = dcgm_agent.dcgmGroupCreate(self._dcgmHandle.handle,
                                             groupType, groupName)
    self._groupId = groupId

    # Create namespace classes: one helper object per attribute, built in
    # this order, each receiving (handle, group id, this group).
    namespaceTable = (
        ("config", DcgmGroupConfig),
        ("samples", DcgmGroupSamples),
        ("health", DcgmGroupHealth),
        ("policy", DcgmGroupPolicy),
        ("discovery", DcgmGroupDiscovery),
        ("stats", DcgmGroupStats),
        ("action", DcgmGroupAction),
        ("profiling", DcgmGroupProfiling),
    )
    for attrName, namespaceClass in namespaceTable:
        setattr(self, attrName,
                namespaceClass(self._dcgmHandle, self._groupId, self))
def Delete(self):
    '''
    Remove this group from DCGM. This object will no longer be valid after this call.

    Each namespace attribute is deleted and reset to None. The group is
    then destroyed through dcgm_agent unless it has no id or is one of the
    predefined groups, and the stored group id is cleared.
    '''
    namespaceAttrs = ("config", "samples", "health", "policy", "discovery",
                      "stats", "action", "profiling")
    for attrName in namespaceAttrs:
        delattr(self, attrName)
        setattr(self, attrName, None)

    #Delete the group we created if we're not using the special all-GPU group
    ownsGroup = self._groupId is not None and not self._IsGroupIdStatic()
    if ownsGroup:
        status = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
                                             self._groupId)
        dcgm_structs._dcgmCheckReturn(status)

    self._groupId = None


def _IsGroupIdStatic(self):
    '''
    Private method to determine if our groupId is a predefined one

    Returns True when the id equals DCGM_GROUP_ALL_GPUS or
    DCGM_GROUP_ALL_NVSWITCHES, False otherwise.
    '''
    predefinedIds = (dcgm_structs.DCGM_GROUP_ALL_GPUS,
                     dcgm_structs.DCGM_GROUP_ALL_NVSWITCHES)
    return any(self._groupId == predefined for predefined in predefinedIds)


def AddGpu(self, gpuId):
    '''
    Add a GPU to this group

    gpuId is the GPU ID to add to our group

    Returns Nothing. Throws an exception on error
    '''
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't add a GPU to a static group")

    status = dcgm_agent.dcgmGroupAddDevice(self._dcgmHandle.handle,
                                           self._groupId, gpuId)
    dcgm_structs._dcgmCheckReturn(status)


def AddEntity(self, entityGroupId, entityId):
    '''
    Add an entity to this group

    entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
    entityId is the entity to add to this group

    Returns Nothing. Throws an exception on error
    '''
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't add an entity to a static group")

    status = dcgm_agent.dcgmGroupAddEntity(self._dcgmHandle.handle,
                                           self._groupId, entityGroupId,
                                           entityId)
    dcgm_structs._dcgmCheckReturn(status)
def RemoveGpu(self, gpuId):
    '''
    Remove a GPU from this group

    gpuId is the GPU ID to remove from our group

    Returns Nothing. Throws an exception on error
    '''
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't remove a GPU from a static group")

    status = dcgm_agent.dcgmGroupRemoveDevice(self._dcgmHandle.handle,
                                              self._groupId, gpuId)
    dcgm_structs._dcgmCheckReturn(status)


def RemoveEntity(self, entityGroupId, entityId):
    '''
    Remove an entity from this group

    entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
    entityId is the entity to remove from this group

    Returns Nothing. Throws an exception on error
    '''
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException(
            "Can't remove an entity from a static group")

    status = dcgm_agent.dcgmGroupRemoveEntity(self._dcgmHandle.handle,
                                              self._groupId, entityGroupId,
                                              entityId)
    dcgm_structs._dcgmCheckReturn(status)


def GetGpuIds(self):
    '''
    Get an array of GPU ids that are part of this group

    Note: this ignores non-GPU members of the group

    Returns a list of GPU ids. Throws an exception on error
    '''
    info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
                                       self._groupId)
    # Only the first info.count entries of entityList are read.
    members = (info.entityList[slot] for slot in range(info.count))
    return [
        member.entityId
        for member in members
        if member.entityGroupId == dcgm_fields.DCGM_FE_GPU
    ]
class DcgmHandle:
    '''
    Class to encapsulate a handle to DCGM and global methods to control and
    query the host engine
    '''

    def __init__(self,
                 handle=None,
                 ipAddress=None,
                 opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO,
                 persistAfterDisconnect=False,
                 unixSocketPath=None,
                 timeoutMs=0):
        '''
        Constructor

        handle is an existing handle from dcgmInit(). Pass None if you want this object to handle DCGM initialization for you
        ipAddress is the host to connect to. None = start embedded host engine
        opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the host engine should run (embedded mode only)
        persistAfterDisconnect (TCP-IP connections only) is whether the host engine should persist all of our watches
                               after we disconnect. 1=persist our watches. 0=clean up after our connection
        unixSocketPath is a path on the local filesystem to a unix socket that the host engine is listening on.
                       This option is mutually exclusive with ipAddress
        timeoutMs is how long to wait for TCP/IP or Unix domain connections to establish in ms. 0=Default timeout (5000ms)

        Raises the dcgm_structs exception class for DCGM_ST_BADPARAM when
        both ipAddress and unixSocketPath are given.
        '''
        self._handleCreated = False
        self._persistAfterDisconnect = persistAfterDisconnect
        # Recorded before the early return below so that GetIpAddress() also
        # works when this object merely wraps a caller-supplied handle.
        self._ipAddress = ipAddress

        if handle is not None:
            # Borrowed handle: nothing is initialized here and, because
            # _handleCreated stays False, nothing is torn down later.
            self.handle = handle
            return

        #Can't provide both unix socket and ip address
        if ipAddress is not None and unixSocketPath is not None:
            raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

        #Initialize the DCGM client library
        dcgm_structs._dcgmInit()
        dcgm_agent.dcgmInit()  #Not harmful to call this multiple times in a process

        #If neither ipAddress nor unixSocketPath are present, start an embedded host engine
        if ipAddress is None and unixSocketPath is None:
            self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
            self.isEmbedded = True
            self._handleCreated = True
            return

        #Set up connection parameters. We're connecting to something
        connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
        connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
        connectParams.timeoutMs = timeoutMs
        connectParams.persistAfterDisconnect = (
            1 if self._persistAfterDisconnect else 0)

        if ipAddress is not None:
            connectToAddress = ipAddress
            connectParams.addressIsUnixSocket = 0
        else:
            connectToAddress = unixSocketPath
            connectParams.addressIsUnixSocket = 1

        self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
        self.isEmbedded = False
        self._handleCreated = True

    def __del__(self):
        '''
        Destructor. Shuts the connection down only if this object created it.
        '''
        if self._handleCreated:
            self.Shutdown()

    def GetSystem(self):
        '''
        Get a DcgmSystem instance for this handle
        '''
        return pydcgm.DcgmSystem(self)

    def __StopDcgm__(self):
        '''
        Shuts down either the hostengine or the embedded server
        '''
        if self.isEmbedded:
            dcgm_agent.dcgmStopEmbedded(self.handle)
        else:
            dcgm_agent.dcgmDisconnect(self.handle)

    def Shutdown(self):
        '''
        Shutdown DCGM hostengine

        Does nothing when this object did not create the handle (or has
        already shut it down). Afterwards self.handle is None.
        '''
        if not self._handleCreated:
            return

        try:
            self.__StopDcgm__()
        except AttributeError:
            # Due to multi-threading, sometimes this is called after the modules have been unloaded, making
            # dcgm_agent effectively NoneType and resulting in this error being thrown.
            pass

        self._handleCreated = False
        self.handle = None

    @staticmethod
    def Unload():
        '''
        Unload DCGM, removing any memory it is pointing at. Use this if you really
        want DCGM gone from your process. Shutdown() only closes the connection/embedded host engine
        that was created in __init__().
        '''
        dcgm_agent.dcgmShutdown()

    def GetIpAddress(self):
        '''
        Returns the IP address associated with this handle. None=embedded connection
        '''
        return self._ipAddress
def CustomDataHandler(self, fvs):
    '''
    Convert the newest sample of every field of every GPU to JSON.

    fvs maps gpuId -> {key -> object with a .values list}. Only the last
    element of each .values list is used; it supplies .fieldId and .value.
    One JSON document per GPU is handed to CustomJsonHandler().
    '''
    for gpuId in list(fvs.keys()):
        # We don't need the keys because each value has a `fieldId`
        latestByTag = {}
        for series in list(fvs[gpuId].values()):
            # We only want the last measurement
            newest = series.values[-1]
            tag = self.ConvertFieldIdToTag(newest.fieldId)
            latestByTag[tag] = newest.value

        self.CustomJsonHandler(self.PrepareJson(gpuId, latestByTag))


def CustomJsonHandler(self, outJson):
    '''
    This method should be overridden by subclasses to handle the JSON objects
    received. The default logs a warning and then the JSON text at info level.
    '''
    logging.warning('CustomJsonHandler has not been overriden')
    logging.info(outJson)
def entity_group_id_to_string(entityGroupId):
    '''
    Return the display label for a dcgm_fields.DCGM_FE_* entity group id.

    Unrecognized ids yield an empty string.
    '''
    # Checked in order; the first equal id wins.
    labelTable = (
        (dcgm_fields.DCGM_FE_GPU, 'GPU'),
        (dcgm_fields.DCGM_FE_VGPU, 'VGPU'),
        (dcgm_fields.DCGM_FE_SWITCH, 'NVSWITCH'),
        (dcgm_fields.DCGM_FE_GPU_I, 'GPU INSTANCE'),
        (dcgm_fields.DCGM_FE_GPU_CI, 'COMPUTE INSTANCE'),
        (dcgm_fields.DCGM_FE_LINK, 'LINK'),
    )
    for knownId, label in labelTable:
        if entityGroupId == knownId:
            return label
    return ''
def CustomDataHandler_v2(self, fvs):
    '''
    This function can be implemented as a callback in the class that inherits from DcgmReader
    to handle all of the data queried from DCGM.
    By default, it passes the newest non-blank value of every non-ignored
    field to CustomFieldHandler_v2().
    @params:
    fvs : Data in the format entityGroupId -> entityId -> values (dictionary of dictionaries)
    '''
    for entityGroupId in list(fvs.keys()):
        perEntity = fvs[entityGroupId]

        for entityId in list(perEntity.keys()):
            perField = perEntity[entityId]

            for fieldId in list(perField.keys()):
                if fieldId not in self.m_dcgmIgnoreFields:
                    newest = perField[fieldId][-1]
                    if not newest.isBlank:
                        tag = self.m_fieldIdToInfo[fieldId].tag
                        self.CustomFieldHandler_v2(entityGroupId, entityId,
                                                   fieldId, tag, newest)


def CustomDataHandler(self, fvs):
    '''
    This function can be implemented as a callback in the class that inherits from DcgmReader
    to handle all of the data queried from DCGM.
    By default, it passes the newest non-blank value of every non-ignored
    field to CustomFieldHandler().
    @params:
    fvs : Dictionary with gpuID as key and values as Value
    '''
    for gpuId in list(fvs.keys()):
        perField = fvs[gpuId]

        for fieldId in list(perField.keys()):
            if fieldId not in self.m_dcgmIgnoreFields:
                newest = perField[fieldId][-1]
                if not newest.isBlank:
                    tag = self.m_fieldIdToInfo[fieldId].tag
                    self.CustomFieldHandler(gpuId, fieldId, tag, newest)


def SetupGpuIdUUIdMappings(self):
    '''
    Populate the m_gpuIdToUUId map

    Entries are added for every GPU id of the current group; the map is
    not cleared first.
    '''
    for gpuId in self.m_dcgmGroup.GetGpuIds():
        attributes = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
        self.m_gpuIdToUUId[gpuId] = attributes.identifiers.uuid
def __init__(self,
             hostname='localhost',
             fieldIds=None,
             updateFrequency=10000000,
             maxKeepAge=3600.0,
             ignoreList=None,
             fieldGroupName='dcgm_fieldgroupData',
             gpuIds=None,
             entities=None,
             fieldIntervalMap=None):
    '''
    Constructor
    @params:
    hostname        : Address:port of the host to connect. Defaults to localhost
    fieldIds        : List of the field ids to publish. If it isn't specified, our default list is used.
    updateFrequency : Frequency of update in microseconds. Defaults to 10 seconds or 10000000 microseconds
    maxKeepAge      : Max time to keep data, in seconds. Default is 3600.0 (1 hour)
    ignoreList      : List of the field ids we want to query but not publish.
    fieldGroupName  : Base name for the DCGM field groups this reader creates.
    gpuIds          : List of GPU IDs to monitor. If not provided, DcgmReader will monitor all GPUs on the system
    entities        : Entities to monitor; each is read for .entityGroupId and .entityId.
    fieldIntervalMap: Map of intervals to list of field numbers to monitor. Takes precedence over fieldIds and updateFrequency if not None.

    No connection is made here; only state is initialized.
    '''
    # Copy the module-level default so this instance never shares (and can
    # never accidentally mutate) the global defaultFieldIds list.
    fieldIds = fieldIds or list(defaultFieldIds)
    ignoreList = ignoreList or []
    self.m_dcgmHostName = hostname
    self.m_updateFreq = updateFrequency  # default / redundant

    self.m_fieldGroupName = fieldGroupName
    self.m_publishFields = {}

    if fieldIntervalMap is not None:
        self.m_publishFields = fieldIntervalMap
    else:
        self.m_publishFields[self.m_updateFreq] = fieldIds

    self.m_requestedGpuIds = gpuIds
    self.m_requestedEntities = entities

    self.m_dcgmIgnoreFields = ignoreList  #Fields not to publish
    self.m_maxKeepAge = maxKeepAge
    self.m_dcgmHandle = None
    self.m_dcgmSystem = None
    self.m_dcgmGroup = None
    self.m_closeHandle = False  # True only once this reader opens its own handle

    self.m_gpuIdToBusId = {}  #GpuID => PCI-E busId string
    self.m_gpuIdToUUId = {}  #GpuID => GPU UUID
    self.m_fieldIdToInfo = {}  #FieldId => dcgm_fields.dcgm_field_meta_t
    self.m_lock = threading.Lock()  #DCGM connection start-up/shutdown is not thread safe. Just lock pessimistically
    self.m_debug = False

    # For GetAllSinceLastCall* calls. We cache the value for these objects
    # after first retrieval, so initializing them to None lets us know if
    # we've made a first retrieval. The first retrieval is based on a
    # "since" timestamp of 0, so it gets data in which we are not
    # interested in. The second retrieval gets data since the first one, in
    # which we ARE interested. The practical upshot of this is that actual
    # reporting of data is delayed one collectd sampling interval -- as if
    # the sampling was actually started one collectd sampling interval
    # later. We expect this is not an issue.
    self.fvs = None
    self.dfvc = None
    self.dfvec = None
def SetDisconnected(self):
    '''
    Delete the DCGM group, DCGM system and DCGM handle and clear the attributes
    on shutdown.
    '''
    #Force destructors since DCGM currently doesn't support more than one client connection per process
    for attrName in ("m_dcgmGroup", "m_dcgmSystem", "m_dcgmHandle"):
        if getattr(self, attrName) is not None:
            delattr(self, attrName)
            setattr(self, attrName, None)


def Shutdown(self):
    '''
    This function calls the SetDisconnected function which disconnects from
    DCGM and clears DCGM handle and DCGM group.

    Nothing happens unless m_closeHandle is set.
    '''
    with self.m_lock:
        mustClose = (self.m_closeHandle == True)
        if mustClose:
            self.SetDisconnected()


def AddDebugOutput(self):
    '''
    Turns debugging output on
    '''
    self.m_debug = True


def InitializeFromHandle(self):
    '''
    Build the system object and the monitored group from m_dcgmHandle, then
    set up the id mappings, field metadata and field watches.

    Group selection: requested GPU ids (plus any requested entities added
    one by one) take priority, then requested entities alone, otherwise the
    system's default group.
    '''
    self.m_dcgmSystem = self.m_dcgmHandle.GetSystem()

    if self.m_requestedGpuIds:
        groupName = "dcgmreader_%d" % os.getpid()
        self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithGpuIds(
            groupName, self.m_requestedGpuIds)
        if self.m_requestedEntities:
            for entity in self.m_requestedEntities:
                self.m_dcgmGroup.AddEntity(entity.entityGroupId,
                                           entity.entityId)
    elif self.m_requestedEntities:
        groupName = "dcgmreader_%d" % os.getpid()
        self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithEntities(
            groupName, self.m_requestedEntities)
    else:
        self.m_dcgmGroup = self.m_dcgmSystem.GetDefaultGroup()

    self.SetupGpuIdBusMappings()
    self.SetupGpuIdUUIdMappings()
    self.GetFieldMetadata()
    self.AddFieldWatches()
def SetHandle(self, handle):
    '''
    Has DcgmReader use but not own a handle. Currently for the unit tests.

    handle is wrapped in a pydcgm.DcgmHandle and the reader is initialized
    from it.
    '''
    wrapped = pydcgm.DcgmHandle(handle)
    self.m_dcgmHandle = wrapped
    self.InitializeFromHandle()


def Reconnect(self):
    '''
    Reconnect function checks if connection handle is present. If the handle is
    none, it creates the handle and initializes the reader from it (group,
    id mappings, field metadata and field watches). When a handle already
    exists this is a no-op.
    '''
    if self.m_dcgmHandle is None:
        self.LogDebug("Connection handle is None. Trying to reconnect")

        self.m_dcgmHandle = pydcgm.DcgmHandle(
            None, self.m_dcgmHostName, dcgm_structs.DCGM_OPERATION_MODE_AUTO)
        # This reader opened the handle, so Shutdown() may close it.
        self.m_closeHandle = True

        self.LogDebug("Connected to nv-hostengine")

        self.InitializeFromHandle()


def SetupGpuIdBusMappings(self):
    '''
    Populate the m_gpuIdToBusId map. This map contains mapping from
    gpuID to the BusID. Any previous content is discarded first.
    '''
    busIdByGpu = self.m_gpuIdToBusId = {}

    for gpuId in self.m_dcgmGroup.GetGpuIds():
        attributes = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
        busIdByGpu[gpuId] = attributes.identifiers.pciBusId
def GetFieldMetadata(self):
    '''
    Create one field group per sampling interval plus one group holding all
    field ids, and cache the metadata of every field.

    If a field group with one of our names already exists, we delete that
    group before creating the new one. Group names are
    "<m_fieldGroupName>_<n>" (n counting intervals from 1) and
    m_fieldGroupName itself for the all-fields group.

    Raises dcgm_structs.DCGMError(DCGM_ST_UNKNOWN_FIELD) when metadata for a
    field id cannot be obtained.
    '''

    def removeStaleFieldGroup(name):
        # Remove our field group if it exists already
        staleGroupId = self.m_dcgmSystem.GetFieldGroupIdByName(name)
        if staleGroupId is None:
            return
        # str() is required: concatenating a non-string id to the message
        # would raise TypeError before the stale group is removed.
        self.LogDebug("fieldGroupId: " + str(staleGroupId) + "\n")
        staleGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                           fieldGroupId=staleGroupId)
        staleGroup.Delete()
        del staleGroup

    self.m_fieldIdToInfo = {}
    self.m_fieldGroups = {}
    self.m_fieldGroup = None
    allFieldIds = []  # first-seen order, no duplicates
    seenFieldIds = set()

    # Initialize groups for all field intervals.
    self.LogDebug("GetFieldMetaData:\n")

    for intervalIndex, (interval, fieldIds) in enumerate(
            self.m_publishFields.items(), start=1):
        self.LogDebug("sampling interval = " + str(interval) + ":\n")
        for fieldId in fieldIds:
            self.LogDebug(" fieldId: " + str(fieldId) + "\n")

        fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
        self.LogDebug("fieldGroupName: " + fieldGroupName + "\n")
        removeStaleFieldGroup(fieldGroupName)

        self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
            self.m_dcgmHandle, fieldGroupName, fieldIds)

        for fieldId in fieldIds:
            if fieldId not in seenFieldIds:
                seenFieldIds.add(fieldId)
                allFieldIds.append(fieldId)

            fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
            self.m_fieldIdToInfo[fieldId] = fieldInfo
            if fieldInfo is None or fieldInfo == 0:
                self.LogError(
                    "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                    % (fieldId))
                raise dcgm_structs.DCGMError(
                    dcgm_structs.DCGM_ST_UNKNOWN_FIELD)

    # Initialize a field group of ALL fields.
    fieldGroupName = self.m_fieldGroupName
    removeStaleFieldGroup(fieldGroupName)

    self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                              fieldGroupName, allFieldIds)
def LogInfo(self, msg):
    '''
    Log msg at info level through the logging module.
    '''
    logging.info(msg)


def LogDebug(self, msg):
    '''
    Log msg at debug level through the logging module.
    '''
    logging.debug(msg)


def LogError(self, msg):
    '''
    Log msg at error level through the logging module.
    '''
    logging.error(msg)


def GetLatestGpuValuesAsDict(self, mapById):
    '''
    This function gets each value as a dictionary of dictionaries. The dictionary
    returned is each gpu id mapped to a dictionary of its field values. Each
    field value dictionary is the field name mapped to the value or the field
    id mapped to value depending on the parameter mapById.

    Only the newest sample of each field is reported and blank samples are
    skipped. When the connection is not valid the error is logged, the
    reader is disconnected and whatever was collected so far is returned.
    '''
    latestByGpu = {}
    # Same loose comparison as before: only a value equal to False selects tags.
    keyByTag = (mapById == False)

    with self.m_lock:
        try:
            self.Reconnect()
            samples = self.m_dcgmGroup.samples.GetLatest(
                self.m_fieldGroup).values
            for gpuId in list(samples.keys()):
                perGpu = latestByGpu[gpuId] = {}  # initialize the gpu's dictionary
                gpuSamples = samples[gpuId]

                for fieldId in list(gpuSamples.keys()):
                    newest = gpuSamples[fieldId][-1]
                    if newest.isBlank:
                        continue

                    if keyByTag:
                        key = self.m_fieldIdToInfo[fieldId].tag
                    else:
                        key = fieldId
                    perGpu[key] = newest.value
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError(
                "Can't connection to nv-hostengine. Please verify that it is running."
            )
            self.SetDisconnected()

    return latestByGpu
# NOTE(review): the definitions below are instance methods (each takes ``self``);
# the header of their enclosing class lies before this chunk.


def GetAllGpuValuesAsDictSinceLastCall(self, mapById):
    '''
    Return every non-blank sample gathered since the previous retrieval as a
    dictionary of dictionaries of lists: gpu id -> field key -> samples.

    mapById: when it compares equal to False the field key is the field tag
             (name); for any other value it is the field id.

    Nothing is reported while no earlier collection is stored in self.fvs;
    that call only primes the collection. The stored collection is emptied
    before returning. On DCGM_ST_CONNECTION_NOT_VALID the error is logged and
    SetDisconnected() is called.
    '''
    result = {}

    with self.m_lock:
        try:
            self.Reconnect()
            hadPrevious = self.fvs is not None
            self.fvs = self.m_dcgmGroup.samples.GetAllSinceLastCall(
                self.fvs, self.m_fieldGroup)
            if hadPrevious:
                perGpu = self.fvs.values
                for gpuId in list(perGpu.keys()):
                    gpuEntry = result[gpuId] = {}
                    fieldSeries = perGpu[gpuId]

                    for fieldId in list(fieldSeries.keys()):
                        for sample in fieldSeries[fieldId]:
                            if sample.isBlank:
                                continue

                            # The tag is looked up only for non-blank samples
                            if mapById == False:
                                key = self.m_fieldIdToInfo[fieldId].tag
                            else:
                                key = fieldId
                            gpuEntry.setdefault(key, []).append(sample)
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError(
                "Can't connection to nv-hostengine. Please verify that it is running."
            )
            self.SetDisconnected()

        if self.fvs is not None:
            self.fvs.EmptyValues()

    return result


def GetLatestGpuValuesAsFieldIdDict(self):
    '''Latest values per GPU, keyed by field id.'''
    return self.GetLatestGpuValuesAsDict(True)


def GetLatestGpuValuesAsFieldNameDict(self):
    '''Latest values per GPU, keyed by field tag.'''
    return self.GetLatestGpuValuesAsDict(False)


def GetAllGpuValuesAsFieldIdDictSinceLastCall(self):
    '''All samples since the last retrieval per GPU, keyed by field id.'''
    return self.GetAllGpuValuesAsDictSinceLastCall(True)


def GetAllGpuValuesAsFieldNameDictSinceLastCall(self):
    '''All samples since the last retrieval per GPU, keyed by field tag.'''
    return self.GetAllGpuValuesAsDictSinceLastCall(False)
class DcgmStatus:
    '''
    Owns a DCGM status handle and accumulates the errors popped from it.

    handle: status handle returned by dcgm_agent.dcgmStatusCreate()
    errors: list of the error objects popped from the handle so far
    '''

    def __init__(self):
        self.handle = dcgm_agent.dcgmStatusCreate()
        self.errors = []

    def __del__(self):
        # __del__ also runs when __init__ raised before ``handle`` was
        # assigned; skip the destroy call then instead of failing on a
        # missing attribute.
        handle = getattr(self, 'handle', None)
        if handle is not None:
            dcgm_agent.dcgmStatusDestroy(handle)

    def UpdateErrors(self):
        '''
        Take any errors stored in our handle and append them to self.errors
        '''
        errorCount = dcgm_agent.dcgmStatusGetCount(self.handle)
        for _ in range(errorCount):
            self.errors.append(dcgm_agent.dcgmStatusPopError(self.handle))

    def ThrowExceptionOnErrors(self):
        '''
        Raise dcgm_structs.DCGMError if any errors are stored in our status handle.

        The exception is built from the status code of the most recently
        recorded error only; the full list remains available in self.errors.
        '''
        # Make sure we've captured all errors before looking at them
        self.UpdateErrors()

        if not self.errors:
            return

        # NOTE(review): the previous code also assembled an "Errors: ..."
        # string that was never attached to the exception; only the status
        # code was ever raised, so that dead string building was dropped.
        raise dcgm_structs.DCGMError(self.errors[-1].status)
class DcgmSystemDiscovery:
    '''
    Discovery queries (GPU ids, attributes, topology, entities, NvLink
    status) issued through a single DCGM connection.
    '''

    def __init__(self, dcgmHandle):
        '''
        dcgmHandle: connection object whose .handle attribute is passed to
                    every dcgm_agent call made by this class
        '''
        self._dcgmHandle = dcgmHandle

    def GetAllGpuIds(self):
        '''
        Get all IDs of the GPUs that DCGM knows about. To get only the GPUs
        that DCGM supports, use GetAllSupportedGpuIds().

        Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
        '''
        return dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)

    def GetAllSupportedGpuIds(self):
        '''
        Get all IDs of the GPUs that DCGM supports. Unsupported GPUs are excluded.

        Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
        '''
        return dcgm_agent.dcgmGetAllSupportedDevices(self._dcgmHandle.handle)

    def GetGpuAttributes(self, gpuId):
        '''
        Get some basic GPU attributes for a given GPU ID.

        Returns a dcgm_structs.c_dcgmDeviceAttributes_v3() object for the given GPU
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)

    def GetGpuTopology(self, gpuId):
        '''
        Get topology information for a given GPU ID.

        Returns a dcgm_structs.c_dcgmDeviceTopology_v1 structure representing
        the topology for the given GPU
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmGetDeviceTopology(handle, gpuId)

    def GetEntityGroupEntities(self, entityGroupId, onlySupported):
        '''
        Get all entityIds of the entities that DCGM knows about.

        entityGroupId IN: DCGM_FE_? constant of the entity group to fetch the entities of
        onlySupported IN: Boolean as to whether to fetch only the entities that are
                          supported by DCGM (True) or all entity IDs (False)

        Returns an array of entity IDs. Each of these can be passed to DcgmGroup::AddEntity()
        '''
        flags = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED if onlySupported else 0
        return dcgm_agent.dcgmGetEntityGroupEntities(self._dcgmHandle.handle,
                                                     entityGroupId, flags)

    def GetNvLinkLinkStatus(self):
        '''
        Get the status of all of the NvLink links in the system.

        Returns a dcgm_structs.c_dcgmNvLinkStatus_v3 object.
        '''
        return dcgm_agent.dcgmGetNvLinkLinkStatus(self._dcgmHandle.handle)

    def SelectGpusByTopology(self, inputGpuIds, numGpus, hintFlags):
        '''
        From a bitmask of input gpu ids, return a bitmask of numGpus GPUs which
        identifies the topologically closest GPUs to use for a single job. DCGM
        will consider CPU affinities and NVLink connection speeds to determine
        the closest.

        hintFlags can instruct DCGM to consider GPU health or not. By default,
        unhealthy GPUs are excluded from consideration.
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmSelectGpusByTopology(handle, inputGpuIds, numGpus,
                                                   hintFlags)


class DcgmSystemIntrospect:
    '''
    Class to access the system-wide introspection modules of DCGM
    '''

    def __init__(self, dcgmHandle):
        self._handle = dcgmHandle
        # Per-topic helpers sharing the same connection
        self.memory = DcgmSystemIntrospectMemory(dcgmHandle)
        self.cpuUtil = DcgmSystemIntrospectCpuUtil(dcgmHandle)

    def UpdateAll(self, waitForUpdate=True):
        '''Forward to dcgm_agent.dcgmIntrospectUpdateAll() for this connection.'''
        dcgm_agent.dcgmIntrospectUpdateAll(self._handle.handle, waitForUpdate)


class DcgmSystemIntrospectMemory:
    '''
    Class to access information about the memory usage of DCGM itself
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Retrieve the total amount of virtual memory that the hostengine process
        is currently using. This measurement represents both the resident set
        size (what is currently in RAM) and the swapped memory that belongs to
        the process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectMemory_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet
        and waitIfNoData is False
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmIntrospectGetHostengineMemoryUsage(
            handle, waitIfNoData)


class DcgmSystemIntrospectCpuUtil:
    '''
    Class to access information about the CPU Utilization of DCGM
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Get the current CPU Utilization of the hostengine process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectCpuUtil_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet
        and waitIfNoData is False
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmIntrospectGetHostengineCpuUtilization(
            handle, waitIfNoData)


class DcgmSystemFields:
    '''
    Class to encapsulate DCGM field-metadata requests
    '''

    def GetFieldById(self, fieldId):
        '''
        Get a field's metadata by its dcgm_fields.DCGM_FI_* field ID

        fieldId: dcgm_fields.DCGM_FI_* field ID of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        return dcgm_fields.DcgmFieldGetById(fieldId)

    def GetFieldByTag(self, tag):
        '''
        Get a field's metadata by its tag name. Ex: 'brand'

        tag: Tag name of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        return dcgm_fields.DcgmFieldGetByTag(tag)


class DcgmSystemModules:
    '''
    Class to encapsulate DCGM module management and introspection
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def Denylist(self, moduleId):
        '''
        Denylist a module from being loaded by DCGM.

        moduleId: a dcgm_structs.dcgmModuleId* ID of the module to denylist

        Returns: Nothing.
        Raises a DCGM_ST_IN_USE exception if the module was already loaded
        '''
        dcgm_agent.dcgmModuleDenylist(self._dcgmHandle.handle, moduleId)

    def GetStatuses(self):
        '''
        Get the statuses of all of the modules in DCGM

        Returns: a dcgm_structs.c_dcgmModuleGetStatuses_v1 structure.
        '''
        return dcgm_agent.dcgmModuleGetStatuses(self._dcgmHandle.handle)


class DcgmSystemProfiling:
    '''
    Class to encapsulate DCGM profiling
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def Pause(self):
        '''
        Pause profiling activities in DCGM. This should be used when you are
        monitoring profiling fields from DCGM but want to be able to still run
        developer tools like nvprof, nsight systems, and nsight compute.
        Profiling fields start with DCGM_PROF_ and are in the field ID range
        1001-1012.

        Call this API before you launch one of those tools and Resume() after
        the tool has completed.

        DCGM will save BLANK values while profiling is paused.
        Calling this while profiling activities are already paused is fine and
        will be treated as a no-op.
        '''
        return dcgm_agent.dcgmProfPause(self._dcgmHandle.handle)

    def Resume(self):
        '''
        Resume profiling activities in DCGM that were previously paused with Pause().

        Call this API after you have completed running other NVIDIA developer
        tools to reenable DCGM profiling metrics.

        DCGM will save BLANK values while profiling is paused.

        Calling this while profiling activities have already been resumed is
        fine and will be treated as a no-op.
        '''
        return dcgm_agent.dcgmProfResume(self._dcgmHandle.handle)
class DcgmSystem:
    '''
    Class to encapsulate global DCGM methods. These apply to a single
    DcgmHandle, provided to the constructor.

    dcgmHandle is a pydcgm.DcgmHandle instance of the connection that will be
    used by all methods of this class
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

        # Child classes
        self.discovery = DcgmSystemDiscovery(self._dcgmHandle)
        self.introspect = DcgmSystemIntrospect(self._dcgmHandle)
        self.fields = DcgmSystemFields()
        self.modules = DcgmSystemModules(self._dcgmHandle)
        self.profiling = DcgmSystemProfiling(self._dcgmHandle)

    def UpdateAllFields(self, waitForUpdate):
        '''
        Request that the host engine perform a field value update cycle. If the
        host engine was started in DCGM_OPERATION_MODE_MANUAL, calling this
        method is the only way that field values will be updated.

        Note that performing a field value update cycle does not update every
        field. It only updates fields that are newly watched or fields that
        haven't updated in enough time to warrant updating again, based on
        their update frequency.

        waitForUpdate specifies whether this function call should block until
        the field value update loop is complete or not. Use True if you intend
        to query values immediately after calling this.
        '''
        ret = dcgm_agent.dcgmUpdateAllFields(self._dcgmHandle.handle,
                                             waitForUpdate)
        # Throw an exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def GetDefaultGroup(self):
        '''
        Get a DcgmGroup instance for the default all-GPUs group. This object is
        used to perform operations on a group of GPUs. See DcgmGroup.py for
        details.

        AddGpu() and RemoveGpu() operations are not allowed on the default group
        '''
        return pydcgm.DcgmGroup(self._dcgmHandle,
                                groupId=dcgm_structs.DCGM_GROUP_ALL_GPUS)

    def GetEmptyGroup(self, groupName):
        '''
        Get an instance of DcgmGroup with no GPUs. Call AddGpu() on the
        returned object with GPU IDs from GetAllGpuIds() before performing
        actions on the returned DcgmGroup instance.

        groupName is the name of the group to create in the host engine. This
        name must be unique.

        Note: The group will be deleted from the host engine when the returned
        object goes out of scope
        '''
        return pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)

    def GetGroupWithGpuIds(self, groupName, gpuIds):
        '''
        Get an instance of DcgmGroup populated with the gpuIds provided

        groupName is the name of the group to create in the host engine. This
        name must be unique.
        gpuIds is the list of GPU IDs to add to the group

        Note: The group will be deleted from the host engine when the returned
        object goes out of scope
        '''
        newGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for gpuId in gpuIds:
            newGroup.AddGpu(gpuId)
        return newGroup

    def GetGroupWithEntities(self, groupName, entities):
        '''
        Get an instance of DcgmGroup populated with the provided entities

        groupName is the name of the group to create in the host engine. This
        name must be unique.
        entities is the list of entity pairs (type and id) to add to the group;
        each item must expose .entityGroupId and .entityId

        Note: The group will be deleted from the host engine when the returned
        object goes out of scope
        '''
        group = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for entity in entities:
            group.AddEntity(entity.entityGroupId, entity.entityId)

        return group

    def GetAllGroupIds(self):
        '''
        Get ids of all DcgmGroups of GPUs. This returns a list containing the
        ids of the DcgmGroups.
        '''
        return dcgm_agent.dcgmGroupGetAllIds(self._dcgmHandle.handle)

    def GetAllFieldGroups(self):
        '''
        Get all of the field groups in the system
        '''
        return dcgm_agent.dcgmFieldGroupGetAll(self._dcgmHandle.handle)

    def GetFieldGroupIdByName(self, name):
        '''
        Get a field group's id by its name.

        Returns: Field group ID (wrapped in ctypes.c_void_p) if found
                 None if not found
        '''
        allGroups = self.GetAllFieldGroups()
        # Only the first numFieldGroups entries of fieldGroups are consulted
        for i in range(0, allGroups.numFieldGroups):
            if allGroups.fieldGroups[i].fieldGroupName == name:
                return ctypes.c_void_p(allGroups.fieldGroups[i].fieldGroupId)

        return None

    @staticmethod
    def _LoadAgentInternal():
        '''
        Import and return the dcgm_agent_internal module.

        The package-qualified path used by the other imports of this file is
        tried first; the bare top-level import is kept as a fallback so that
        setups that put the bindings directory on sys.path keep working.
        '''
        try:
            import model_analyzer.monitor.dcgm.dcgm_agent_internal as dcgm_agent_internal
        except ImportError:
            import dcgm_agent_internal
        return dcgm_agent_internal

    def PauseTelemetryForDiag(self):
        """Pause DCGM modules from updating field values."""
        dcgm_agent_internal = self._LoadAgentInternal()
        dcgm_agent_internal.dcgmPauseTelemetryForDiag(self._dcgmHandle.handle)

    def ResumeTelemetryForDiag(self):
        """Resume previously paused DCGM modules so that they can update field values."""
        dcgm_agent_internal = self._LoadAgentInternal()
        dcgm_agent_internal.dcgmResumeTelemetryForDiag(self._dcgmHandle.handle)
###############################################################################
# model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py
###############################################################################
def create_parser(
        publish_port=8000,
        interval=10,
        name='the monitoring tool',  # Replace with 'prometheus', 'telegraf', etc.
        field_ids=None,
        log_file=None,
        log_level='INFO',
        dcgm_hostname=environ.get('DCGM_HOSTNAME') or 'localhost',
):
    '''
    Create a parser that defaults to sane parameters.

    The default parameters can be overridden through keyword arguments.

    Note: if DCGM_HOSTNAME is set as an environment variable, it is used as
    the default instead of localhost. The environment is read once, when this
    function is defined.

    Returns the configured argparse.ArgumentParser.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p',
        '--publish-port',
        dest='publish_port',
        type=int,
        default=publish_port,
        help='TCP port that the client should publish to. Default={}.'.format(
            publish_port))
    parser.add_argument(
        '-i',
        '--interval',
        dest='interval',
        type=int,
        default=interval,
        help=
        'How often the client should retrieve new values from DCGM in seconds. Default={}.'
        .format(interval))
    parser.add_argument(
        '-f',
        '--field-ids',
        dest='field_ids',
        type=str,
        default=field_ids,
        help=
        'Comma-separated list of field IDs that should be retrieved from DCGM. '
        +
        'The full list of available field IDs can be obtained from dcgm_fields.h, dcgm_fields.py, '
        + 'or running \'dcgmi dmon -l\'.')
    parser.add_argument(
        '--log-file',
        dest='logfile',
        type=str,
        default=log_file,
        help=
        'A path to a log file for recording what information is being sent to {}'
        .format(name))
    parser.add_argument(
        '--log-level',
        dest='loglevel',
        type=str,
        default=log_level,
        help=
        'Specify a log level to use for logging.\n\tCRITICAL (0) - log only critical errors that drastically affect execution'
        +
        '\n\tERROR (1) - Log any error in execution\n\tWARNING (2) - Log all warnings and errors that occur'
        +
        '\n\tINFO (3) - Log informational messages about program execution in addition to warnings and errors'
        +
        '\n\tDEBUG (4) - Log debugging information in addition to all information about execution'
        + '\nDefault: {}'.format(log_level))

    # --hostname and --embedded cannot be combined
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '-n',
        '--hostname',
        dest='hostname',
        type=str,
        default=dcgm_hostname,
        help=
        'IP/hostname where the client should query DCGM for values. Default={} (all interfaces).'
        .format(dcgm_hostname))
    group.add_argument(
        '-e',
        '--embedded',
        dest='embedded',
        action='store_true',
        help=
        'Launch DCGM from within this process instead of connecting to nv-hostengine.'
    )

    return parser


def add_custom_argument(parser, *args, **kwargs):
    '''Forward args/kwargs unchanged to parser.add_argument().'''
    parser.add_argument(*args, **kwargs)


###############################################################################
def add_target_host_argument(name, parser, default_target='localhost'):
    '''
    Add the -t/--publish-hostname option (dest: publish_hostname) to parser.

    name is only used in the help text; default_target is the option's default.
    '''
    parser.add_argument(
        '-t',
        '--publish-hostname',
        dest='publish_hostname',
        type=str,
        default=default_target,
        help='The hostname at which the client will publish the readings to {}'.
        format(name))


###############################################################################
def run_parser(parser):
    '''
    Run a parser created using create_parser
    '''
    return parser.parse_args()


###############################################################################
def get_field_ids(args):
    '''
    Return the field ids selected by args.field_ids.

    A string (user-supplied on the command line) is split on commas and each
    token converted with int(); any other value is the default object and is
    returned untouched.
    '''
    # This indicates the user supplied a string, so we should override the
    # default
    if isinstance(args.field_ids, str):
        return [int(token) for token in args.field_ids.split(",")]
    # The default object should already be an array of ints. Just return it
    return args.field_ids


###############################################################################
def get_log_level(args):
    '''
    Translate args.loglevel (a level name or a digit 0-4, case-insensitive)
    into the matching ``logging`` constant.

    An unknown value prints a diagnostic and exits the process with status 2.
    '''
    levels = {
        '0': logging.CRITICAL,
        'CRITICAL': logging.CRITICAL,
        '1': logging.ERROR,
        'ERROR': logging.ERROR,
        '2': logging.WARNING,
        'WARNING': logging.WARNING,
        '3': logging.INFO,
        'INFO': logging.INFO,
        '4': logging.DEBUG,
        'DEBUG': logging.DEBUG,
    }
    try:
        return levels[args.loglevel.upper()]
    except KeyError:
        # ``args`` is the parsed Namespace, not the parser, so it has no
        # print_help(); report the accepted values directly instead.
        print("Could not understand the specified --log-level '%s'" %
              (args.loglevel),
              file=sys.stderr)
        print("Expected one of CRITICAL (0), ERROR (1), WARNING (2), INFO (3), DEBUG (4)",
              file=sys.stderr)
        sys.exit(2)


###############################################################################
def parse_command_line(name, default_port, add_target_host=False):
    '''
    Parse sys.argv for a DCGM client and return the settings as a dictionary.

    name:            client name shown in help texts
    default_port:    default for --publish-port
    add_target_host: also accept -t/--publish-hostname

    Keys of the result: publish_port, interval, logfile, dcgm_hostname (None
    when --embedded was given), field_ids, log_level and, only when
    add_target_host is true, publish_hostname.
    '''
    # Fields we accept raw from the CLI
    FIELDS_AS_IS = ['publish_port', 'interval', 'logfile', 'publish_hostname']

    parser = create_parser(
        name=name,
        publish_port=default_port,
    )

    if add_target_host:
        add_target_host_argument(name, parser)

    args = run_parser(parser)
    field_ids = get_field_ids(args)
    log_level = get_log_level(args)

    args_as_dict = vars(args)
    # publish_hostname only exists when the target-host option was added;
    # copying it unconditionally raised KeyError for every other client.
    settings = {
        field: args_as_dict[field]
        for field in FIELDS_AS_IS
        if field in args_as_dict
    }
    settings['dcgm_hostname'] = None if args.embedded else args.hostname
    settings['field_ids'] = field_ids
    settings['log_level'] = log_level

    return settings


###############################################################################
# model_analyzer/monitor/dcgm/common/dcgm_client_main.py
###############################################################################
def exit_handler(signum, frame):
    '''Signal handler that terminates the process by raising SystemExit.'''
    # The Prometheus client does something smarter but more complex
    # Here we just exit. SystemExit is raised directly because the exit()
    # helper is only injected by the ``site`` module.
    raise SystemExit


###############################################################################
def initialize_signal_handlers():
    '''Install exit_handler for SIGINT and SIGTERM.'''
    signal.signal(signal.SIGINT, exit_handler)
    signal.signal(signal.SIGTERM, exit_handler)


###############################################################################
def main(DRConstructor, name, default_port, add_target_host=False):
    '''
    This main function should work for most DCGM clients. It creates a
    DcgmReader object using DRConstructor and enters a loop that queries DCGM
    for data

    Arguments
    ---------
    DRConstructor:  A constructor for a DcgmReader. The constructor must
                    accept the following keyword arguments:
                    - hostname: DCGM hostname
                    - publish_port: port on which the data is published
                    In some cases, the constructor will also need to accept:
                    - publish_hostname: hostname the data is published to
                    - fieldIds: field ids to query and publish
    name:           The name of the client. This is displayed to the user
    default_port:   Default port to publish to

    Keyword arguments
    -----------------
    add_target_host: Boolean that indicates whether this client accepts a
                     publish hostname

    '''

    initialize_signal_handlers()
    settings = cli.parse_command_line(
        name,
        default_port,
        add_target_host=add_target_host,
    )

    # Create a dictionary for the arguments because field_ids might not be
    # provided (if it's None) when we want to use the default in DcgmReader
    dr_args = {
        'hostname': settings['dcgm_hostname'],
        'publish_port': settings['publish_port'],
    }

    # publish_hostname is only available if we add the target_host parameter
    if add_target_host:
        dr_args['publish_hostname'] = settings['publish_hostname']

    if settings['field_ids']:
        dr_args['fieldIds'] = settings['field_ids']

    dr = DRConstructor(**dr_args)

    try:
        while True:
            dr.Process()
            sleep(settings['interval'])
    except KeyboardInterrupt:
        print('Caught CTRL-C. Exiting')
def ensure_byte_strings():
    """
    Build a decorator that ensures we don't call C APIs with unicode strings
    in the arguments: every ``str`` positional or keyword argument is encoded
    to UTF-8 ``bytes`` before the wrapped function is called.

    Only top-level arguments are converted (strings nested inside containers
    are left alone) and the wrapped function's return value is passed back
    unchanged.
    """

    def to_bytes(value):
        # Anything that is not a str (ints, handles, structs, bytes) passes through
        return bytes(value, 'utf-8') if isinstance(value, str) else value

    def decorator(fn):

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            encodedArgs = tuple(to_bytes(arg) for arg in args)
            encodedKwargs = {key: to_bytes(val) for key, val in kwargs.items()}
            return fn(*encodedArgs, **encodedKwargs)

        return wrapper

    return decorator
dcgmStartEmbedded(opMode): dcgm_handle = c_void_p() fn = dcgmFP("dcgmStartEmbedded") @@ -62,6 +92,7 @@ def dcgmStartEmbedded(opMode): return dcgm_handle +@ensure_byte_strings() def dcgmStopEmbedded(dcgm_handle): fn = dcgmFP("dcgmStopEmbedded") ret = fn(dcgm_handle) @@ -69,6 +100,7 @@ def dcgmStopEmbedded(dcgm_handle): return ret +@ensure_byte_strings() def dcgmConnect(ip_address): dcgm_handle = c_void_p() fn = dcgmFP("dcgmConnect") @@ -77,9 +109,10 @@ def dcgmConnect(ip_address): return dcgm_handle -def dcgmConnect_v2( - ip_address, connectParams, version=dcgm_structs.c_dcgmConnectV2Params_version -): +@ensure_byte_strings() +def dcgmConnect_v2(ip_address, + connectParams, + version=dcgm_structs.c_dcgmConnectV2Params_version): connectParams.version = version dcgm_handle = c_void_p() fn = dcgmFP("dcgmConnect_v2") @@ -88,6 +121,7 @@ def dcgmConnect_v2( return dcgm_handle +@ensure_byte_strings() def dcgmDisconnect(dcgm_handle): fn = dcgmFP("dcgmDisconnect") ret = fn(dcgm_handle) @@ -95,6 +129,7 @@ def dcgmDisconnect(dcgm_handle): return ret +@ensure_byte_strings() def dcgmGetAllSupportedDevices(dcgm_handle): c_count = c_uint() gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES @@ -102,9 +137,10 @@ def dcgmGetAllSupportedDevices(dcgm_handle): fn = dcgmFP("dcgmGetAllSupportedDevices") ret = fn(dcgm_handle, c_gpuid_list, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]] + return list(c_gpuid_list[0:int(c_count.value)]) +@ensure_byte_strings() def dcgmGetAllDevices(dcgm_handle): c_count = c_uint() gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES @@ -112,18 +148,26 @@ def dcgmGetAllDevices(dcgm_handle): fn = dcgmFP("dcgmGetAllDevices") ret = fn(dcgm_handle, c_gpuid_list, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]] + return list(c_gpuid_list[0:int(c_count.value)]) -def 
@ensure_byte_strings()
def dcgmGetDeviceAttributes(dcgm_handle,
                            gpuId,
                            version=dcgm_structs.dcgmDeviceAttributes_version3):
    """
    Query the attributes of one GPU.

    Only dcgm_structs.dcgmDeviceAttributes_version3 is handled; any other
    version is reported through _dcgmCheckReturn(DCGM_ST_VER_MISMATCH).

    Returns the filled dcgm_structs.c_dcgmDeviceAttributes_v3 structure.
    """
    getAttributes = dcgmFP("dcgmGetDeviceAttributes")
    if version == dcgm_structs.dcgmDeviceAttributes_version3:
        attributes = dcgm_structs.c_dcgmDeviceAttributes_v3()
        attributes.version = dcgm_structs.dcgmDeviceAttributes_version3
    else:
        dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)

    status = getAttributes(dcgm_handle, c_int(gpuId), byref(attributes))
    dcgm_structs._dcgmCheckReturn(status)
    return attributes


@ensure_byte_strings()
def dcgmGetNvLinkLinkStatus(dcgm_handle):
    """
    Query the NvLink link status and return the filled
    dcgm_structs.c_dcgmNvLinkStatus_v3 structure.
    """
    nvLinkStatus = dcgm_structs.c_dcgmNvLinkStatus_v3()
    nvLinkStatus.version = dcgm_structs.dcgmNvLinkStatus_version3
    getStatus = dcgmFP("dcgmGetNvLinkLinkStatus")
    status = getStatus(dcgm_handle, byref(nvLinkStatus))
    dcgm_structs._dcgmCheckReturn(status)
    return nvLinkStatus


@ensure_byte_strings()
def dcgmGetGpuInstanceHierarchy(dcgm_handle):
    """
    Query the GPU instance (MIG) hierarchy and return the filled
    dcgm_structs.c_dcgmMigHierarchy_v2 structure.
    """
    migHierarchy = dcgm_structs.c_dcgmMigHierarchy_v2()
    migHierarchy.version = dcgm_structs.c_dcgmMigHierarchy_version2
    getHierarchy = dcgmFP("dcgmGetGpuInstanceHierarchy")
    status = getHierarchy(dcgm_handle, byref(migHierarchy))
    dcgm_structs._dcgmCheckReturn(status)
    return migHierarchy
dcgmFP("dcgmCreateMigEntity") cme = dcgm_structs.c_dcgmCreateMigEntity_v1() @@ -165,6 +212,7 @@ def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags): dcgm_structs._dcgmCheckReturn(ret) +@ensure_byte_strings() def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags): fn = dcgmFP("dcgmDeleteMigEntity") dme = dcgm_structs.c_dcgmDeleteMigEntity_v1() @@ -176,6 +224,7 @@ def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags): dcgm_structs._dcgmCheckReturn(ret) +@ensure_byte_strings() def dcgmGroupCreate(dcgm_handle, type, groupName): c_group_id = c_void_p() fn = dcgmFP("dcgmGroupCreate") @@ -184,6 +233,7 @@ def dcgmGroupCreate(dcgm_handle, type, groupName): return c_group_id +@ensure_byte_strings() def dcgmGroupDestroy(dcgm_handle, group_id): fn = dcgmFP("dcgmGroupDestroy") ret = fn(dcgm_handle, group_id) @@ -191,6 +241,7 @@ def dcgmGroupDestroy(dcgm_handle, group_id): return ret +@ensure_byte_strings() def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id): fn = dcgmFP("dcgmGroupAddDevice") ret = fn(dcgm_handle, group_id, gpu_id) @@ -198,6 +249,7 @@ def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id): return ret +@ensure_byte_strings() def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, entityId): fn = dcgmFP("dcgmGroupAddEntity") ret = fn(dcgm_handle, group_id, entityGroupId, entityId) @@ -205,6 +257,7 @@ def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, entityId): return ret +@ensure_byte_strings() def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id): fn = dcgmFP("dcgmGroupRemoveDevice") ret = fn(dcgm_handle, group_id, gpu_id) @@ -212,6 +265,7 @@ def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id): return ret +@ensure_byte_strings() def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, entityId): fn = dcgmFP("dcgmGroupRemoveEntity") ret = fn(dcgm_handle, group_id, entityGroupId, entityId) @@ -219,12 +273,13 @@ def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, 
entityId): return ret -def dcgmGroupGetInfo( - dcgm_handle, group_id, version=dcgm_structs.c_dcgmGroupInfo_version2 -): +@ensure_byte_strings() +def dcgmGroupGetInfo(dcgm_handle, + group_id, + version=dcgm_structs.c_dcgmGroupInfo_version2): fn = dcgmFP("dcgmGroupGetInfo") - # support the old version of the request since the host engine does + #support the old version of the request since the host engine does if version == dcgm_structs.c_dcgmGroupInfo_version2: device_values = dcgm_structs.c_dcgmGroupInfo_v2() device_values.version = dcgm_structs.c_dcgmGroupInfo_version2 @@ -236,6 +291,7 @@ def dcgmGroupGetInfo( return device_values +@ensure_byte_strings() def dcgmGroupGetAllIds(dcgmHandle): fn = dcgmFP("dcgmGroupGetAllIds") c_count = c_uint() @@ -243,25 +299,22 @@ def dcgmGroupGetAllIds(dcgmHandle): c_groupIdList = groupIdList() ret = fn(dcgmHandle, c_groupIdList, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return map(None, c_groupIdList[0 : int(c_count.value)]) + return list(c_groupIdList[0:int(c_count.value)]) +@ensure_byte_strings() def dcgmFieldGroupCreate(dcgm_handle, fieldIds, fieldGroupName): c_field_group_id = c_void_p() c_num_field_ids = c_int32(len(fieldIds)) c_field_ids = (c_uint16 * len(fieldIds))(*fieldIds) fn = dcgmFP("dcgmFieldGroupCreate") - ret = fn( - dcgm_handle, - c_num_field_ids, - byref(c_field_ids), - fieldGroupName, - byref(c_field_group_id), - ) + ret = fn(dcgm_handle, c_num_field_ids, byref(c_field_ids), fieldGroupName, + byref(c_field_group_id)) dcgm_structs._dcgmCheckReturn(ret) return c_field_group_id +@ensure_byte_strings() def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId): fn = dcgmFP("dcgmFieldGroupDestroy") ret = fn(dcgm_handle, fieldGroupId) @@ -269,6 +322,7 @@ def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId): return ret +@ensure_byte_strings() def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId): c_fieldGroupInfo = dcgm_structs.c_dcgmFieldGroupInfo_v1() c_fieldGroupInfo.version = 
dcgm_structs.dcgmFieldGroupInfo_version1 @@ -279,6 +333,7 @@ def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId): return c_fieldGroupInfo +@ensure_byte_strings() def dcgmFieldGroupGetAll(dcgm_handle): c_allGroupInfo = dcgm_structs.c_dcgmAllFieldGroup_v1() c_allGroupInfo.version = dcgm_structs.dcgmAllFieldGroup_version1 @@ -288,6 +343,7 @@ def dcgmFieldGroupGetAll(dcgm_handle): return c_allGroupInfo +@ensure_byte_strings() def dcgmStatusCreate(): c_status_handle = c_void_p() fn = dcgmFP("dcgmStatusCreate") @@ -296,6 +352,7 @@ def dcgmStatusCreate(): return c_status_handle +@ensure_byte_strings() def dcgmStatusDestroy(status_handle): fn = dcgmFP("dcgmStatusDestroy") ret = fn(status_handle) @@ -303,6 +360,7 @@ def dcgmStatusDestroy(status_handle): return ret +@ensure_byte_strings() def dcgmStatusGetCount(status_handle): c_count = c_uint() fn = dcgmFP("dcgmStatusGetCount") @@ -311,6 +369,7 @@ def dcgmStatusGetCount(status_handle): return c_count.value +@ensure_byte_strings() def dcgmStatusPopError(status_handle): c_errorInfo = dcgm_structs.c_dcgmErrorInfo_v1() fn = dcgmFP("dcgmStatusPopError") @@ -321,6 +380,7 @@ def dcgmStatusPopError(status_handle): return None +@ensure_byte_strings() def dcgmStatusClear(status_handle): fn = dcgmFP("dcgmStatusClear") ret = fn(status_handle) @@ -328,6 +388,7 @@ def dcgmStatusClear(status_handle): return ret +@ensure_byte_strings() def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle): fn = dcgmFP("dcgmConfigSet") configToSet.version = dcgm_structs.dcgmDeviceConfig_version1 @@ -336,6 +397,7 @@ def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle): return ret +@ensure_byte_strings() def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle): fn = dcgmFP("dcgmConfigGet") @@ -345,11 +407,13 @@ def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle): for index in range(0, count): c_config_values[index].version = dcgm_structs.dcgmDeviceConfig_version1 - ret = fn(dcgm_handle, 
group_id, reqCfgType, count, c_config_values, status_handle) + ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values, + status_handle) dcgm_structs._dcgmCheckReturn(ret) - return map(None, c_config_values[0:count]) + return list(c_config_values[0:count]) +@ensure_byte_strings() def dcgmConfigEnforce(dcgm_handle, group_id, status_handle): fn = dcgmFP("dcgmConfigEnforce") ret = fn(dcgm_handle, group_id, status_handle) @@ -358,6 +422,7 @@ def dcgmConfigEnforce(dcgm_handle, group_id, status_handle): # This method is used to tell the cache manager to update all fields +@ensure_byte_strings() def dcgmUpdateAllFields(dcgm_handle, waitForUpdate): fn = dcgmFP("dcgmUpdateAllFields") ret = fn(dcgm_handle, c_int(waitForUpdate)) @@ -366,6 +431,7 @@ def dcgmUpdateAllFields(dcgm_handle, waitForUpdate): # This method is used to get the policy information +@ensure_byte_strings() def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle): fn = dcgmFP("dcgmPolicyGet") policy_array = count * dcgm_structs.c_dcgmPolicy_v1 @@ -381,6 +447,7 @@ def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle): # This method is used to set the policy information +@ensure_byte_strings() def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle): fn = dcgmFP("dcgmPolicySet") ret = fn(dcgm_handle, group_id, byref(policy), status_handle) @@ -388,56 +455,38 @@ def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle): return ret -# First parameter below is the return type +#First parameter below is the return type dcgmFieldValueEnumeration_f = CFUNCTYPE( - c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32, c_void_p -) + c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32, + c_void_p) dcgmFieldValueEntityEnumeration_f = CFUNCTYPE( - c_int32, - c_uint32, - c_uint32, - POINTER(dcgm_structs.c_dcgmFieldValue_v1), - c_int32, - c_void_p, -) - - -def dcgmGetValuesSince( - dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData 
-): + c_int32, c_uint32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), + c_int32, c_void_p) + + +@ensure_byte_strings() +def dcgmGetValuesSince(dcgm_handle, groupId, fieldGroupId, sinceTimestamp, + enumCB, userData): fn = dcgmFP("dcgmGetValuesSince") c_nextSinceTimestamp = c_int64() - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(sinceTimestamp), - byref(c_nextSinceTimestamp), - enumCB, - py_object(userData), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp), + byref(c_nextSinceTimestamp), enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return c_nextSinceTimestamp.value -def dcgmGetValuesSince_v2( - dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData -): +@ensure_byte_strings() +def dcgmGetValuesSince_v2(dcgm_handle, groupId, fieldGroupId, sinceTimestamp, + enumCB, userData): fn = dcgmFP("dcgmGetValuesSince_v2") c_nextSinceTimestamp = c_int64() - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(sinceTimestamp), - byref(c_nextSinceTimestamp), - enumCB, - py_object(userData), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp), + byref(c_nextSinceTimestamp), enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return c_nextSinceTimestamp.value +@ensure_byte_strings() def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData): fn = dcgmFP("dcgmGetLatestValues") ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData)) @@ -445,29 +494,26 @@ def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData): return ret -def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB, userData): +@ensure_byte_strings() +def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB, + userData): fn = dcgmFP("dcgmGetLatestValues_v2") ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return ret -def dcgmWatchFields( - dcgm_handle, 
groupId, fieldGroupId, updateFreq, maxKeepAge, maxKeepSamples -): +@ensure_byte_strings() +def dcgmWatchFields(dcgm_handle, groupId, fieldGroupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchFields") - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(updateFreq), + c_double(maxKeepAge), c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId): fn = dcgmFP("dcgmUnwatchFields") ret = fn(dcgm_handle, groupId, fieldGroupId) @@ -475,6 +521,7 @@ def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId): return ret +@ensure_byte_strings() def dcgmHealthSet(dcgm_handle, groupId, systems): fn = dcgmFP("dcgmHealthSet") ret = fn(dcgm_handle, groupId, systems) @@ -482,6 +529,7 @@ def dcgmHealthSet(dcgm_handle, groupId, systems): return ret +@ensure_byte_strings() def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge): params = dcgm_structs.c_dcgmHealthSetParams_v2() params.version = dcgm_structs.dcgmHealthSetParams_version2 @@ -496,6 +544,7 @@ def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge): return ret +@ensure_byte_strings() def dcgmHealthGet(dcgm_handle, groupId): c_systems = c_int32() fn = dcgmFP("dcgmHealthGet") @@ -504,9 +553,10 @@ def dcgmHealthGet(dcgm_handle, groupId): return c_systems.value -def dcgmHealthCheck( - dcgm_handle, groupId, version=dcgm_structs.dcgmHealthResponse_version4 -): +@ensure_byte_strings() +def dcgmHealthCheck(dcgm_handle, + groupId, + version=dcgm_structs.dcgmHealthResponse_version4): if version != dcgm_structs.dcgmHealthResponse_version4: dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH) @@ -518,13 +568,16 @@ def dcgmHealthCheck( return c_results -def dcgmPolicyRegister(dcgm_handle, groupId, condition, beginCallback, 
finishCallback): +@ensure_byte_strings() +def dcgmPolicyRegister(dcgm_handle, groupId, condition, beginCallback, + finishCallback): fn = dcgmFP("dcgmPolicyRegister") ret = fn(dcgm_handle, groupId, condition, beginCallback, finishCallback) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmPolicyUnregister(dcgm_handle, groupId, condition): fn = dcgmFP("dcgmPolicyUnregister") ret = fn(dcgm_handle, groupId, condition) @@ -532,6 +585,7 @@ def dcgmPolicyUnregister(dcgm_handle, groupId, condition): return ret +@ensure_byte_strings() def dcgmPolicyTrigger(dcgm_handle): fn = dcgmFP("dcgmPolicyTrigger") ret = fn(dcgm_handle) @@ -549,32 +603,34 @@ def helperDiagCheckReturn(ret, response): info = "%s" % response.systemError.msg e.SetAdditionalInfo(info) - raise e # pylint: disable=E0710 + raise e else: raise return response -def dcgmActionValidate_v2( - dcgm_handle, runDiagInfo, runDiagVersion=dcgm_structs.dcgmRunDiag_version6 -): - response = dcgm_structs.c_dcgmDiagResponse_v6() +@ensure_byte_strings() +def dcgmActionValidate_v2(dcgm_handle, + runDiagInfo, + runDiagVersion=dcgm_structs.dcgmRunDiag_version7): + response = dcgm_structs.c_dcgmDiagResponse_v8() runDiagInfo.version = runDiagVersion - response.version = dcgm_structs.dcgmDiagResponse_version6 + response.version = dcgm_structs.dcgmDiagResponse_version8 fn = dcgmFP("dcgmActionValidate_v2") ret = fn(dcgm_handle, byref(runDiagInfo), byref(response)) return helperDiagCheckReturn(ret, response) +@ensure_byte_strings() def dcgmActionValidate(dcgm_handle, group_id, validate): - response = dcgm_structs.c_dcgmDiagResponse_v6() - response.version = dcgm_structs.dcgmDiagResponse_version6 + response = dcgm_structs.c_dcgmDiagResponse_v8() + response.version = dcgm_structs.dcgmDiagResponse_version8 # Put the group_id and validate into a dcgmRunDiag struct - runDiagInfo = dcgm_structs.c_dcgmRunDiag_v6() - runDiagInfo.version = dcgm_structs.dcgmRunDiag_version6 + runDiagInfo = 
dcgm_structs.c_dcgmRunDiag_v7() + runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 runDiagInfo.validate = validate runDiagInfo.groupId = group_id @@ -584,28 +640,27 @@ def dcgmActionValidate(dcgm_handle, group_id, validate): return helperDiagCheckReturn(ret, response) +@ensure_byte_strings() def dcgmRunDiagnostic(dcgm_handle, group_id, diagLevel): - response = dcgm_structs.c_dcgmDiagResponse_v6() - response.version = dcgm_structs.dcgmDiagResponse_version6 + response = dcgm_structs.c_dcgmDiagResponse_v8() + response.version = dcgm_structs.dcgmDiagResponse_version8 fn = dcgmFP("dcgmRunDiagnostic") ret = fn(dcgm_handle, group_id, diagLevel, byref(response)) return helperDiagCheckReturn(ret, response) -def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples): +@ensure_byte_strings() +def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchPidFields") - ret = fn( - dcgm_handle, - groupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge), + c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmGetPidInfo(dcgm_handle, groupId, pid): fn = dcgmFP("dcgmGetPidInfo") pidInfo = dcgm_structs.c_dcgmPidInfo_v2() @@ -618,6 +673,7 @@ def dcgmGetPidInfo(dcgm_handle, groupId, pid): return pidInfo +@ensure_byte_strings() def dcgmGetDeviceTopology(dcgm_handle, gpuId): devtopo = dcgm_structs.c_dcgmDeviceTopology_v1() fn = dcgmFP("dcgmGetDeviceTopology") @@ -626,6 +682,7 @@ def dcgmGetDeviceTopology(dcgm_handle, gpuId): return devtopo +@ensure_byte_strings() def dcgmGetGroupTopology(dcgm_handle, groupId): grouptopo = dcgm_structs.c_dcgmGroupTopology_v1() fn = dcgmFP("dcgmGetGroupTopology") @@ -634,19 +691,17 @@ def dcgmGetGroupTopology(dcgm_handle, groupId): return grouptopo -def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, 
maxKeepSamples): +@ensure_byte_strings() +def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchJobFields") - ret = fn( - dcgm_handle, - groupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge), + c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmJobStartStats(dcgm_handle, groupId, jobid): fn = dcgmFP("dcgmJobStartStats") ret = fn(dcgm_handle, groupId, jobid) @@ -654,6 +709,7 @@ def dcgmJobStartStats(dcgm_handle, groupId, jobid): return ret +@ensure_byte_strings() def dcgmJobStopStats(dcgm_handle, jobid): fn = dcgmFP("dcgmJobStopStats") ret = fn(dcgm_handle, jobid) @@ -661,6 +717,7 @@ def dcgmJobStopStats(dcgm_handle, jobid): return ret +@ensure_byte_strings() def dcgmJobGetStats(dcgm_handle, jobid): fn = dcgmFP("dcgmJobGetStats") jobInfo = dcgm_structs.c_dcgmJobInfo_v3() @@ -672,6 +729,7 @@ def dcgmJobGetStats(dcgm_handle, jobid): return jobInfo +@ensure_byte_strings() def dcgmJobRemove(dcgm_handle, jobid): fn = dcgmFP("dcgmJobRemove") ret = fn(dcgm_handle, jobid) @@ -679,6 +737,7 @@ def dcgmJobRemove(dcgm_handle, jobid): return ret +@ensure_byte_strings() def dcgmJobRemoveAll(dcgm_handle): fn = dcgmFP("dcgmJobRemoveAll") ret = fn(dcgm_handle) @@ -686,13 +745,7 @@ def dcgmJobRemoveAll(dcgm_handle): return ret -def dcgmIntrospectToggleState(dcgm_handle, enabledState): - fn = dcgmFP("dcgmIntrospectToggleState") - ret = fn(dcgm_handle, enabledState) - dcgm_structs._dcgmCheckReturn(ret) - return ret - - +@ensure_byte_strings() def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True): fn = dcgmFP("dcgmIntrospectGetHostengineMemoryUsage") @@ -704,6 +757,7 @@ def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True): return memInfo +@ensure_byte_strings() def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, 
waitIfNoData=True): fn = dcgmFP("dcgmIntrospectGetHostengineCpuUtilization") @@ -715,88 +769,45 @@ def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True): return cpuUtil -def dcgmIntrospectGetFieldsExecTime(dcgm_handle, introspectContext, waitIfNoData=True): - fn = dcgmFP("dcgmIntrospectGetFieldsExecTime") - - execTime = dcgm_structs.c_dcgmIntrospectFullFieldsExecTime_v2() - execTime.version = dcgm_structs.dcgmIntrospectFullFieldsExecTime_version2 - - ret = fn(dcgm_handle, byref(introspectContext), byref(execTime), waitIfNoData) - dcgm_structs._dcgmCheckReturn(ret) - return execTime - - -def dcgmIntrospectGetFieldsMemoryUsage( - dcgm_handle, introspectContext, waitIfNoData=True -): - fn = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage") - - memInfo = dcgm_structs.c_dcgmIntrospectFullMemory_v1() - memInfo.version = dcgm_structs.dcgmIntrospectFullMemory_version1 - - ret = fn(dcgm_handle, byref(introspectContext), byref(memInfo), waitIfNoData) - dcgm_structs._dcgmCheckReturn(ret) - return memInfo - - -def dcgmIntrospectUpdateAll(dcgmHandle, waitForUpdate): - fn = dcgmFP("dcgmIntrospectUpdateAll") - ret = fn(dcgmHandle, c_int(waitForUpdate)) - dcgm_structs._dcgmCheckReturn(ret) - - +@ensure_byte_strings() def dcgmEntityGetLatestValues(dcgmHandle, entityGroup, entityId, fieldIds): fn = dcgmFP("dcgmEntityGetLatestValues") field_values = (dcgm_structs.c_dcgmFieldValue_v1 * len(fieldIds))() id_values = (c_uint16 * len(fieldIds))(*fieldIds) - ret = fn( - dcgmHandle, - c_uint(entityGroup), - dcgm_fields.c_dcgm_field_eid_t(entityId), - id_values, - c_uint(len(fieldIds)), - field_values, - ) + ret = fn(dcgmHandle, c_uint(entityGroup), + dcgm_fields.c_dcgm_field_eid_t(entityId), id_values, + c_uint(len(fieldIds)), field_values) dcgm_structs._dcgmCheckReturn(ret) return field_values +@ensure_byte_strings() def dcgmEntitiesGetLatestValues(dcgmHandle, entities, fieldIds, flags): fn = dcgmFP("dcgmEntitiesGetLatestValues") numFvs = len(fieldIds) * len(entities) 
field_values = (dcgm_structs.c_dcgmFieldValue_v2 * numFvs)() - entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t * len(entities))(*entities) + entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t * + len(entities))(*entities) field_id_values = (c_uint16 * len(fieldIds))(*fieldIds) - ret = fn( - dcgmHandle, - entities_values, - c_uint(len(entities)), - field_id_values, - c_uint(len(fieldIds)), - flags, - field_values, - ) + ret = fn(dcgmHandle, entities_values, c_uint(len(entities)), + field_id_values, c_uint(len(fieldIds)), flags, field_values) dcgm_structs._dcgmCheckReturn(ret) return field_values +@ensure_byte_strings() def dcgmSelectGpusByTopology(dcgmHandle, inputGpuIds, numGpus, hintFlags): fn = dcgmFP("dcgmSelectGpusByTopology") outputGpuIds = c_int64() - ret = fn( - dcgmHandle, - c_uint64(inputGpuIds), - c_uint32(numGpus), - byref(outputGpuIds), - c_uint64(hintFlags), - ) + ret = fn(dcgmHandle, c_uint64(inputGpuIds), c_uint32(numGpus), + byref(outputGpuIds), c_uint64(hintFlags)) dcgm_structs._dcgmCheckReturn(ret) return outputGpuIds -def dcgmGetFieldSummary( - dcgmHandle, fieldId, entityGroupType, entityId, summaryMask, startTime, endTime -): +@ensure_byte_strings() +def dcgmGetFieldSummary(dcgmHandle, fieldId, entityGroupType, entityId, + summaryMask, startTime, endTime): fn = dcgmFP("dcgmGetFieldSummary") request = dcgm_structs.c_dcgmFieldSummaryRequest_v1() request.version = dcgm_structs.dcgmFieldSummaryRequest_version1 @@ -811,13 +822,15 @@ def dcgmGetFieldSummary( return request -def dcgmModuleBlacklist(dcgmHandle, moduleId): - fn = dcgmFP("dcgmModuleBlacklist") +@ensure_byte_strings() +def dcgmModuleDenylist(dcgmHandle, moduleId): + fn = dcgmFP("dcgmModuleDenylist") ret = fn(dcgmHandle, c_uint32(moduleId)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmModuleGetStatuses(dcgmHandle): moduleStatuses = dcgm_structs.c_dcgmModuleGetStatuses_v1() moduleStatuses.version = dcgm_structs.dcgmModuleGetStatuses_version1 @@ 
-827,45 +840,18 @@ def dcgmModuleGetStatuses(dcgmHandle): return moduleStatuses -def dcgmProfGetSupportedMetricGroups(dcgmHandle, groupId): - msg = dcgm_structs.c_dcgmProfGetMetricGroups_v2() - msg.version = dcgm_structs.dcgmProfGetMetricGroups_version1 - msg.groupId = groupId +@ensure_byte_strings() +def dcgmProfGetSupportedMetricGroups(dcgmHandle, gpuId): + msg = dcgm_structs.c_dcgmProfGetMetricGroups_v3() + msg.version = dcgm_structs.dcgmProfGetMetricGroups_version3 + msg.gpuId = gpuId fn = dcgmFP("dcgmProfGetSupportedMetricGroups") ret = fn(dcgmHandle, byref(msg)) dcgm_structs._dcgmCheckReturn(ret) return msg -def dcgmProfWatchFields( - dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge, maxKeepSamples -): - msg = dcgm_structs.c_dcgmProfWatchFields_v1() - msg.version = dcgm_structs.dcgmProfWatchFields_version1 - msg.groupId = groupId - msg.updateFreq = updateFreq - msg.maxKeepAge = maxKeepAge - msg.maxKeepSamples = maxKeepSamples - msg.numFieldIds = c_uint32(len(fieldIds)) - for i, fieldId in enumerate(fieldIds): - msg.fieldIds[i] = fieldId - - fn = dcgmFP("dcgmProfWatchFields") - ret = fn(dcgmHandle, byref(msg)) - dcgm_structs._dcgmCheckReturn(ret) - return msg - - -def dcgmProfUnwatchFields(dcgmHandle, groupId): - msg = dcgm_structs.c_dcgmProfUnwatchFields_v1() - msg.version = dcgm_structs.dcgmProfUnwatchFields_version1 - msg.groupId = groupId - fn = dcgmFP("dcgmProfUnwatchFields") - ret = fn(dcgmHandle, byref(msg)) - dcgm_structs._dcgmCheckReturn(ret) - return msg - - +@ensure_byte_strings() def dcgmProfPause(dcgmHandle): fn = dcgmFP("dcgmProfPause") ret = fn(dcgmHandle) @@ -873,6 +859,7 @@ def dcgmProfPause(dcgmHandle): return ret +@ensure_byte_strings() def dcgmProfResume(dcgmHandle): fn = dcgmFP("dcgmProfResume") ret = fn(dcgmHandle) @@ -880,6 +867,7 @@ def dcgmProfResume(dcgmHandle): return ret +@ensure_byte_strings() def dcgmVersionInfo(): msg = dcgm_structs.c_dcgmVersionInfo_v2() msg.version = dcgm_structs.dcgmVersionInfo_version2 @@ -889,10 +877,11 
@@ def dcgmVersionInfo(): return msg +@ensure_byte_strings() def dcgmHostengineIsHealthy(dcgmHandle): heHealth = dcgm_structs.c_dcgmHostengineHealth_v1() heHealth.version = dcgm_structs.dcgmHostengineHealth_version1 fn = dcgmFP("dcgmHostengineIsHealthy") ret = fn(dcgmHandle, byref(heHealth)) dcgm_structs._dcgmCheckReturn(ret) - return heHealth + return heHealth \ No newline at end of file diff --git a/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py new file mode 100644 index 000000000..d3355c556 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py @@ -0,0 +1,369 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys +import subprocess +import signal +import os +import re +import sys + +dir_path = os.path.dirname(os.path.realpath(__file__)) +parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir)) +sys.path.insert(0, parent_dir_path) + +import model_analyzer.monitor.dcgm.dcgm_fields_collectd as dcgm_fields_collectd +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import threading +from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader + +if 'DCGM_TESTING_FRAMEWORK' in os.environ: + try: + import collectd_tester_api as collectd + except: + import collectd +else: + import collectd + +# Set default values for the hostname and the library path +g_dcgmLibPath = '/usr/lib' +g_dcgmHostName = 'localhost' + +# Add overriding through the environment instead of hard coded. +if 'DCGM_HOSTNAME' in os.environ: + g_dcgmHostName = os.environ['DCGM_HOSTNAME'] + +if 'DCGMLIBPATH' in os.environ: + g_dcgmLibPath = os.environ['DCGMLIBPATH'] + +c_ONE_SEC_IN_USEC = 1000000 + +g_intervalSec = 10 # Default + +g_dcgmIgnoreFields = [dcgm_fields.DCGM_FI_DEV_UUID] # Fields not to publish + +g_publishFieldIds = [ + dcgm_fields.DCGM_FI_DEV_UUID, #Needed for plugin instance + dcgm_fields.DCGM_FI_DEV_POWER_USAGE, + dcgm_fields.DCGM_FI_DEV_GPU_TEMP, + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, + dcgm_fields.DCGM_FI_DEV_GPU_UTIL, + dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, + dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, + dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_FREE, + dcgm_fields.DCGM_FI_DEV_FB_USED, + dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, + 
dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, + dcgm_fields.DCGM_FI_DEV_XID_ERRORS, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_MEM_CLOCK, + dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP, + dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, + dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL, + dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, + dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT, + dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT +] + +g_fieldIntervalMap = None +g_parseRegEx = None +g_fieldRegEx = None + +# We build up a regex to match field IDs. These can be numeric IDs, or +# names. We start with field_regex that matches either as a string (as +# well as names that might start with digits, but we do not worry about +# this over-generation of valid IDs at this point). +# +# Basically a field is an integral number or a textual name. A field +# list is a field, or a list of fields separated by commas and enclosed +# in parentheses. A field list may be optionally followed by a colon, +# indicating a possible non-default interval if also followed by a +# floating point interval value. This is a complete field list. +# Multiple complete field lists may appear, separated by commas. +# +# For example: (1001,tensor_active):5,1002:10 +# +# This specifies that fields 1001 and tensor_active are to be sampled +# at a rate of every 5 seconds, and 1002 every ten seconds. +# +# For example: (1001,tensor_active):5,1002: +# +# This is the same, but field 1002 is to be sampled at the default rate +# (and the colon is entirely unnecessary, but not illegal). + +field_regex = r"[0-9a-zA-Z_]+" +g_fieldRegEx = re.compile("((" + field_regex + "),?)") + +# We now generate a list of field regular expressions, separated by a +# comma, and enclosed in parentheses, for grouping.

fields_regex = r"\(" + field_regex + "(," + field_regex + ")*" + r"\)"

# This is an optional interval specification, allowing an optional :,
# followed by an optional floating point dcgm sampling interval. If any
# are missing, the default collectd sampling interval is used.

interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?"

# Here, we combine a field regex or field list regex with an optional
# interval regex. Multiple of these may appear in succession.
#
# parse_config() reads group 2 (a single field or a parenthesised field
# list) and group 5 (the optional ":<interval>" suffix). That numbering
# assumes field_regex itself contains no capture groups -- TODO confirm
# against its definition earlier in this file.

g_parseRegEx = re.compile("((" + field_regex + "|(" + fields_regex + "))" +
                          interval_regex + ")")


class DcgmCollectdPlugin(DcgmReader):
    """DcgmReader subclass that publishes DCGM field samples to collectd.

    The reader is configured from the module-level settings
    (g_publishFieldIds, g_dcgmIgnoreFields, g_intervalSec and
    g_fieldIntervalMap), so parse_config() must have run before an instance
    is created for configured values to take effect.
    """

    ###########################################################################
    def __init__(self):
        collectd.debug(
            'Initializing DCGM with interval={}s'.format(g_intervalSec))
        DcgmReader.__init__(self,
                            fieldIds=g_publishFieldIds,
                            ignoreList=g_dcgmIgnoreFields,
                            fieldGroupName='collectd_plugin',
                            updateFrequency=g_intervalSec * c_ONE_SEC_IN_USEC,
                            fieldIntervalMap=g_fieldIntervalMap)

    ###########################################################################
    def CustomDataHandler(self, fvs):
        """Dispatch the collected field values to collectd.

        fvs is indexed as fvs[gpuId][fieldId] and yields a sequence of
        samples exposing .ts, .value and .isBlank. For every field, samples
        that are less than one second older than the next newer kept sample
        are marked blank (val.isBlank is set to True on the sample itself)
        and are therefore not dispatched; the newest sample is always kept.
        """
        value = collectd.Values(type='gauge')  # pylint: disable=no-member
        value.plugin = 'dcgm_collectd'

        for gpuId in list(fvs.keys()):
            gpuFv = fvs[gpuId]

            uuid = self.m_gpuIdToUUId[gpuId]
            collectd.debug('CustomDataHandler uuid: ' + '%s' % (uuid) + '\n')
            value.plugin_instance = '%s' % (uuid)

            typeInstance = str(gpuId)

            for fieldId in list(gpuFv.keys()):
                # Skip ignore list
                if fieldId in self.m_dcgmIgnoreFields:
                    continue

                fieldTag = self.m_fieldIdToInfo[fieldId].tag
                lastValTime = float("inf")

                # Filter out times too close together (< 1.0 sec) but always
                # include latest one. Walk newest-to-oldest so that the
                # newest sample is the reference point.
                for val in gpuFv[fieldId][::-1]:
                    # Skip blank values. Otherwise, we'd have to insert a
                    # placeholder blank value based on the fieldId
                    if val.isBlank:
                        continue

                    # Convert the timestamp to seconds. This is Python 3
                    # true division, so the result keeps its fractional
                    # part (nothing is rounded down here).
                    valTimeSec1970 = val.ts / c_ONE_SEC_IN_USEC
                    if (lastValTime - valTimeSec1970) < 1.0:
                        collectd.debug(
                            "DCGM sample for field ID %d too soon at %f, last one sampled at %f"
                            % (fieldId, valTimeSec1970, lastValTime))
                        val.isBlank = True  # Filter this one out
                        continue

                    lastValTime = valTimeSec1970

                # Counts dispatched (non-blank) samples, for the debug log.
                i = 0

                for val in gpuFv[fieldId]:
                    # Skip blank values, including the ones filtered above.
                    if val.isBlank:
                        continue

                    # Seconds as a float (see the note on division above).
                    valTimeSec1970 = val.ts / c_ONE_SEC_IN_USEC
                    valueArray = [
                        val.value,
                    ]
                    value.dispatch(type=fieldTag,
                                   type_instance=typeInstance,
                                   time=valTimeSec1970,
                                   values=valueArray,
                                   plugin=value.plugin)

                    collectd.debug(
                        " gpuId %d, tag %s, sample %d, value %s, time %s" %
                        (gpuId, fieldTag, i, str(val.value), str(val.ts)))  # pylint: disable=no-member
                    i += 1

    ###########################################################################
    def LogInfo(self, msg):
        """Forward an informational message to the collectd log."""
        collectd.info(msg)  # pylint: disable=no-member

    ###########################################################################
    def LogError(self, msg):
        """Forward an error message to the collectd log."""
        collectd.error(msg)  # pylint: disable=no-member


###############################################################################
##### Parse supplied collectd configuration object.
###############################################################################
def parse_config(config):
    """Read 'Interval' and 'FieldIds' entries from a collectd config object.

    Updates the module globals g_intervalSec (seconds, float) and
    g_fieldIntervalMap, which maps a sampling interval in microseconds to
    the list of fields to sample at that interval. g_fieldIntervalMap is
    rebuilt from scratch on every call.

    'Interval' is resolved before any 'FieldIds' entry is parsed, so fields
    without an explicit interval use the configured default no matter where
    'Interval' appears in the block. If 'Interval' is given several times,
    the last one wins.
    """
    global g_intervalSec
    global g_fieldIntervalMap

    g_fieldIntervalMap = {}

    # Pass 1: settle the default interval first. Parsing in a single pass
    # made FieldIds entries that precede 'Interval' silently fall back to
    # the previous (built-in) default.
    for node in config.children:
        if node.key == 'Interval':
            g_intervalSec = float(node.values[0])

    default_interval = int(g_intervalSec * c_ONE_SEC_IN_USEC)

    # Pass 2: parse every FieldIds entry.
    for node in config.children:
        if node.key != 'FieldIds':
            continue

        # And we parse out the field ID list with this regex.
        for field_set in g_parseRegEx.finditer(node.values[0]):
            # We get the list of fields...
            fields = field_set.group(2)

            # ... and the optional interval.
            interval_str = field_set.group(5)

            # We figure out if the default collectd sampling interval is
            # to be used, or a different one. A bare ":" is legal and
            # means "default".
            if interval_str is None or interval_str == ":":
                interval = default_interval
            else:
                interval = int(float(interval_str[1:]) *
                               c_ONE_SEC_IN_USEC)  # strip :

            # We keep a set of fields for each unique interval
            interval_fields = g_fieldIntervalMap.setdefault(interval, [])

            # Here we parse out either multiple fields sharing an
            # interval, or a single field.
            if fields[0:1] == "(":  # a true field set
                for field_group in g_fieldRegEx.finditer(fields[1:-1]):
                    # We map any field names to field numbers, and add
                    # them to the list for the interval
                    interval_fields.append(
                        dcgm_fields_collectd.GetFieldByName(
                            field_group.group(2)))
            else:  # just one field
                # Map field name to number.
                interval_fields.append(
                    dcgm_fields_collectd.GetFieldByName(fields))


###############################################################################
##### Wrapper the Class methods for collectd callbacks
###############################################################################
def config_dcgm(config=None):
    """
    collectd config for dcgm is in the form of a dcgm.conf file, usually
    installed in /etc/collectd/collectd.conf.d/dcgm.conf.

    An example is:

    LoadPlugin python
    <Plugin python>
        ModulePath "/usr/lib64/collectd/dcgm"
        LogTraces true
        Interactive false
        Import "dcgm_collectd_plugin"
        <Module dcgm_collectd_plugin>
            Interval 2
            FieldIds "(1001,tensor_active):5,1002:10,1004:.1,1010:"
            FieldIds "1007"
        </Module>
    </Plugin>

    ModulePath indicates where the plugin and supporting files are installed
    (generally copied from /usr/local/dcgm/bindings/python3).

    Interval is the default collectd sampling interval in seconds.

    FieldIds may appear several times. One is either a field ID by name or
    number. A field ID list is either a single field ID or a list of same,
    separated by commas (,) and bounded by parenthesis ( ( and ) ). Each field
    ID list can be followed by an optional colon (:) and a floating point
    DCGM sampling interval. If no sampling interval is specified the default
    collectd sampling interval is used (and the colon is redundant but not
    illegal). Multiple field ID lists can appear on one FieldIds entry,
    separated by commas (,). FieldIDs are strings and must be enclosed in
    quotes ("). Multiple FieldIds lines are permitted.

    DCGM will sample the fields at the interval(s) indicated, and collectd will
    collect the samples asynchronously at the Interval specified. Because this
    is asynchronous sometimes one less than expected will be collected and other
    times one more than expected will be collected.
    """

    # If we throw an exception here, collectd config will terminate loading the
    # plugin.
    if config is not None:
        parse_config(config)

    # Register the read function with the default collectd sampling interval.
    collectd.register_read(read_dcgm, interval=g_intervalSec)  # pylint: disable=no-member


###############################################################################
def init_dcgm():
    """collectd init callback: create and initialise the plugin instance."""
    global g_dcgmCollectd

    # restore default SIGCHLD behavior to avoid exceptions with new processes
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    g_dcgmCollectd = DcgmCollectdPlugin()
    g_dcgmCollectd.Init()


###############################################################################
def shutdown_dcgm():
    """collectd shutdown callback: shut the plugin instance down."""
    g_dcgmCollectd.Shutdown()


###############################################################################
def read_dcgm(data=None):
    """collectd read callback: process one round of samples.

    The data argument is accepted but not used.
    """
    g_dcgmCollectd.Process()


def register_collectd_callbacks():
    """Register the config, init and shutdown callbacks with collectd."""
    collectd.register_config(config_dcgm, name="dcgm_collectd_plugin")  # pylint: disable=no-member
    # config_dcgm registers read since it needs to parse the sampling interval.
    collectd.register_init(init_dcgm)  # pylint: disable=no-member
    collectd.register_shutdown(shutdown_dcgm)  # pylint: disable=no-member


###############################################################################
##### Main
###############################################################################
register_collectd_callbacks()

# ---- patch header of the next file, kept verbatim from the original diff ----
# diff --git a/model_analyzer/monitor/dcgm/dcgm_errors.py b/model_analyzer/monitor/dcgm/dcgm_errors.py
# new file mode 100644
# index 000000000..e52f3b114
# --- /dev/null
# +++ b/model_analyzer/monitor/dcgm/dcgm_errors.py
# @@ -0,0 +1,395 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs

# Error codes. The values are consecutive integers starting at 0;
# DCGM_FR_ERROR_SENTINEL marks the end of the range and must stay last.
# NOTE(review): this file defines no *_MSG / *_NEXT constants for the EUD
# codes (89-95) or for DCGM_FR_PAUSE_RESUME_FAILED (97) -- confirm that is
# intentional.
DCGM_FR_OK = 0  # No error
DCGM_FR_UNKNOWN = 1  # Unknown error code
DCGM_FR_UNRECOGNIZED = 2  # Unrecognized error code
DCGM_FR_PCI_REPLAY_RATE = 3  # Unacceptable rate of PCI errors
DCGM_FR_VOLATILE_DBE_DETECTED = 4  # Uncorrectable volatile double bit error
DCGM_FR_VOLATILE_SBE_DETECTED = 5  # Unacceptable rate of volatile single bit errors
DCGM_FR_PENDING_PAGE_RETIREMENTS = 6  # Pending page retirements detected
DCGM_FR_RETIRED_PAGES_LIMIT = 7  # Unacceptable total page retirements detected
DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8  # Unacceptable total page retirements due to uncorrectable errors
DCGM_FR_CORRUPT_INFOROM = 9  # Corrupt inforom found
DCGM_FR_CLOCK_THROTTLE_THERMAL = 10  # Clocks being throttled due to overheating
DCGM_FR_POWER_UNREADABLE = 11  # Cannot get a reading for power from NVML
DCGM_FR_CLOCK_THROTTLE_POWER = 12  # Clock being throttled due to power restrictions
DCGM_FR_NVLINK_ERROR_THRESHOLD = 13  # Unacceptable rate of NVLink errors
DCGM_FR_NVLINK_DOWN = 14  # NVLink is down
DCGM_FR_NVSWITCH_FATAL_ERROR = 15  # Fatal errors on the NVSwitch
DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16  # Non-fatal errors on the NVSwitch
DCGM_FR_NVSWITCH_DOWN = 17  # NVSwitch is down
DCGM_FR_NO_ACCESS_TO_FILE = 18  # Cannot access a file
DCGM_FR_NVML_API = 19  # Error occurred on an NVML API
DCGM_FR_DEVICE_COUNT_MISMATCH = 20  # Disagreement in GPU count between /dev and NVML
DCGM_FR_BAD_PARAMETER = 21  # Bad parameter passed to API
DCGM_FR_CANNOT_OPEN_LIB = 22  # Cannot open a library that must be accessed
DCGM_FR_DENYLISTED_DRIVER = 23  # A driver on the denylist (nouveau) is active
DCGM_FR_NVML_LIB_BAD = 24  # The NVML library is missing expected functions
DCGM_FR_GRAPHICS_PROCESSES = 25  # Graphics processes are active on this GPU
DCGM_FR_HOSTENGINE_CONN = 26  # Unstable connection to nv-hostengine (daemonized DCGM)
DCGM_FR_FIELD_QUERY = 27  # Error querying a field from DCGM
DCGM_FR_BAD_CUDA_ENV = 28  # The environment has variables that hurt CUDA
DCGM_FR_PERSISTENCE_MODE = 29  # Persistence mode is disabled
DCGM_FR_LOW_BANDWIDTH = 30  # The bandwidth is unacceptably low
DCGM_FR_HIGH_LATENCY = 31  # Latency is too high
DCGM_FR_CANNOT_GET_FIELD_TAG = 32  # Cannot find a tag for a field
DCGM_FR_FIELD_VIOLATION = 33  # The value for the specified error field is above 0
DCGM_FR_FIELD_THRESHOLD = 34  # The value for the specified field is above the threshold
DCGM_FR_FIELD_VIOLATION_DBL = 35  # The value for the specified error field is above 0
DCGM_FR_FIELD_THRESHOLD_DBL = 36  # The value for the specified field is above the threshold
DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37  # Field type cannot be supported
DCGM_FR_FIELD_THRESHOLD_TS = 38  # The value for the specified field is above the threshold
DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39  # The value for the specified field is above the threshold
DCGM_FR_THERMAL_VIOLATIONS = 40  # Thermal violations detected
DCGM_FR_THERMAL_VIOLATIONS_TS = 41  # Thermal violations detected with a timestamp
DCGM_FR_TEMP_VIOLATION = 42  # Temperature is too high
DCGM_FR_THROTTLING_VIOLATION = 43  # Non-benign clock throttling is occurring
DCGM_FR_INTERNAL = 44  # An internal error was detected
DCGM_FR_PCIE_GENERATION = 45  # PCIe generation is too low
DCGM_FR_PCIE_WIDTH = 46  # PCIe width is too low
DCGM_FR_ABORTED = 47  # Test was aborted by a user signal
DCGM_FR_TEST_DISABLED = 48  # This test is disabled for this GPU
DCGM_FR_CANNOT_GET_STAT = 49  # Cannot get telemetry for a needed value
DCGM_FR_STRESS_LEVEL = 50  # Stress level is too low (bad performance)
DCGM_FR_CUDA_API = 51  # Error calling the specified CUDA API
DCGM_FR_FAULTY_MEMORY = 52  # Faulty memory detected on this GPU
DCGM_FR_CANNOT_SET_WATCHES = 53  # Unable to set field watches in DCGM
DCGM_FR_CUDA_UNBOUND = 54  # CUDA context is no longer bound
DCGM_FR_ECC_DISABLED = 55  # ECC memory is disabled right now
DCGM_FR_MEMORY_ALLOC = 56  # Cannot allocate memory
DCGM_FR_CUDA_DBE = 57  # CUDA detected unrecoverable double-bit error
DCGM_FR_MEMORY_MISMATCH = 58  # Memory error detected
DCGM_FR_CUDA_DEVICE = 59  # No CUDA device discoverable for existing GPU
DCGM_FR_ECC_UNSUPPORTED = 60  # ECC memory is unsupported by this SKU
DCGM_FR_ECC_PENDING = 61  # ECC memory is in a pending state
DCGM_FR_MEMORY_BANDWIDTH = 62  # Memory bandwidth is too low
DCGM_FR_TARGET_POWER = 63  # Cannot hit the target power draw
DCGM_FR_API_FAIL = 64  # The specified API call failed
DCGM_FR_API_FAIL_GPU = 65  # The specified API call failed for the specified GPU
DCGM_FR_CUDA_CONTEXT = 66  # Cannot create a CUDA context on this GPU
DCGM_FR_DCGM_API = 67  # DCGM API failure
DCGM_FR_CONCURRENT_GPUS = 68  # Need multiple GPUs to run this test
DCGM_FR_TOO_MANY_ERRORS = 69  # More errors than fit in the return struct
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70  # More than 100 CRC errors are happening per second
DCGM_FR_NVLINK_ERROR_CRITICAL = 71  # NVLink error for a field that should always be 0
DCGM_FR_ENFORCED_POWER_LIMIT = 72  # The enforced power limit is too low to hit the target
DCGM_FR_MEMORY_ALLOC_HOST = 73  # Cannot allocate memory on the host
DCGM_FR_GPU_OP_MODE = 74  # Bad GPU operating mode for running plugin
DCGM_FR_NO_MEMORY_CLOCKS = 75  # No memory clocks with the needed MHz were found
DCGM_FR_NO_GRAPHICS_CLOCKS = 76  # No graphics clocks with the needed MHz were found
DCGM_FR_HAD_TO_RESTORE_STATE = 77  # Note that we had to restore a GPU's state
DCGM_FR_L1TAG_UNSUPPORTED = 78  # L1TAG test is unsupported by this SKU
DCGM_FR_L1TAG_MISCOMPARE = 79  # L1TAG test failed on a miscompare
DCGM_FR_ROW_REMAP_FAILURE = 80  # Row remapping failed (Ampere or newer GPUs)
DCGM_FR_UNCONTAINED_ERROR = 81  # Uncontained error - XID 95
DCGM_FR_EMPTY_GPU_LIST = 82  # No GPU information given to plugin
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83  # Pending page retirements due to a DBE
DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84  # Uncorrectable row remapping
DCGM_FR_PENDING_ROW_REMAP = 85  # Row remapping is pending
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86  # P2P copy test detected an error writing to this GPU
DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87  # P2P copy test detected an error writing from this GPU
DCGM_FR_NVSWITCH_NVLINK_DOWN = 88  # An NVLink is down
DCGM_FR_EUD_BINARY_PERMISSIONS = 89  # EUD binary permissions are incorrect
DCGM_FR_EUD_NON_ROOT_USER = 90  # EUD plugin is not running as root
DCGM_FR_EUD_SPAWN_FAILURE = 91  # EUD plugin failed to spawn the EUD binary
DCGM_FR_EUD_TIMEOUT = 92  # EUD plugin timed out
DCGM_FR_EUD_ZOMBIE = 93  # EUD process remains running after the plugin considers it finished
DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94  # EUD process exited with a non-zero exit code
DCGM_FR_EUD_TEST_FAILED = 95  # EUD test failed
DCGM_FR_FILE_CREATE_PERMISSIONS = 96  # We cannot write a file in this directory.
DCGM_FR_PAUSE_RESUME_FAILED = 97  # Pause/Resume failed
DCGM_FR_ERROR_SENTINEL = 98  # MUST BE THE LAST ERROR CODE

# Shared text fragments reused by several of the *_NEXT suggestions below.
# Standard message for running a field diagnostic
TRIAGE_RUN_FIELD_DIAG_MSG = "Run a field diagnostic on the GPU."
DEBUG_COOLING_MSG = "Verify that the cooling on this machine is functional, including external, thermal "\
                    "material interface, fans, and any other components."
BUG_REPORT_MSG = "Please capture an nvidia-bug-report and send it to NVIDIA."

# Define DCGM error priorities
DCGM_ERROR_MONITOR = 0  # Can perform workload, but needs to be monitored.
DCGM_ERROR_ISOLATE = 1  # Cannot perform workload. GPU should be isolated.
DCGM_ERROR_UNKNOWN = 2  # This error code is not recognized

# Messages for the error codes. All messages must be defined in the
# ERROR_CODE_MSG <msg> format, where <msg> is the actual message.
# The messages are printf-style format strings; where present, the comment
# above a message lists its arguments in order.

DCGM_FR_OK_MSG = "The operation completed successfully."
DCGM_FR_UNKNOWN_MSG = "Unknown error."
DCGM_FR_UNRECOGNIZED_MSG = "Unrecognized error code."
# printf-style message templates. The "args:" comment above each template
# lists its format arguments in order.

# args: replay limit, gpu id, replay errors detected
DCGM_FR_PCI_REPLAY_RATE_MSG = (
    "Detected more than %u PCIe replays per minute for GPU %u : %d")
# args: dbes detected, gpu id
DCGM_FR_VOLATILE_DBE_DETECTED_MSG = (
    "Detected %d volatile double-bit ECC error(s) in GPU %u.")
# args: sbe limit, gpu id, sbes detected
DCGM_FR_VOLATILE_SBE_DETECTED_MSG = (
    "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld")
# args: gpu id
DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG = (
    "A pending retired page has been detected in GPU %u.")
# args: retired pages detected, gpu id
DCGM_FR_RETIRED_PAGES_LIMIT_MSG = (
    "%u or more retired pages have been detected in GPU %u. ")
# args: retired pages due to dbes detected, gpu id
DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG = (
    "An excess of %u retired pages due to DBEs have been detected and"
    " more than one page has been retired due to DBEs in the past"
    " week in GPU %u.")
# args: gpu id
DCGM_FR_CORRUPT_INFOROM_MSG = (
    "A corrupt InfoROM has been detected in GPU %u.")
# args: gpu id
DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG = (
    "Detected clock throttling due to thermal violation in GPU %u.")
# args: gpu id
DCGM_FR_POWER_UNREADABLE_MSG = (
    "Cannot reliably read the power usage for GPU %u.")
# args: gpu id
DCGM_FR_CLOCK_THROTTLE_POWER_MSG = (
    "Detected clock throttling due to power violation in GPU %u.")
# args: nvlink errors detected, nvlink id, error threshold
DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG = (
    "Detected %ld NvLink errors on NvLink %u which exceeds threshold of %u")
# args: gpu id, nvlink id
DCGM_FR_NVLINK_DOWN_MSG = "GPU %u's NvLink link %d is currently down"
# args: nvswitch id, nvlink id
DCGM_FR_NVSWITCH_FATAL_ERROR_MSG = (
    "Detected fatal errors on NvSwitch %u link %u")
# args: nvswitch id, nvlink id
DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG = (
    "Detected nonfatal errors on NvSwitch %u link %u")
# args: nvswitch id, nvlink port
DCGM_FR_NVSWITCH_DOWN_MSG = (
    "NvSwitch physical ID %u's NvLink port %d is currently down.")
# printf-style message templates (continued). Where present, the "args:"
# comment lists the format arguments in order.

# args: file path, error detail
DCGM_FR_NO_ACCESS_TO_FILE_MSG = "File %s could not be accessed directly: %s"
# args: purpose for communicating with NVML, NVML error as string
# NOTE(review): the original comment listed a third item ("NVML error"),
# but the template only has two %s conversions.
DCGM_FR_NVML_API_MSG = "Error calling NVML API %s: %s"
DCGM_FR_DEVICE_COUNT_MISMATCH_MSG = (
    "The number of devices NVML returns is different than the number "
    "of devices in /dev.")
# args: function name
DCGM_FR_BAD_PARAMETER_MSG = "Bad parameter to function %s cannot be processed"
# args: library name, error returned from dlopen
DCGM_FR_CANNOT_OPEN_LIB_MSG = "Cannot open library %s: '%s'"
# args: the name of the driver on the denylist
DCGM_FR_DENYLISTED_DRIVER_MSG = "Found driver on the denylist: %s"
# args: the name of the function that wasn't found
DCGM_FR_NVML_LIB_BAD_MSG = "Cannot get pointer to %s from libnvidia-ml.so"
DCGM_FR_GRAPHICS_PROCESSES_MSG = (
    "NVVS has detected graphics processes running on at least one "
    "GPU. This may cause some tests to fail.")
# args: error message from the API call
DCGM_FR_HOSTENGINE_CONN_MSG = "Could not connect to the host engine: '%s'"
# args: field name, gpu id
DCGM_FR_FIELD_QUERY_MSG = "Could not query field %s for GPU %u"
# args: environment variable name
DCGM_FR_BAD_CUDA_ENV_MSG = (
    "Found CUDA performance-limiting environment variable '%s'.")
# args: gpu id
DCGM_FR_PERSISTENCE_MODE_MSG = (
    "Persistence mode for GPU %u is currently disabled. The DCGM "
    "diagnostic requires peristence mode to be enabled.")
DCGM_FR_LOW_BANDWIDTH_MSG = (
    "Bandwidth of GPU %u in direction %s of %.2f did not exceed "
    "minimum required bandwidth of %.2f.")
DCGM_FR_HIGH_LATENCY_MSG = (
    "Latency type %s of GPU %u value %.2f exceeded maximum allowed "
    "latency of %.2f.")
# printf-style message templates (continued): field, thermal, throttling
# and PCIe checks.
DCGM_FR_CANNOT_GET_FIELD_TAG_MSG = (
    "Unable to get field information for field id %hu")
DCGM_FR_FIELD_VIOLATION_MSG = "Detected %ld %s for GPU %u"
DCGM_FR_FIELD_THRESHOLD_MSG = (
    "Detected %ld %s for GPU %u which is above the threshold %ld")
DCGM_FR_FIELD_VIOLATION_DBL_MSG = "Detected %.1f %s for GPU %u"
DCGM_FR_FIELD_THRESHOLD_DBL_MSG = (
    "Detected %.1f %s for GPU %u which is above the threshold %.1f")
DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG = (
    "Field %s is not supported by this API because it is neither an "
    "int64 nor a double type.")
DCGM_FR_FIELD_THRESHOLD_TS_MSG = (
    "%s met or exceeded the threshold of %lu per second: %lu at "
    "%.1f seconds into the test.")
DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG = (
    "%s met or exceeded the threshold of %.1f per second: %.1f at "
    "%.1f seconds into the test.")
DCGM_FR_THERMAL_VIOLATIONS_MSG = (
    "There were thermal violations totaling %lu seconds for GPU %u")
DCGM_FR_THERMAL_VIOLATIONS_TS_MSG = (
    "Thermal violations totaling %lu samples started at %.1f seconds "
    "into the test for GPU %u")
DCGM_FR_TEMP_VIOLATION_MSG = (
    "Temperature %lld of GPU %u exceeded user-specified maximum "
    "allowed temperature %lld")
DCGM_FR_THROTTLING_VIOLATION_MSG = (
    "Clocks are being throttling for GPU %u because of clock "
    "throttling starting %.1f seconds into the test. %s")
DCGM_FR_INTERNAL_MSG = "There was an internal error during the test: '%s'"
DCGM_FR_PCIE_GENERATION_MSG = (
    "GPU %u is running at PCI link generation %d, which is below "
    "the minimum allowed link generation of %d (parameter '%s')")
# NOTE(review): this width message says "minimum allowed link generation";
# it looks copied from the generation message above. Text left unchanged --
# confirm against the library's own message before rewording.
DCGM_FR_PCIE_WIDTH_MSG = (
    "GPU %u is running at PCI link width %dX, which is below the "
    "minimum allowed link generation of %d (parameter '%s')")
DCGM_FR_ABORTED_MSG = "Test was aborted early due to user signal"
DCGM_FR_TEST_DISABLED_MSG = "The %s test is skipped for this GPU."
# printf-style message templates (continued): stress, CUDA, ECC, memory
# and generic API failures.
DCGM_FR_CANNOT_GET_STAT_MSG = (
    "Unable to generate / collect stat %s for GPU %u")
DCGM_FR_STRESS_LEVEL_MSG = (
    "Max stress level of %.1f did not reach desired stress level of "
    "%.1f for GPU %u")
DCGM_FR_CUDA_API_MSG = "Error using CUDA API %s"
DCGM_FR_FAULTY_MEMORY_MSG = "Found %d faulty memory elements on GPU %u"
DCGM_FR_CANNOT_SET_WATCHES_MSG = "Unable to add field watches to DCGM: %s"
DCGM_FR_CUDA_UNBOUND_MSG = (
    "Cuda GPU %d is no longer bound to a CUDA context...Aborting")
DCGM_FR_ECC_DISABLED_MSG = (
    "Skipping test %s because ECC is not enabled on GPU %u")
DCGM_FR_MEMORY_ALLOC_MSG = (
    "Couldn't allocate at least %.1f%% of GPU memory on GPU %u")
DCGM_FR_CUDA_DBE_MSG = (
    "CUDA APIs have indicated that a double-bit ECC error has "
    "occured on GPU %u.")
DCGM_FR_MEMORY_MISMATCH_MSG = (
    "A memory mismatch was detected on GPU %u, but no error was "
    "reported by CUDA or NVML.")
DCGM_FR_CUDA_DEVICE_MSG = (
    "Unable to find a corresponding CUDA device for GPU %u: '%s'")
DCGM_FR_ECC_UNSUPPORTED_MSG = (
    "This card does not support ECC Memory. Skipping test.")
DCGM_FR_ECC_PENDING_MSG = "ECC memory for GPU %u is in a pending state."
DCGM_FR_MEMORY_BANDWIDTH_MSG = (
    "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "
    "to meet %.2f GB/s for test %d")
DCGM_FR_TARGET_POWER_MSG = (
    "Max power of %.1f did not reach desired power minimum %s of "
    "%.1f for GPU %u")
DCGM_FR_API_FAIL_MSG = "API call %s failed: '%s'"
DCGM_FR_API_FAIL_GPU_MSG = "API call %s failed for GPU %u: '%s'"
DCGM_FR_CUDA_CONTEXT_MSG = "GPU %u failed to create a CUDA context: %s"
DCGM_FR_DCGM_API_MSG = "Error using DCGM API %s"
DCGM_FR_CONCURRENT_GPUS_MSG = (
    "Unable to run concurrent pair bandwidth test without 2 or more "
    "gpus. Skipping")
DCGM_FR_TOO_MANY_ERRORS_MSG = (
    "This API can only return up to four errors per system. "
    "Additional errors were found for this system that couldn't be "
    "communicated.")
# printf-style message templates (continued): NVLink, power limit, clocks,
# row remapping and peer-to-peer checks.
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG = (
    "%.1f %s NvLink errors found occuring per second on GPU %u, "
    "exceeding the limit of 100 per second.")
DCGM_FR_NVLINK_ERROR_CRITICAL_MSG = (
    "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)")
DCGM_FR_ENFORCED_POWER_LIMIT_MSG = (
    "Enforced power limit on GPU %u set to %.1f, which is too low to "
    "attempt to achieve target power %.1f")
DCGM_FR_MEMORY_ALLOC_HOST_MSG = "Cannot allocate %zu bytes on the host"
DCGM_FR_GPU_OP_MODE_MSG = (
    "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP.")
DCGM_FR_NO_MEMORY_CLOCKS_MSG = (
    "No memory clocks <= %u MHZ were found in %u supported memory clocks.")
DCGM_FR_NO_GRAPHICS_CLOCKS_MSG = (
    "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ.")
DCGM_FR_HAD_TO_RESTORE_STATE_MSG = (
    "Had to restore GPU state on NVML GPU(s): %s")
DCGM_FR_L1TAG_UNSUPPORTED_MSG = (
    "This card does not support the L1 cache test. Skipping test.")
DCGM_FR_L1TAG_MISCOMPARE_MSG = "The L1 cache test failed with a miscompare."
DCGM_FR_ROW_REMAP_FAILURE_MSG = "Row remapping failed."
DCGM_FR_UNCONTAINED_ERROR_MSG = "GPU had an uncontained error (XID 95)"
DCGM_FR_EMPTY_GPU_LIST_MSG = "No valid GPUs passed to plugin"
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG = (
    "Pending page retirements together with a DBE were detected on GPU %u.")
DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG = (
    "GPU %u has uncorrectable row remappings")
DCGM_FR_PENDING_ROW_REMAP_MSG = "GPU %u has pending row remappings"
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG = (
    "GPU %u was unsuccessfully written to in a peer-to-peer test: %s")
DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG = (
    "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s")
DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG = "NVSwitch %u's NvLink %u is down."
# args: directory in which the file could not be created
DCGM_FR_FILE_CREATE_PERMISSIONS_MSG = (
    "The DCGM Diagnostic does not have permissions to create a file in directory '%s'")

# Suggestions for next steps for the corresponding error message.
# An empty string means no suggestion is offered for that code.
DCGM_FR_OK_NEXT = "N/A"
DCGM_FR_UNKNOWN_NEXT = ""
DCGM_FR_UNRECOGNIZED_NEXT = ""
DCGM_FR_PCI_REPLAY_RATE_NEXT = (
    "Reconnect PCIe card. Run system side PCIE diagnostic utilities "
    "to verify hops off the GPU board. If issue is on the board, run "
    "the field diagnostic.")
DCGM_FR_VOLATILE_DBE_DETECTED_NEXT = (
    "Drain the GPU and reset it or reboot the node.")
DCGM_FR_VOLATILE_SBE_DETECTED_NEXT = (
    "Monitor - this GPU can still perform workload.")
DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT = (
    "If volatile double bit errors exist, drain the GPU and reset it "
    "or reboot the node. Otherwise, monitor - GPU can still perform "
    "workload.")
DCGM_FR_RETIRED_PAGES_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CORRUPT_INFOROM_NEXT = "Flash the InfoROM to clear this corruption."
DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT = DEBUG_COOLING_MSG
DCGM_FR_POWER_UNREADABLE_NEXT = ""
DCGM_FR_CLOCK_THROTTLE_POWER_NEXT = (
    "Monitor the power conditions. This GPU can still perform workload.")
DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVLINK_DOWN_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT = (
    "Monitor the NVSwitch. It can still perform workload.")
DCGM_FR_NVSWITCH_DOWN_NEXT = ""
DCGM_FR_NO_ACCESS_TO_FILE_NEXT = (
    "Check relevant permissions, access, and existence of the file.")
DCGM_FR_NVML_API_NEXT = (
    "Check the error condition and ensure that appropriate libraries "
    "are present and accessible.")
# Suggestions for next steps (continued). An empty string means no
# suggestion is offered for that code.
DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT = (
    "Check for the presence of cgroups, operating system blocks, and "
    "or unsupported / older cards")
DCGM_FR_BAD_PARAMETER_NEXT = ""
DCGM_FR_CANNOT_OPEN_LIB_NEXT = (
    "Check for the existence of the library and set LD_LIBRARY_PATH "
    "if needed.")
DCGM_FR_DENYLISTED_DRIVER_NEXT = "Please load the appropriate driver."
DCGM_FR_NVML_LIB_BAD_NEXT = (
    "Make sure that the required version of libnvidia-ml.so "
    "is present and accessible on the system.")
DCGM_FR_GRAPHICS_PROCESSES_NEXT = (
    "Stop the graphics processes or run this diagnostic on a server "
    "that is not being used for display purposes.")
DCGM_FR_HOSTENGINE_CONN_NEXT = (
    "If hostengine is run separately, please ensure that it is up "
    "and responsive.")
DCGM_FR_FIELD_QUERY_NEXT = ""
DCGM_FR_BAD_CUDA_ENV_NEXT = (
    "Please unset this environment variable to address test failures.")
# The suggested command needs a GPU id after "-i"; without the <gpuId>
# placeholder the text read "nvidia-smi -i -pm 1", which is not a usable
# command line.
DCGM_FR_PERSISTENCE_MODE_NEXT = (
    "Enable persistence mode by running \"nvidia-smi -i <gpuId> -pm "
    "1 \" as root.")
DCGM_FR_LOW_BANDWIDTH_NEXT = (
    "Verify that your minimum bandwidth setting is appropriate for "
    "all topological consequences.")
DCGM_FR_HIGH_LATENCY_NEXT = ""
DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT = ""
DCGM_FR_FIELD_VIOLATION_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_NEXT = ""
DCGM_FR_FIELD_VIOLATION_DBL_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_DBL_NEXT = ""
DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_TS_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT = ""
DCGM_FR_THERMAL_VIOLATIONS_NEXT = DEBUG_COOLING_MSG
DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT = DEBUG_COOLING_MSG
# The cooling advice is substituted into the template at import time.
DCGM_FR_TEMP_VIOLATION_NEXT = (
    "Verify that the user-specified temperature maximum is set "
    "correctly. If it is, %s") % DEBUG_COOLING_MSG
DCGM_FR_THROTTLING_VIOLATION_NEXT = ""
DCGM_FR_INTERNAL_NEXT = ""
DCGM_FR_PCIE_GENERATION_NEXT = ""
DCGM_FR_PCIE_WIDTH_NEXT = ""
DCGM_FR_ABORTED_NEXT = ""
DCGM_FR_TEST_DISABLED_NEXT = ""
DCGM_FR_CANNOT_GET_STAT_NEXT = (
    "If running a standalone nv-hostengine, verify that it is up "
    "and responsive.")
DCGM_FR_STRESS_LEVEL_NEXT = ""
DCGM_FR_CUDA_API_NEXT = ""
DCGM_FR_FAULTY_MEMORY_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CANNOT_SET_WATCHES_NEXT = ""
DCGM_FR_CUDA_UNBOUND_NEXT = ""
# As above: "-i" needs a GPU id, so the <gpuId> placeholder is spelled out.
DCGM_FR_ECC_DISABLED_NEXT = (
    "Enable ECC memory by running \"nvidia-smi -i <gpuId> -e 1\" "
    "to enable. This may require a GPU reset or reboot to take effect.")
DCGM_FR_MEMORY_ALLOC_NEXT = ""
DCGM_FR_CUDA_DBE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_MEMORY_MISMATCH_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CUDA_DEVICE_NEXT = ""
DCGM_FR_ECC_UNSUPPORTED_NEXT = ""
DCGM_FR_ECC_PENDING_NEXT = "Please reboot to activate it."
DCGM_FR_MEMORY_BANDWIDTH_NEXT = ""
DCGM_FR_TARGET_POWER_NEXT = ""
DCGM_FR_API_FAIL_NEXT = ""
DCGM_FR_API_FAIL_GPU_NEXT = ""
DCGM_FR_CUDA_CONTEXT_NEXT = (
    "Please make sure the correct driver version is installed and "
    "verify that no conflicting libraries are present.")
DCGM_FR_DCGM_API_NEXT = ""
DCGM_FR_CONCURRENT_GPUS_NEXT = ""
DCGM_FR_TOO_MANY_ERRORS_NEXT = ""
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_ENFORCED_POWER_LIMIT_NEXT = (
    "If this enforced power limit is necessary, then this test "
    "cannot be run. If it is unnecessary, then raise the enforced "
    "power limit setting to be able to run this test.")
DCGM_FR_MEMORY_ALLOC_HOST_NEXT = (
    "Manually kill processes or restart your machine.")
# Suggestions for next steps (continued). An empty string means no
# suggestion is offered for that code.
# The suggested command ended in a dangling "-i " followed by an empty
# string; the GPU index placeholder is spelled out so the advice is usable.
DCGM_FR_GPU_OP_MODE_NEXT = (
    "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "
    "<gpu index>")
DCGM_FR_NO_MEMORY_CLOCKS_NEXT = ""
DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT = ""
DCGM_FR_HAD_TO_RESTORE_STATE_NEXT = ""
DCGM_FR_L1TAG_UNSUPPORTED_NEXT = ""
DCGM_FR_L1TAG_MISCOMPARE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_ROW_REMAP_FAILURE_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
DCGM_FR_UNCONTAINED_ERROR_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
DCGM_FR_EMPTY_GPU_LIST_NEXT = ""
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT = (
    "Drain the GPU and reset it or reboot the node to resolve this issue.")
DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT = ""
DCGM_FR_PENDING_ROW_REMAP_NEXT = ""
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT = BUG_REPORT_MSG
DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT = BUG_REPORT_MSG
DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT = (
    "Please check fabric manager and initialization logs to figure out why the link is down. You may also need to run a field diagnostic.")
DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT = (
    "Please restart the hostengine with parameter --home-dir to specify a different home directory for the "
    "diagnostic or change permissions in the current directory to allow the user to write files there.")


def dcgmErrorGetPriorityByCode(code):
    """Return the library's priority value for an error code.

    Looks up the "dcgmErrorGetPriorityByCode" entry point through
    dcgm_structs._dcgmGetFunctionPointer and returns its result unchanged.
    Presumably the result is one of the DCGM_ERROR_* priorities defined in
    this module -- confirm against the library documentation.
    """
    fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetPriorityByCode")
    return fn(code)


def dcgmErrorGetFormatMsgByCode(code):
    """Return the library's format message for an error code.

    The entry point is declared as returning a C string. A bytes result is
    decoded as UTF-8; any other result (ctypes yields None for a NULL
    pointer) is returned as-is.
    """
    fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetFormatMsgByCode")
    fn.restype = ctypes.c_char_p
    ret = fn(code)
    if isinstance(ret, bytes):
        return ret.decode('utf-8')
    return ret

# ---- patch header of the next file, kept verbatim from the original diff ----
# diff --git a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
# index d29a5c412..ceb9f7e0e 100755
# --- a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
# +++ b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py
# @@ -1,6 +1,4 @@
# -#!/usr/bin/env python3
# -
# -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,29 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_fields_internal as dcgm_fields_internal +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent import ctypes +import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue +import model_analyzer.monitor.dcgm.pydcgm as pydcgm import json +''' +Helper class that makes a python-friendly field value from one returned from the python bindings +''' -import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent -import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields -import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs -import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue +class DcgmFieldValue(): + ''' + Constructor -class DcgmFieldValue: - """ - Helper class that makes a python-friendly field value from one returned - from the python bindings - """ + rawValue is the latest dcgm_structs.c_dcgmFieldValue_v? structure of a field value returned from the raw APIs + ''' def __init__(self, rawValue): - """ - rawValue : dcgm_structs.c_dcgmFieldValue_v? 
- is the latest structure of a field value returned from the raw APIs - """ - # Make sure the class passed in is an expected type + #Make sure the class passed in is an expected type if not type(rawValue) == dcgm_structs.c_dcgmFieldValue_v1: - raise Exception(f"Unexpected rawValue type {str(type(rawValue))}") + raise Exception("Unexpected rawValue type %s" % str(type(rawValue))) self.ts = rawValue.ts self.fieldId = rawValue.fieldId @@ -51,10 +51,7 @@ def __init__(self, rawValue): if self.fieldType == dcgm_fields.DCGM_FT_DOUBLE: self.value = float(rawValue.value.dbl) self.isBlank = dcgmvalue.DCGM_FP64_IS_BLANK(self.value) - elif ( - self.fieldType == dcgm_fields.DCGM_FT_INT64 - or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP - ): + elif self.fieldType == dcgm_fields.DCGM_FT_INT64 or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP: self.value = int(rawValue.value.i64) self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(self.value) elif self.fieldType == dcgm_fields.DCGM_FT_STRING: @@ -63,31 +60,33 @@ def __init__(self, rawValue): elif self.fieldType == dcgm_fields.DCGM_FT_BINARY: if self.fieldId == dcgm_fields.DCGM_FI_DEV_ACCOUNTING_DATA: accStats = dcgm_structs.c_dcgmDevicePidAccountingStats_v1() - ctypes.memmove( - ctypes.addressof(accStats), - rawValue.value.blob, - accStats.FieldsSizeof(), - ) - if self.fieldId == dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS: - accStats = dcgm_structs.c_dcgmDeviceVgpuProcessUtilInfo_v1() - ctypes.memmove( - ctypes.addressof(accStats), - rawValue.value.blob, - accStats.FieldsSizeof(), - ) + ctypes.memmove(ctypes.addressof(accStats), rawValue.value.blob, + accStats.FieldsSizeof()) + if self.fieldId in [ + dcgm_fields_internal.DCGM_FI_DEV_COMPUTE_PIDS, + dcgm_fields_internal.DCGM_FI_DEV_GRAPHICS_PIDS + ]: + processStats = dcgm_structs.c_dcgmRunningProcess_t() + ctypes.memmove(ctypes.addressof(processStats), + rawValue.value.blob, processStats.FieldsSizeof()) + self.value = processStats + self.fieldType = dcgm_fields.DCGM_FT_BINARY + # This 
should always be false + self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(processStats.pid) elif self.fieldId == dcgm_fields.DCGM_FI_SYNC_BOOST: - # Not exposed publicly for now + #Not exposed publicly for now self.value = None else: - raise Exception("Blobs not handled yet for fieldId %d" % self.fieldId) + raise Exception("Blobs not handled yet for fieldId %d" % + self.fieldId) else: raise Exception("Unhandled fieldType: %s" % self.fieldType) class DcgmFieldValueTimeSeries: + def __init__(self): - # Values in timestamp order - self.values = [] + self.values = [] #Values in timestamp order def __len__(self): return len(self.values) @@ -100,7 +99,7 @@ def InsertValue(self, value): self.values.append(value) return - # Otherwise, we need to insert the value in the correct place. + #Otherwise, we need to insert the value in the correct place. Find the place for i, existingValue in enumerate(self.values): if value.ts < existingValue.ts: self.values.insert(i, value) @@ -110,75 +109,75 @@ def InsertValue(self, value): class FieldValueEncoder(json.JSONEncoder): - # Pylint does not link overloading the default method, so the comment below - # is WAR for the linting problem + # Pylint does not link overloading the default method, so the comment below is WAR for the linting problem def default(self, obj): # pylint: disable=E0202 nested_json = [] + i = 0 for key in obj: if isinstance(key, DcgmFieldValue): - if key.isBlank: + if (key.isBlank): continue - nested_json.append( - {"Timestamp": key.ts, "FieldId": key.fieldId, "Value": key.value} - ) + nested_json.append({ + 'Timestamp': key.ts, + 'FieldId': key.fieldId, + 'Value': key.value + }) else: return json.JSONEncoder.default( - self, obj - ) # Let default encoder throw exception + self, obj) # Let default encoder throw exception return nested_json -def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, userData): +def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, + userData): + userData = 
ctypes.cast(userData, ctypes.py_object).value userData._ProcessValues(gpuId, values[0:numValues]) return 0 helper_dcgm_field_values_since_callback = dcgm_agent.dcgmFieldValueEnumeration_f( - py_helper_dcgm_field_values_since_callback -) + py_helper_dcgm_field_values_since_callback) -def py_helper_dcgm_field_values_since_callback_v2( - entityGroupId, entityId, values, numValues, userData -): +def py_helper_dcgm_field_values_since_callback_v2(entityGroupId, entityId, + values, numValues, userData): userData = ctypes.cast(userData, ctypes.py_object).value - userData._ProcessValues(entityGroupId, entityId, values[0:numValues]) + userData._ProcessValuesV2(entityGroupId, entityId, values[0:numValues]) return 0 -helper_dcgm_field_values_since_callback_v2 = ( - dcgm_agent.dcgmFieldValueEntityEnumeration_f( - py_helper_dcgm_field_values_since_callback_v2 - ) -) +helper_dcgm_field_values_since_callback_v2 = dcgm_agent.dcgmFieldValueEntityEnumeration_f( + py_helper_dcgm_field_values_since_callback_v2) +''' +Helper class for handling field value update callbacks and storing them in a .values member variable +''' class DcgmFieldValueCollection: - """ - Helper class for handling field value update callbacks and storing them - in a .values member variable - """ def __init__(self, handle, groupId): - self.values = {} - # 2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries) + self.values = { + } #2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries) + self.entityValues = { + } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) self._handle = handle self._groupId = groupId self._numValuesSeen = 0 + self._nextSinceTimestamp = 0 + + ''' + Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values + ''' def _ProcessValues(self, gpuId, values): - """ - Helper function called by the callback of - dcgm_agent.dcgmGetValuesSince to process individual field values - """ self._numValuesSeen += len(values) 
if gpuId not in self.values: self.values[gpuId] = {} for rawValue in values: - # Convert to python-friendly value + #Convert to python-friendly value value = DcgmFieldValue(rawValue) if value.fieldId not in self.values[gpuId]: @@ -186,185 +185,187 @@ def _ProcessValues(self, gpuId, values): self.values[gpuId][value.fieldId].InsertValue(value) - def GetLatestValues(self, fieldGroup): - """ - Get the latest values for a fieldGroup and store them to the .values - member variable + ''' + Helper function called by the callback py_helper_dcgm_field_values_since_callback_v2 to process individual field values + ''' + + def _ProcessValuesV2(self, entityGroupId, entityId, values): + self._numValuesSeen += len(values) + + if entityGroupId not in self.entityValues: + self.entityValues[entityGroupId] = {} + + if entityId not in self.entityValues[entityGroupId]: + self.entityValues[entityGroupId][entityId] = {} + + for rawValue in values: + #Convert to python-friendly value + value = DcgmFieldValue(rawValue) - Note: This class does not automatically watch fieldGroup. You must do - that ahead of time with dcgmGroup.samples.WatchFields() - """ + if value.fieldId not in self.entityValues[entityGroupId][entityId]: + self.entityValues[entityGroupId][entityId][ + value.fieldId] = DcgmFieldValueTimeSeries() + + self.entityValues[entityGroupId][entityId][ + value.fieldId].InsertValue(value) + + ''' + Get the latest values for a fieldGroup and store them to the .values member variable + + Note: This class does not automatically watch fieldGroup. 
You must do that ahead of time with dcgmGroup.samples.WatchFields() + ''' + + def GetLatestValues(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_callback, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_callback, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved. + ''' + + def GetAllSinceLastCall(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, helper_dcgm_field_values_since_callback, + self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + def GetLatestValues_v2(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues_v2( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_callback_v2, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_callback_v2, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the number of field values that were retrieved + ''' + + def GetAllSinceLastCall_v2(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, + helper_dcgm_field_values_since_entity_callback, self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + + ''' + Empty .values{} so that old data is no longer present in this structure. 
+ This can be used to prevent .values from growing over time + ''' + def EmptyValues(self): - """ - Empty .values{} so that old data is no longer present in this - structure. This can be used to prevent .values from growing over time - """ self.values = {} self._numValuesSeen = 0 +''' +Helper class for watching a field group and storing fields values returned from it +''' + + class DcgmFieldGroupWatcher(DcgmFieldValueCollection): - """ - Helper class for watching a field group and storing fields values returned - from it - """ - - def __init__( - self, - handle, - groupId, - fieldGroup, - operationMode, - updateFreq, - maxKeepAge, - maxKeepSamples, - startTimestamp, - ): - """ - handle : - DCGM handle from dcgm_agent.dcgmInit() - groupId : - a DCGM group ID returned from dcgm_agent.dcgmGroupCreate - fieldGroup : - DcgmFieldGroup() instance to watch fields for - operationMode : - a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host - engine is running in lock step or auto mode - updateFreq : - how often to update each field in usec - maxKeepAge : - how long DCGM should keep values for in seconds - maxKeepSamples : - is the maximum number of samples DCGM should ever cache for each - field - startTimestamp : - a base timestamp we should start from when first reading - values. This can be used to resume a previous instance of a - DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start - with all cached data - """ + ''' + Constructor + + handle is a DCGM handle from dcgm_agent.dcgmInit() + groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate + fieldGroup is the DcgmFieldGroup() instance to watch fields for + operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? 
constant for if the host engine is running in lock step or auto mode + updateFreq is how often to update each field in usec + maxKeepAge is how long DCGM should keep values for in seconds + maxKeepSamples is the maximum number of samples DCGM should ever cache for each field + startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a + previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp. + 0=start with all cached data + ''' + + def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq, + maxKeepAge, maxKeepSamples, startTimestamp): self._fieldGroup = fieldGroup - self._oprationMode = operationMode + self._operationMode = operationMode self._updateFreq = updateFreq self._maxKeepAge = maxKeepAge self._maxKeepSamples = maxKeepSamples DcgmFieldValueCollection.__init__(self, handle, groupId) - # Start from beginning of time - self._nextSinceTimestamp = 0 + self._nextSinceTimestamp = 0 #Start from beginning of time if startTimestamp > 0: self._nextSinceTimestamp = startTimestamp - # Start watches + #Start watches self._WatchFieldGroup() + ''' + Initiate the host engine watch on the fields + ''' + def _WatchFieldGroup(self): - """ - Initiate the host engine watch on the fields - """ - ret = dcgm_agent.dcgmWatchFields( - self._handle, - self._groupId, - self._fieldGroup, - self._updateFreq, - self._maxKeepAge, - self._maxKeepSamples, - ) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId, + self._fieldGroup.fieldGroupId, + self._updateFreq, self._maxKeepAge, + self._maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # Force an update of the fields so that we can fetch initial values + # Force an update of the fields so that we can fetch initial values. 
ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # initial update will fetch from startTimestamp - self.GetMore() + # Initial update will fetch from startTimestamp. + self.GetAllSinceLastCall() - def GetMore(self): - """ - Method to cause more field values to be retrieved from DCGM. + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved + ''' - Returns - ------- - int - the number of field values that were retrieved - """ - beforeCount = self._numValuesSeen - - # If we're in manual mode, force an update - if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: + def GetAllSinceLastCall(self): + #If we're in manual mode, force an update + if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error + + return super().GetAllSinceLastCall(self._fieldGroup) - self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince( - self._handle, - self._groupId, - self._fieldGroup, - self._nextSinceTimestamp, - helper_dcgm_field_values_since_callback, - self, - ) - afterCount = self._numValuesSeen - return afterCount - beforeCount +def py_helper_dcgm_field_values_since_entity_callback(entityGroupId, entityId, + values, numValues, + userData): -def py_helper_dcgm_field_values_since_entity_callback( - entityGroupId, entityId, values, numValues, userData -): userData = ctypes.cast(userData, ctypes.py_object).value userData._ProcessValues(entityGroupId, entityId, values[0:numValues]) return 0 -helper_dcgm_field_values_since_entity_callback = ( - dcgm_agent.dcgmFieldValueEntityEnumeration_f( - py_helper_dcgm_field_values_since_entity_callback - ) -) 
+helper_dcgm_field_values_since_entity_callback = dcgm_agent.dcgmFieldValueEntityEnumeration_f( + py_helper_dcgm_field_values_since_entity_callback) +''' +Helper class for handling field value update callbacks and storing them in a .values member variable +''' class DcgmFieldValueEntityCollection: - """ - Helper class for handling field value update callbacks and storing them - in a .values member variable - """ def __init__(self, handle, groupId): - # 3D dictionary of - # [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) - self.values = {} + self.values = { + } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) self._handle = handle self._groupId = groupId self._numValuesSeen = 0 + self._nextSinceTimestamp = 0 + + ''' + Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values + ''' def _ProcessValues(self, entityGroupId, entityId, values): - """ - Helper function called by the callback of - dcgm_agent.dcgmGetValuesSince to process individual field values - """ self._numValuesSeen += len(values) if entityGroupId not in self.values: @@ -374,141 +375,172 @@ def _ProcessValues(self, entityGroupId, entityId, values): self.values[entityGroupId][entityId] = {} for rawValue in values: - # Convert to python-friendly value + #Convert to python-friendly value value = DcgmFieldValue(rawValue) if value.fieldId not in self.values[entityGroupId][entityId]: self.values[entityGroupId][entityId][ - value.fieldId - ] = DcgmFieldValueTimeSeries() + value.fieldId] = DcgmFieldValueTimeSeries() - self.values[entityGroupId][entityId][value.fieldId].InsertValue(value) + self.values[entityGroupId][entityId][value.fieldId].InsertValue( + value) - def GetLatestValues(self, fieldGroup): - """ - Get the latest values for a fieldGroup and store them to the - .values member variable + ''' + Get the latest values for a fieldGroup and store them to the .values member variable + + Note: This class does not 
automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields() + ''' - Note: This class does not automatically watch fieldGroup. You must do - that ahead of time with dcgmGroup.samples.WatchFields() - """ + def GetLatestValues(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues_v2( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_entity_callback, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_entity_callback, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved. + ''' + + def GetAllSinceLastCall(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, + helper_dcgm_field_values_since_entity_callback, self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + + ''' + Empty .values{} so that old data is no longer present in this structure. + This can be used to prevent .values from growing over time + ''' + def EmptyValues(self): - """ - Empty .values{} so that old data is no longer present in this - structure. 
This can be used to prevent .values from growing over time - """ self.values = {} self._numValuesSeen = 0 +''' +Helper class for watching a field group and storing fields values returned from it +''' + + class DcgmFieldGroupEntityWatcher(DcgmFieldValueEntityCollection): - """ - Helper class for watching a field group and storing fields values - returned from it - """ - - def __init__( - self, - handle, - groupId, - fieldGroup, - operationMode, - updateFreq, - maxKeepAge, - maxKeepSamples, - startTimestamp, - ): - """ - Constructor - - handle : - a DCGM handle from dcgm_agent.dcgmInit() - groupId : - a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate - fieldGroup : - DcgmFieldGroup() instance to watch fields for - operationMode : - is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host - engine is running in lock step or auto mode - updateFreq : - how often to update each field in usec - maxKeepAge : - how long DCGM should keep values for in seconds - maxKeepSamples : - the maximum number of samples DCGM should ever cache for each field - startTimestamp : - a base timestamp we should start from when first reading values. - This can be used to resume a previous instance of a - DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start - with all cached data - """ + ''' + Constructor + + handle is a DCGM handle from dcgm_agent.dcgmInit() + groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate + fieldGroup is the DcgmFieldGroup() instance to watch fields for + operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode + updateFreq is how often to update each field in usec + maxKeepAge is how long DCGM should keep values for in seconds + maxKeepSamples is the maximum number of samples DCGM should ever cache for each field + startTimestamp is a base timestamp we should start from when first reading values. 
This can be used to resume a + previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp. + 0=start with all cached data + ''' + + def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq, + maxKeepAge, maxKeepSamples, startTimestamp): self._fieldGroup = fieldGroup - self._oprationMode = operationMode + self._operationMode = operationMode self._updateFreq = updateFreq self._maxKeepAge = maxKeepAge self._maxKeepSamples = maxKeepSamples DcgmFieldValueEntityCollection.__init__(self, handle, groupId) - # Start from beginning of time - self._nextSinceTimestamp = 0 + self._nextSinceTimestamp = 0 #Start from beginning of time if startTimestamp > 0: self._nextSinceTimestamp = startTimestamp - # Start watches + #Start watches self._WatchFieldGroup() + ''' + Initiate the host engine watch on the fields + ''' + def _WatchFieldGroup(self): - """ - Initiate the host engine watch on the fields - """ - ret = dcgm_agent.dcgmWatchFields( - self._handle, - self._groupId, - self._fieldGroup.fieldGroupId, - self._updateFreq, - self._maxKeepAge, - self._maxKeepSamples, - ) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId, + self._fieldGroup.fieldGroupId, + self._updateFreq, self._maxKeepAge, + self._maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # Force an update of the fields so that we can fetch initial values + # Force an update of the fields so that we can fetch initial values. ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) - # initial update will fetch from startTimestamp - self.GetMore() - - def GetMore(self): - """ - Method to cause more field values to be retrieved from DCGM. 
Returns - the number of field values that were retrieved - """ - beforeCount = self._numValuesSeen + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # If we're in manual mode, force an update - if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: - ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + # Initial update will fetch from startTimestamp. + self.GetAllSinceLastCall() - self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( - self._handle, - self._groupId, - self._fieldGroup.fieldGroupId, - self._nextSinceTimestamp, - helper_dcgm_field_values_since_entity_callback, - self, - ) - afterCount = self._numValuesSeen - return afterCount - beforeCount + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved + ''' + + def GetAllSinceLastCall(self): + #If we're in manual mode, force an update + if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: + ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error + + return super().GetAllSinceLastCall(self._fieldGroup) + + +#Test program for demonstrating how this module works +def main(): + operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO + timeStep = 1.0 + + dcgm_structs._dcgmInit() + dcgm_agent.dcgmInit() #Will throw an exception on error + handle = dcgm_agent.dcgmStartEmbedded(operationMode) + handleObj = pydcgm.DcgmHandle(handle=handle) + groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS + fieldIds = [ + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK + ] + + fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds) + + updateFreq = int(timeStep * 1000000.0) + maxKeepAge = 3600.0 #1 hour + maxKeepSamples = 0 #unlimited. 
maxKeepAge will enforce quota + startTimestamp = 0 #beginning of time + + dfcw = DcgmFieldGroupWatcher(handle, groupId, fieldGroup, operationMode, + updateFreq, maxKeepAge, maxKeepSamples, + startTimestamp) + dfcw2 = DcgmFieldGroupEntityWatcher(handle, groupId, fieldGroup, + operationMode, updateFreq, maxKeepAge, + maxKeepSamples, startTimestamp) + + while (True): + newUpdateCount = dfcw.GetAllSinceLastCall() + newUpdateCount2 = dfcw2.GetAllSinceLastCall() + print("Got %d and %d new field value updates" % + (newUpdateCount, newUpdateCount2)) + for gpuId in list(dfcw.values.keys()): + print("gpuId %d" % gpuId) + for fieldId in list(dfcw.values[gpuId].keys()): + print(" fieldId %d: %d values. latest timestamp %d" % \ + (fieldId, len(dfcw.values[gpuId][fieldId]), dfcw.values[gpuId][fieldId][-1].ts)) + + for entityGroupId in list(dfcw2.values.keys()): + print("entityGroupId %d" % entityGroupId) + for entityId in list(dfcw2.values[entityGroupId].keys()): + print(" entityId %d" % entityId) + for fieldId in list( + dfcw2.values[entityGroupId][entityId].keys()): + print(" fieldId %d: %d values. latest timestamp %d" % \ + (fieldId, len(dfcw2.values[entityGroupId][entityId][fieldId]), dfcw2.values[entityGroupId][entityId][fieldId][-1].ts)) + + time.sleep(timeStep) + + +if __name__ == "__main__": + main() diff --git a/model_analyzer/monitor/dcgm/dcgm_fields.py b/model_analyzer/monitor/dcgm/dcgm_fields.py index 708008233..7c07111cd 100755 --- a/model_analyzer/monitor/dcgm/dcgm_fields.py +++ b/model_analyzer/monitor/dcgm/dcgm_fields.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,38 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +## +# Python bindings for the internal API of DCGM library (dcgm_fields.h) +## -from ctypes import ( - POINTER, - Structure, - addressof, - c_char, - c_char_p, - c_int, - c_short, - c_ubyte, - c_uint32, - memmove, - sizeof, -) - +from ctypes import * +from ctypes.util import find_library import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs # Provides access to functions dcgmFP = dcgm_structs._dcgmGetFunctionPointer # Field Types are a single byte. List these in ASCII order -DCGM_FT_BINARY = "b" # Blob of binary data representing a structure -DCGM_FT_DOUBLE = "d" # 8-byte double precision -DCGM_FT_INT64 = "i" # 8-byte signed integer -DCGM_FT_STRING = "s" # Null-terminated ASCII Character string -DCGM_FT_TIMESTAMP = "t" # 8-byte signed integer usec since 1970 +DCGM_FT_BINARY = 'b' # Blob of binary data representing a structure +DCGM_FT_DOUBLE = 'd' # 8-byte double precision +DCGM_FT_INT64 = 'i' # 8-byte signed integer +DCGM_FT_STRING = 's' # Null-terminated ASCII Character string +DCGM_FT_TIMESTAMP = 't' # 8-byte signed integer usec since 1970 # Field scope. What are these fields associated with DCGM_FS_GLOBAL = 0 # Field is global (ex: driver version) DCGM_FS_ENTITY = 1 # Field is associated with an entity (GPU, VGPU, ..etc) -# Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY -DCGM_FS_DEVICE = DCGM_FS_ENTITY +DCGM_FS_DEVICE = DCGM_FS_ENTITY # Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY # DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. # These macros are masks for relevant throttling, and are a 1:1 map to the NVML @@ -63,8 +51,7 @@ # # This is an indicator of: # - temperature being too high -# - External Power Brake Assertion is triggered -# (e.g. by the system power supply) +# - External Power Brake Assertion is triggered (e.g. 
by the system power supply) # - Power draw is too high and Fast Trigger protection is reducing the clocks # - May be also reported during PState or clock change # - This behavior may be removed in a later release. @@ -87,635 +74,451 @@ # - Current memory temperature above the Memory Max Operating Temperature DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL = 0x0000000000000020 -# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is -# engaged +# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged # # This is an indicator of: # - temperature being too high DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL = 0x0000000000000040 -# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) -# is engaged +# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged # # This is an indicator of: -# - External Power Brake Assertion being triggered (e.g. by the system power -# supply) +# - External Power Brake Assertion being triggered (e.g. by the system power supply) DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE = 0x0000000000000080 # GPU clocks are limited by current setting of Display clocks DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS = 0x0000000000000100 -# Field entity groups. Which type of entity is this field or field value -# associated with - -# Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL -DCGM_FE_NONE = 0 +#Field entity groups. Which type of entity is this field or field value associated with +DCGM_FE_NONE = 0 # Field is not associated with an entity. 
Field scope should be DCGM_FS_GLOBAL DCGM_FE_GPU = 1 # Field is associated with a GPU entity DCGM_FE_VGPU = 2 # Field is associated with a VGPU entity DCGM_FE_SWITCH = 3 # Field is associated with a Switch entity DCGM_FE_GPU_I = 4 # Field is associated with a GPU Instance entity DCGM_FE_GPU_CI = 5 # Field is associated with a GPU Compute Instance entity +DCGM_FE_LINK = 6 # Field is associated with an NVLINK -# Represents an identifier for an entity within a field entity. For instance, -# this is the gpuId for DCGM_FE_GPU. -c_dcgm_field_eid_t = c_uint32 +c_dcgm_field_eid_t = c_uint32 #Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU. -# -# System attributes -# +#System attributes DCGM_FI_UNKNOWN = 0 -# Driver Version -DCGM_FI_DRIVER_VERSION = 1 -# Underlying NVML version -DCGM_FI_NVML_VERSION = 2 -# Process Name. Will be nv-hostengine or your process's name in embedded mode -DCGM_FI_PROCESS_NAME = 3 -# Number of Devices on the node -DCGM_FI_DEV_COUNT = 4 - -# -# Device attributes -# -# Name of the GPU device -DCGM_FI_DEV_NAME = 50 -# Device Brand -DCGM_FI_DEV_BRAND = 51 -# NVML index of this GPU -DCGM_FI_DEV_NVML_INDEX = 52 -# Device Serial Number -DCGM_FI_DEV_SERIAL = 53 -# UUID corresponding to the device -DCGM_FI_DEV_UUID = 54 -# Device node minor number /dev/nvidia# -DCGM_FI_DEV_MINOR_NUMBER = 55 -# OEM inforom version -DCGM_FI_DEV_OEM_INFOROM_VER = 56 -# PCI attributes for the device -DCGM_FI_DEV_PCI_BUSID = 57 -# The combined 16-bit device id and 16-bit vendor id -DCGM_FI_DEV_PCI_COMBINED_ID = 58 -# The 32-bit Sub System Device ID -DCGM_FI_DEV_PCI_SUBSYS_ID = 59 -# Topology of all GPUs on the system via PCI (static) -DCGM_FI_GPU_TOPOLOGY_PCI = 60 -# Topology of all GPUs on the system via NVLINK (static) -DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 -# Affinity of all GPUs on the system (static) -DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 -# Compute mode for the device -DCGM_FI_DEV_COMPUTE_MODE = 65 -# Persistence mode 
for the device -DCGM_FI_DEV_PERSISTENCE_MODE = 66 -# MIG mode for the device -DCGM_FI_DEV_MIG_MODE = 67 -# String value for CUDA_VISIBLE_DEVICES for the device -DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 -# Device CPU affinity. part 1/8 = cpus 0 - 63 -DCGM_FI_DEV_CPU_AFFINITY_0 = 70 -# Device CPU affinity. part 1/8 = cpus 64 - 127 -DCGM_FI_DEV_CPU_AFFINITY_1 = 71 -# Device CPU affinity. part 2/8 = cpus 128 - 191 -DCGM_FI_DEV_CPU_AFFINITY_2 = 72 -# Device CPU affinity. part 3/8 = cpus 192 - 255 -DCGM_FI_DEV_CPU_AFFINITY_3 = 73 -# ECC inforom version -DCGM_FI_DEV_ECC_INFOROM_VER = 80 -# Power management object inforom version -DCGM_FI_DEV_POWER_INFOROM_VER = 81 -# Inforom image version -DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 -# Inforom configuration checksum -DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 -# Reads the infoROM from the flash and verifies the checksums -DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 -# VBIOS version of the device -DCGM_FI_DEV_VBIOS_VERSION = 85 -# Total BAR1 of the GPU -DCGM_FI_DEV_BAR1_TOTAL = 90 -# Deprecated - Sync boost settings on the node -DCGM_FI_SYNC_BOOST = 91 -# Used BAR1 of the GPU in MB -DCGM_FI_DEV_BAR1_USED = 92 -# Free BAR1 of the GPU in MB -DCGM_FI_DEV_BAR1_FREE = 93 - -# -# Clocks and power -# -# SM clock for the device -DCGM_FI_DEV_SM_CLOCK = 100 -# Memory clock for the device -DCGM_FI_DEV_MEM_CLOCK = 101 -# Video encoder/decoder clock for the device -DCGM_FI_DEV_VIDEO_CLOCK = 102 -# SM Application clocks -DCGM_FI_DEV_APP_SM_CLOCK = 110 -# Memory Application clocks -DCGM_FI_DEV_APP_MEM_CLOCK = 111 -# Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) -DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 -# Maximum supported SM clock for the device -DCGM_FI_DEV_MAX_SM_CLOCK = 113 -# Maximum supported Memory clock for the device -DCGM_FI_DEV_MAX_MEM_CLOCK = 114 -# Maximum supported Video encoder/decoder clock for the device -DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 -# Auto-boost for the device (1 = enabled. 
0 = disabled) -DCGM_FI_DEV_AUTOBOOST = 120 -# Supported clocks for the device -DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 -# Memory temperature for the device -DCGM_FI_DEV_MEMORY_TEMP = 140 -# Current temperature readings for the device, in degrees C -DCGM_FI_DEV_GPU_TEMP = 150 -# Power usage for the device in Watts -DCGM_FI_DEV_POWER_USAGE = 155 -# Total energy consumption for the GPU in mJ since the driver was last reloaded -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 -# Slowdown temperature for the device -DCGM_FI_DEV_SLOWDOWN_TEMP = 158 -# Shutdown temperature for the device -DCGM_FI_DEV_SHUTDOWN_TEMP = 159 -# Current Power limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 -# Minimum power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 -# Maximum power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 -# Default power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 -# Effective power limit that the driver enforces after taking into account all -# limiters -DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 -# Performance state (P-State) 0-15. 0=highest -DCGM_FI_DEV_PSTATE = 190 -# Fan speed for the device in percent 0-100 -DCGM_FI_DEV_FAN_SPEED = 191 - -# -# Device utilization and telemetry -# -# Deprecated - PCIe Tx utilization information -DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 -# Deprecated - PCIe Rx utilization information -DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 -# PCIe replay counter -DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 -# GPU Utilization -DCGM_FI_DEV_GPU_UTIL = 203 -# Memory Utilization -DCGM_FI_DEV_MEM_COPY_UTIL = 204 -# Process accounting stats -DCGM_FI_DEV_ACCOUNTING_DATA = 205 -# Encoder utilization -DCGM_FI_DEV_ENC_UTIL = 206 -# Decoder utilization -DCGM_FI_DEV_DEC_UTIL = 207 -# Memory utilization samples -DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 -# SM utilization samples -DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 -# Graphics processes running on the GPU. 
-DCGM_FI_DEV_GRAPHICS_PIDS = 220 -# Compute processes running on the GPU. -DCGM_FI_DEV_COMPUTE_PIDS = 221 -# XID errors. The value is the specific XID error -DCGM_FI_DEV_XID_ERRORS = 230 -# PCIe Max Link Generation -DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 -# PCIe Max Link Width -DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 -# PCIe Current Link Generation -DCGM_FI_DEV_PCIE_LINK_GEN = 237 -# PCIe Current Link Width -DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 - -# -# Violation counters -# -# Power Violation time in usec -DCGM_FI_DEV_POWER_VIOLATION = 240 -# Thermal Violation time in usec -DCGM_FI_DEV_THERMAL_VIOLATION = 241 -# Sync Boost Violation time in usec -DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 -# Board Limit Violation time in usec. -DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 -# Low Utilization Violation time in usec. -DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 -# Reliability Violation time in usec. -DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 -# App Clocks Violation time in usec. -DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 -# Base Clocks Violation time in usec. 
-DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 - -# -# Framebuffer usage -# -# Total framebuffer memory in MB -DCGM_FI_DEV_FB_TOTAL = 250 -# Total framebuffer used in MB -DCGM_FI_DEV_FB_FREE = 251 -# Total framebuffer free in MB -DCGM_FI_DEV_FB_USED = 252 - -# -# Device ECC Counters -# -# Current ECC mode for the device -DCGM_FI_DEV_ECC_CURRENT = 300 -# Pending ECC mode for the device -DCGM_FI_DEV_ECC_PENDING = 301 -# Total single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 -# Total double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 -# Total single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 -# Total double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 -# L1 cache single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 -# L1 cache double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 -# L2 cache single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 -# L2 cache double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 -# Device memory single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 -# Device memory double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 -# Register file single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 -# Register file double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 -# Texture memory single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 -# Texture memory double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 -# L1 cache single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 -# L1 cache double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 -# L2 cache single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 -# L2 cache double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 -# Device memory single bit aggregate (persistent) ecc errors 
-DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 -# Device memory double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 -# Register File single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 -# Register File double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 -# Texture memory single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 -# Texture memory double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 -# Number of retired pages because of single bit errors -DCGM_FI_DEV_RETIRED_SBE = 390 -# Number of retired pages because of double bit errors -DCGM_FI_DEV_RETIRED_DBE = 391 -# Number of pages pending retirement -DCGM_FI_DEV_RETIRED_PENDING = 392 - -# -# Row remapper fields (Ampere and newer) -# -# Number of remapped rows for uncorrectable errors -DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 -# Number of remapped rows for correctable errors -DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 -# Whether remapping of rows has failed -DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 - -# -# Device NvLink Bandwidth and Error Counters -# -# NV Link flow control CRC Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 -# NV Link flow control CRC Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 -# NV Link flow control CRC Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 -# NV Link flow control CRC Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 -# NV Link flow control CRC Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 -# NV Link flow control CRC Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 -# NV Link flow control CRC Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 -# NV Link data CRC Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 -# NV Link data CRC Error 
Counter for Lane 1 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 -# NV Link data CRC Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 -# NV Link data CRC Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 -# NV Link data CRC Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 -# NV Link data CRC Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 -# NV Link data CRC Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 -# NV Link Replay Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 -# NV Link Replay Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 -# NV Link Replay Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 -# NV Link Replay Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 -# NV Link Replay Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 -# NV Link Replay Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 -# NV Link Replay Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 -# NV Link Recovery Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 -# NV Link Recovery Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 -# NV Link Recovery Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 -# NV Link Recovery Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 -# NV Link Recovery Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 -# NV Link Recovery Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 -# NV Link Recovery Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 -# NV Link Bandwidth Counter for Lane 0 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 -# NV Link Bandwidth Counter for Lane 
1 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 -# NV Link Bandwidth Counter for Lane 2 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 -# NV Link Bandwidth Counter for Lane 3 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 -# NV Link Bandwidth Counter for Lane 4 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 -# NV Link Bandwidth Counter for Lane 5 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 -# NV Link Bandwidth Counter total for all Lanes -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 -# GPU NVLink error information -DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 - -# -# Device Attributes associated with virtualization -# -# Operating mode of the GPU -DCGM_FI_DEV_VIRTUAL_MODE = 500 -# Includes Count and Supported vGPU type information -DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 -# Includes Count and List of Creatable vGPU type IDs -DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 -# Includes Count and List of vGPU instance IDs -DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 -# Utilization values for vGPUs running on the device -DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 -# Utilization values for processes running within vGPU VMs using the device -DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 -# Current encoder statistics for a given device -DCGM_FI_DEV_ENC_STATS = 506 -# Statistics of current active frame buffer capture sessions on a given device -DCGM_FI_DEV_FBC_STATS = 507 -# Information about active frame buffer capture sessions on a target device -DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 - -# -# Related to vGPU Instance IDs -# -# vGPU VM ID -DCGM_FI_DEV_VGPU_VM_ID = 520 -# vGPU VM name -DCGM_FI_DEV_VGPU_VM_NAME = 521 -# vGPU type of the vGPU instance -DCGM_FI_DEV_VGPU_TYPE = 522 -# UUID of the vGPU instance -DCGM_FI_DEV_VGPU_UUID = 523 -# Driver version of the vGPU instance -DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 -# Memory usage of the vGPU instance -DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 -# License status of the vGPU instance -DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 -# Frame rate limit of the vGPU instance -DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 -# 
Current encoder statistics of the vGPU instance -DCGM_FI_DEV_VGPU_ENC_STATS = 528 -# Information about all active encoder sessions on the vGPU instance -DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 -# Statistics of current active frame buffer capture sessions on the vGPU -# instance -DCGM_FI_DEV_VGPU_FBC_STATS = 530 -# Information about active frame buffer capture sessions on the vGPU instance -DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 - -# Internal fields reserve the range 600..699 -# below fields related to NVSwitch -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 
-DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 +DCGM_FI_DRIVER_VERSION = 1 #Driver Version +DCGM_FI_NVML_VERSION = 2 #Underlying NVML version +DCGM_FI_PROCESS_NAME = 3 #Process Name. Will be nv-hostengine or your process's name in embedded mode +DCGM_FI_DEV_COUNT = 4 #Number of Devices on the node +DCGM_FI_CUDA_DRIVER_VERSION = 5 #Cuda Driver Version as an integer. 
CUDA 11.1 = 11100 +#Device attributes +DCGM_FI_DEV_NAME = 50 #Name of the GPU device +DCGM_FI_DEV_BRAND = 51 #Device Brand +DCGM_FI_DEV_NVML_INDEX = 52 #NVML index of this GPU +DCGM_FI_DEV_SERIAL = 53 #Device Serial Number +DCGM_FI_DEV_UUID = 54 #UUID corresponding to the device +DCGM_FI_DEV_MINOR_NUMBER = 55 #Device node minor number /dev/nvidia# +DCGM_FI_DEV_OEM_INFOROM_VER = 56 #OEM inforom version +DCGM_FI_DEV_PCI_BUSID = 57 #PCI attributes for the device +DCGM_FI_DEV_PCI_COMBINED_ID = 58 #The combined 16-bit device id and 16-bit vendor id +DCGM_FI_DEV_PCI_SUBSYS_ID = 59 #The 32-bit Sub System Device ID +DCGM_FI_GPU_TOPOLOGY_PCI = 60 #Topology of all GPUs on the system via PCI (static) +DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 #Topology of all GPUs on the system via NVLINK (static) +DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 #Affinity of all GPUs on the system (static) +DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 #Cuda compute capability for the device +DCGM_FI_DEV_COMPUTE_MODE = 65 #Compute mode for the device +DCGM_FI_DEV_PERSISTENCE_MODE = 66 #Persistence mode for the device +DCGM_FI_DEV_MIG_MODE = 67 #MIG mode for the device +DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 #String value for CUDA_VISIBLE_DEVICES for the device +DCGM_FI_DEV_MIG_MAX_SLICES = 69 #The maximum number of slices this GPU supports +DCGM_FI_DEV_CPU_AFFINITY_0 = 70 #Device CPU affinity. part 1/8 = cpus 0 - 63 +DCGM_FI_DEV_CPU_AFFINITY_1 = 71 #Device CPU affinity. part 1/8 = cpus 64 - 127 +DCGM_FI_DEV_CPU_AFFINITY_2 = 72 #Device CPU affinity. part 2/8 = cpus 128 - 191 +DCGM_FI_DEV_CPU_AFFINITY_3 = 73 #Device CPU affinity. 
part 3/8 = cpus 192 - 255 +DCGM_FI_DEV_CC_MODE = 74 #Device CC/APM mode +DCGM_FI_DEV_MIG_ATTRIBUTES = 75 #MIG device attributes +DCGM_FI_DEV_MIG_GI_INFO = 76 #GPU instance profile information +DCGM_FI_DEV_MIG_CI_INFO = 77 #Compute instance profile information +DCGM_FI_DEV_ECC_INFOROM_VER = 80 #ECC inforom version +DCGM_FI_DEV_POWER_INFOROM_VER = 81 #Power management object inforom version +DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 #Inforom image version +DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 #Inforom configuration checksum +DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 #Reads the infoROM from the flash and verifies the checksums +DCGM_FI_DEV_VBIOS_VERSION = 85 #VBIOS version of the device +DCGM_FI_DEV_BAR1_TOTAL = 90 #Total BAR1 of the GPU +DCGM_FI_SYNC_BOOST = 91 #Deprecated - Sync boost settings on the node +DCGM_FI_DEV_BAR1_USED = 92 #Used BAR1 of the GPU in MB +DCGM_FI_DEV_BAR1_FREE = 93 #Free BAR1 of the GPU in MB +#Clocks and power +DCGM_FI_DEV_SM_CLOCK = 100 #SM clock for the device +DCGM_FI_DEV_MEM_CLOCK = 101 #Memory clock for the device +DCGM_FI_DEV_VIDEO_CLOCK = 102 #Video encoder/decoder clock for the device +DCGM_FI_DEV_APP_SM_CLOCK = 110 #SM Application clocks +DCGM_FI_DEV_APP_MEM_CLOCK = 111 #Memory Application clocks +DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 #Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) +DCGM_FI_DEV_MAX_SM_CLOCK = 113 #Maximum supported SM clock for the device +DCGM_FI_DEV_MAX_MEM_CLOCK = 114 #Maximum supported Memory clock for the device +DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 #Maximum supported Video encoder/decoder clock for the device +DCGM_FI_DEV_AUTOBOOST = 120 #Auto-boost for the device (1 = enabled. 
0 = disabled) +DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 #Supported clocks for the device +DCGM_FI_DEV_MEMORY_TEMP = 140 #Memory temperature for the device +DCGM_FI_DEV_GPU_TEMP = 150 #Current temperature readings for the device, in degrees C +DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 #Maximum operating temperature for the memory of this GPU +DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 #Maximum operating temperature for this GPU +DCGM_FI_DEV_POWER_USAGE = 155 #Power usage for the device in Watts +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 #Total energy consumption for the GPU in mJ since the driver was last reloaded +DCGM_FI_DEV_SLOWDOWN_TEMP = 158 #Slowdown temperature for the device +DCGM_FI_DEV_SHUTDOWN_TEMP = 159 #Shutdown temperature for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 #Current Power limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 #Minimum power management limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 #Maximum power management limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 #Default power management limit for the device +DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 #Effective power limit that the driver enforces after taking into account all limiters +DCGM_FI_DEV_PSTATE = 190 #Performance state (P-State) 0-15. 0=highest +DCGM_FI_DEV_FAN_SPEED = 191 #Fan speed for the device in percent 0-100 +#Device utilization and telemetry +DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 #Deprecated - PCIe Tx utilization information +DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 #Deprecated - PCIe Rx utilization information +DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 #PCIe replay counter +DCGM_FI_DEV_GPU_UTIL = 203 #GPU Utilization +DCGM_FI_DEV_MEM_COPY_UTIL = 204 #Memory Utilization +DCGM_FI_DEV_ACCOUNTING_DATA = 205 #Process accounting stats +DCGM_FI_DEV_ENC_UTIL = 206 #Encoder utilization +DCGM_FI_DEV_DEC_UTIL = 207 #Decoder utilization +# Fields 210, 211, 220, and 221 are internal-only. see dcgm_fields_internal.py +DCGM_FI_DEV_XID_ERRORS = 230 #XID errors. 
The value is the specific XID error +DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 #PCIe Max Link Generation +DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 #PCIe Max Link Width +DCGM_FI_DEV_PCIE_LINK_GEN = 237 #PCIe Current Link Generation +DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 #PCIe Current Link Width +#Violation counters +DCGM_FI_DEV_POWER_VIOLATION = 240 #Power Violation time in usec +DCGM_FI_DEV_THERMAL_VIOLATION = 241 #Thermal Violation time in usec +DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 #Sync Boost Violation time in usec +DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 #Board Limit Violation time in usec. +DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 #Low Utilization Violation time in usec. +DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 #Reliability Violation time in usec. +DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 #App Clocks Violation time in usec. +DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 #Base Clocks Violation time in usec. +#Framebuffer usage +DCGM_FI_DEV_FB_TOTAL = 250 #Total framebuffer memory in MB +DCGM_FI_DEV_FB_FREE = 251 #Total framebuffer free in MB +DCGM_FI_DEV_FB_USED = 252 #Total framebuffer used in MB +DCGM_FI_DEV_FB_RESERVED = 253 #Total framebuffer reserved in MB +#Device ECC Counters +DCGM_FI_DEV_ECC_CURRENT = 300 #Current ECC mode for the device +DCGM_FI_DEV_ECC_PENDING = 301 #Pending ECC mode for the device +DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 #Total single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 #Total double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 #Total single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 #Total double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 #L1 cache single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 #L1 cache double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 #L2 cache single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 #L2 cache double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 #Device memory
single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 #Device memory double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 #Register file single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 #Register file double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 #Texture memory single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 #Texture memory double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 #L1 cache single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 #L1 cache double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 #L2 cache single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 #L2 cache double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 #Device memory single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 #Device memory double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 #Register File single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 #Register File double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 #Texture memory single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 #Texture memory double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_RETIRED_SBE = 390 #Number of retired pages because of single bit errors +DCGM_FI_DEV_RETIRED_DBE = 391 #Number of retired pages because of double bit errors +DCGM_FI_DEV_RETIRED_PENDING = 392 #Number of pages pending retirement +#Row remapper fields (Ampere and newer) +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 #Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 #Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 #Whether remapping of rows has failed +DCGM_FI_DEV_ROW_REMAP_PENDING = 396 #Whether remapping of rows is pending + +#Device 
NvLink Bandwidth and Error Counters +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 #NV Link flow control CRC Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 #NV Link flow control CRC Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 #NV Link flow control CRC Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 #NV Link flow control CRC Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 #NV Link flow control CRC Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 #NV Link flow control CRC Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 #NV Link flow control CRC Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 #NV Link data CRC Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 #NV Link data CRC Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 #NV Link data CRC Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 #NV Link data CRC Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 #NV Link data CRC Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 #NV Link data CRC Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 #NV Link data CRC Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 #NV Link Replay Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 #NV Link Replay Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 #NV Link Replay Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 #NV Link Replay Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 #NV Link Replay Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 #NV Link Replay Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL =
429 #NV Link Replay Error Counter total for all Lanes + +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 #NV Link Recovery Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 #NV Link Recovery Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 #NV Link Recovery Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 #NV Link Recovery Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 #NV Link Recovery Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 #NV Link Recovery Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 #NV Link Recovery Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 #NV Link Bandwidth Counter for Lane 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 #NV Link Bandwidth Counter for Lane 1 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 #NV Link Bandwidth Counter for Lane 2 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 #NV Link Bandwidth Counter for Lane 3 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 #NV Link Bandwidth Counter for Lane 4 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 #NV Link Bandwidth Counter for Lane 5 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 #NV Link Bandwidth Counter total for all Lanes +DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 #GPU NVLink error information +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 
+DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 + +#Device Attributes associated with virtualization +DCGM_FI_DEV_VIRTUAL_MODE = 500 #Operating mode of the GPU +DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 #Includes Count and Supported vGPU type information +DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 #Includes Count and List of Creatable vGPU type IDs +DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 #Includes Count and List of vGPU instance IDs +DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 #Utilization values for vGPUs running on the device +DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 #Utilization values for processes running within vGPU VMs using the device +DCGM_FI_DEV_ENC_STATS = 506 #Current encoder statistics for a given device +DCGM_FI_DEV_FBC_STATS = 507 #Statistics of current active frame buffer capture sessions on a given device +DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 #Information about active frame buffer capture sessions on a target device +DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 #Includes Count and currently Supported vGPU types on a device +DCGM_FI_DEV_VGPU_TYPE_INFO = 510 #Includes Static info of vGPU types supported on a device +DCGM_FI_DEV_VGPU_TYPE_NAME = 511 #Includes the name of a vGPU type supported on a device +DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 #Includes the class of a vGPU type supported on a device +DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 #Includes the license info for a vGPU type supported on a device +#Related to vGPU Instance IDs +DCGM_FI_DEV_VGPU_VM_ID = 520 #vGPU VM ID +DCGM_FI_DEV_VGPU_VM_NAME = 521 #vGPU VM name +DCGM_FI_DEV_VGPU_TYPE = 522 #vGPU type of the vGPU instance +DCGM_FI_DEV_VGPU_UUID = 523 #UUID of the vGPU instance +DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 #Driver version of the vGPU instance +DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 #Memory usage of the vGPU instance 
+DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 #License status of the vGPU +DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 #Frame rate limit of the vGPU instance +DCGM_FI_DEV_VGPU_ENC_STATS = 528 #Current encoder statistics of the vGPU instance +DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 #Information about all active encoder sessions on the vGPU instance +DCGM_FI_DEV_VGPU_FBC_STATS = 530 #Statistics of current active frame buffer capture sessions on the vGPU instance +DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 #Information about active frame buffer capture sessions on the vGPU instance +DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 #License state information of the vGPU instance +DCGM_FI_DEV_VGPU_PCI_ID = 533 #PCI Id of the vGPU instance +DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 #GPU Instance Id of the vGPU instance +#Internal fields reserve the range 600..699 +#below fields related to NVSwitch +DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 #Starting field ID of the NVSwitch instance +DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 +DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 +DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 +DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 +DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 +DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785 +DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 
+DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 - -# -# Profiling Fields -# -# Ratio of time the graphics engine is active. The graphics engine is active if -# a graphics/compute context is bound and the graphics pipe or compute pipe is -# busy. -DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 - -# The ratio of cycles an SM has at least 1 warp assigned -DCGM_FI_PROF_SM_ACTIVE = 1002 -# (computed from the number of cycles and elapsed cycles) - -# The ratio of number of warps resident on an SM. -DCGM_FI_PROF_SM_OCCUPANCY = 1003 -# (number of resident as a ratio of the theoretical -# maximum number of warps per elapsed cycle) - -# The ratio of cycles the tensor (HMMA) pipe is active -DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 -# (off the peak sustained elapsed cycles) - -# The ratio of cycles the device memory interface is active sending or -# receiving data. -DCGM_FI_PROF_DRAM_ACTIVE = 1005 -# Ratio of cycles the fp64 pipe is active. -DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 -# Ratio of cycles the fp32 pipe is active. -DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 -# Ratio of cycles the fp16 pipe is active. This does not include HMMA. 
-DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 -# The number of bytes of active PCIe tx (transmit) data including both header -# and payload. -DCGM_FI_PROF_PCIE_TX_BYTES = 1009 -# The number of bytes of active PCIe rx (read) data including both header and -# payload. -DCGM_FI_PROF_PCIE_RX_BYTES = 1010 -# The number of bytes of active NvLink tx (transmit) data including both header -# and payload. -DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 -# The number of bytes of active NvLink rx (receive) data including both header -# and payload. -DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 - -# greater than maximum fields above. This value can increase in the future -DCGM_FI_MAX_FIELDS = 1013 - - -class struct_c_dcgm_field_meta_t(Structure): +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858 +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 +DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 +DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 + +DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899 #Last field ID of the NVSwitch instance +''' +Profiling Fields +''' +DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 #Ratio of time the graphics engine is active. The graphics engine is +#active if a graphics/compute context is bound and the graphics pipe or +#compute pipe is busy. + +DCGM_FI_PROF_SM_ACTIVE = 1002 #The ratio of cycles an SM has at least 1 warp assigned +#(computed from the number of cycles and elapsed cycles) + +DCGM_FI_PROF_SM_OCCUPANCY = 1003 #The ratio of number of warps resident on an SM. +#(number of resident as a ratio of the theoretical +#maximum number of warps per elapsed cycle) + +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 #The ratio of cycles any tensor pipe is active +#(off the peak sustained elapsed cycles) + +DCGM_FI_PROF_DRAM_ACTIVE = 1005 #The ratio of cycles the device memory interface is active sending or receiving data. +DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 #Ratio of cycles the fp64 pipe is active. 
+DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 #Ratio of cycles the fp32 pipe is active. +DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 #Ratio of cycles the fp16 pipe is active. This does not include HMMA. +DCGM_FI_PROF_PCIE_TX_BYTES = 1009 #The number of bytes of active PCIe tx (transmit) data including both header and payload. +DCGM_FI_PROF_PCIE_RX_BYTES = 1010 #The number of bytes of active PCIe rx (read) data including both header and payload. +DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 #The number of bytes of active NvLink tx (transmit) data including both header and payload. +DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 #The number of bytes of active NvLink rx (receive) data including both header and payload. +DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 #The ratio of cycles the IMMA tensor pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 #The ratio of cycles the HMMA tensor pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 #The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 #Ratio of cycles the integer pipe is active. + +#Ratio of cycles each of the NVDEC engines are active. +DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 +DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 +DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 +DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 +DCGM_FI_PROF_NVDEC4_ACTIVE = 1021 +DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 +DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 +DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 + +#Ratio of cycles each of the NVJPG engines are active. +DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 +DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 +DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 +DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 +DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 +DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 +DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 +DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 + +#Ratio of cycles each of the NVOFA engines are active. 
+DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 +''' +The per-link number of bytes of active NvLink TX (transmit) or RX (receive) data including both header and payload. +For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX +To get the bandwidth for a link, add the RX and TX values together like +total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES +''' +DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 +DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 +DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 +DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 +DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 +DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 +DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 +DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 +DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 +DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 +DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 +DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 +DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 +DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 +DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 +DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 +DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056 +DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 +DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 +DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 +DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 +DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 +DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 +DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 +DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 +DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 +DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 +DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 +DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 +DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 +DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 +DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 +DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 +DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 +DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 +DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 + +DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST = DCGM_FI_PROF_NVLINK_L0_TX_BYTES +DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST = DCGM_FI_PROF_NVLINK_L17_RX_BYTES + +#greater than maximum fields 
above. This value can increase in the future +DCGM_FI_MAX_FIELDS = 1076 + + +class struct_c_dcgm_field_meta_t(dcgm_structs._DcgmStructure): # struct_c_dcgm_field_meta_t structure pass # opaque handle @@ -723,7 +526,7 @@ class struct_c_dcgm_field_meta_t(Structure): dcgm_field_meta_t = POINTER(struct_c_dcgm_field_meta_t) -class _PrintableStructure(Structure): +class _PrintableStructure(dcgm_structs._DcgmStructure): """ Abstract class that produces nicer __str__ output than ctypes.Structure. e.g. instead of: @@ -736,13 +539,11 @@ class _PrintableStructure(Structure): e.g. class that has _field_ 'hex_value', c_uint could be formatted with _fmt_ = {"hex_value" : "%08X"} to produce nicer output. - Default formatting string for all fields can be set with key "" - like: + Default fomratting string for all fields can be set with key "" like: _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz. If not set it's assumed to be just "%s" - Exact format of returned str from this class is subject to change in the - future. + Exact format of returned str from this class is subject to change in the future. """ _fmt_ = {} @@ -770,11 +571,8 @@ def __str__(self): # Structure to hold formatting information for values class c_dcgm_field_output_format_t(_PrintableStructure): - _fields_ = [ - ("shortName", c_char * SHORTNAME_LENGTH), - ("unit", c_char * UNIT_LENGTH), - ("width", c_short), - ] + _fields_ = [('shortName', c_char * SHORTNAME_LENGTH), + ('unit', c_char * UNIT_LENGTH), ('width', c_short)] TAG_LENGTH = 48 @@ -793,24 +591,14 @@ class c_dcgm_field_meta_t(_PrintableStructure): ] -# Class for maintaining properties for each sampling type like Power, -# Utilization and Clock. +# Class for maintaining properties for each sampling type like Power, Utilization and Clock. class pySamplingProperties: - """ - The instance of this class is used to hold information related to each - sampling event type. 
- """ + ''' + The instance of this class is used to hold information related to each sampling event type. + ''' - def __init__( - self, - name, - sampling_type, - sample_val_type, - timeIntervalIdle, - timeIntervalBoost, - min_value, - max_value, - ): + def __init__(self, name, sampling_type, sample_val_type, timeIntervalIdle, + timeIntervalBoost, min_value, max_value): self.name = name self.sampling_type = sampling_type self.timeIntervalIdle = timeIntervalIdle @@ -827,19 +615,12 @@ def DcgmFieldsInit(): def DcgmFieldGetById(fieldId): - """ + ''' Get metadata for a field, given its fieldId - Parameters - ---------- - fieldId : - Field ID to get metadata for. - - Returns - ------- - c_dcgm_field_meta_t or None - Returns c_dcgm_field_meta_t on success or None on error. - """ + :param fieldId: Field ID to get metadata for + :return: c_dcgm_field_meta_t struct on success. None on error. + ''' DcgmFieldsInit() fn = dcgmFP("DcgmFieldGetById") @@ -854,25 +635,18 @@ def DcgmFieldGetById(fieldId): def DcgmFieldGetByTag(tag): - """ + ''' Get metadata for a field, given its string tag - Parameters - --------- - tag : - Field tag to get metadata for. Example 'brand'. - - Returns - ------- - c_dcgm_field_meta_t or None - Returns c_dcgm_field_meta_t on success or None on error. - """ + :param tag: Field tag to get metadata for. Example 'brand' + :return: c_dcgm_field_meta_t struct on success. None on error. + ''' DcgmFieldsInit() c_dcgm_field_meta_t() fn = dcgmFP("DcgmFieldGetByTag") fn.restype = POINTER(c_dcgm_field_meta_t) - c_field_meta_ptr = fn(c_char_p(tag)) + c_field_meta_ptr = fn(c_char_p(tag.encode('utf-8'))) if not c_field_meta_ptr: return None diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py new file mode 100644 index 000000000..7a29edc9e --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py @@ -0,0 +1,671 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from model_analyzer.monitor.dcgm.dcgm_fields import * +from model_analyzer.monitor.dcgm.dcgm_fields_internal import * +import sys + + +class CollectdMetadata: + ''' + Holds the name, collectd type string and "used" flag for one dcgm field. + + Constructor + @params: + name: string identifying the dcgm field (the field name, as opposed to + the numeric field id). + kind: collectd type string. + used: a bool indicating whether or not the field is to be defined in + a collectd types.db file when GenerateCollectdTypesDB() is called + (generally if this file is run as a python3 mainline). Defaults to + False. We enumerate + all the dcgm fields, but only generate types.db records for those + supported at the current time. Others may or may not have correct + collectd type definitions (generally one might be a gauge where it + is more correctly a counter). The idea is that an intrepid user may + enable generation of additional dcgm fields that they wish to collect + but are not officially supported yet. + ''' + + def __init__(self, name, kind, used=False): + self.name = name + self.kind = kind + self.used = used + + +# collectd metadata definition table. 
+ +CollectdMetadataDict = { + DCGM_FI_DRIVER_VERSION: + None, + DCGM_FI_NVML_VERSION: + None, + DCGM_FI_PROCESS_NAME: + None, + DCGM_FI_CUDA_DRIVER_VERSION: + CollectdMetadata("cuda_driver_version", "value:GAUGE:U:U"), + DCGM_FI_DEV_COUNT: + CollectdMetadata("device_count", "value:GAUGE:U:U"), + DCGM_FI_DEV_NAME: + None, + DCGM_FI_DEV_BRAND: + None, + DCGM_FI_DEV_NVML_INDEX: + CollectdMetadata("nvml_index", "value:GAUGE:U:U"), + DCGM_FI_DEV_SERIAL: + None, + DCGM_FI_DEV_CPU_AFFINITY_0: + CollectdMetadata("cpu_affinity_0", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_1: + CollectdMetadata("cpu_affinity_1", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_2: + CollectdMetadata("cpu_affinity_2", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_3: + CollectdMetadata("cpu_affinity_3", "value:GAUGE:U:U"), + DCGM_FI_DEV_UUID: + None, + DCGM_FI_DEV_MINOR_NUMBER: + CollectdMetadata("minor_number", "value:GAUGE:U:U"), + DCGM_FI_DEV_OEM_INFOROM_VER: + None, + DCGM_FI_DEV_ECC_INFOROM_VER: + None, + DCGM_FI_DEV_POWER_INFOROM_VER: + None, + DCGM_FI_DEV_INFOROM_IMAGE_VER: + None, + DCGM_FI_DEV_INFOROM_CONFIG_CHECK: + CollectdMetadata("inforom_config_checksum", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCI_BUSID: + None, + DCGM_FI_DEV_PCI_COMBINED_ID: + CollectdMetadata("pci_combined_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCI_SUBSYS_ID: + CollectdMetadata("pci_subsys_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_TX_THROUGHPUT: + CollectdMetadata("pcie_tx_throughput", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_RX_THROUGHPUT: + CollectdMetadata("pcie_rx_throughput", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_REPLAY_COUNTER: + CollectdMetadata("pcie_replay_counter", "value:COUNTER:0:U", True), + DCGM_FI_DEV_SM_CLOCK: + CollectdMetadata("sm_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MEM_CLOCK: + CollectdMetadata("memory_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_VIDEO_CLOCK: + CollectdMetadata("video_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_APP_SM_CLOCK: + 
CollectdMetadata("sm_app_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_APP_MEM_CLOCK: + CollectdMetadata("mem_app_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS: + CollectdMetadata("current_clock_throttle_reasons", "value:GAUGE:U:U"), + DCGM_FI_DEV_MAX_SM_CLOCK: + CollectdMetadata("sm_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MAX_MEM_CLOCK: + CollectdMetadata("memory_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MAX_VIDEO_CLOCK: + CollectdMetadata("video_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_AUTOBOOST: + CollectdMetadata("autoboost", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_TEMP: + CollectdMetadata("gpu_temp", "value:GAUGE:U:U", True), + DCGM_FI_DEV_MEM_MAX_OP_TEMP: + CollectdMetadata("gpu_mem_max_op_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_MAX_OP_TEMP: + CollectdMetadata("gpu_max_op_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_SLOWDOWN_TEMP: + CollectdMetadata("slowdown_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_SHUTDOWN_TEMP: + CollectdMetadata("shutdown_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT: + CollectdMetadata("power_management_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN: + CollectdMetadata("power_management_limit_min", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX: + CollectdMetadata("power_management_limit_max", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF: + CollectdMetadata("power_management_limit_default", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_USAGE: + CollectdMetadata("power_usage", "value:GAUGE:0:U", True), + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION: + CollectdMetadata("total_energy_consumption", "value:GAUGE:0:U", + True), # left as guage since zeroed at driver reload + DCGM_FI_DEV_ENFORCED_POWER_LIMIT: + CollectdMetadata("enforced_power_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_PSTATE: + CollectdMetadata("pstate", "value:GAUGE:U:U"), + DCGM_FI_DEV_FAN_SPEED: + CollectdMetadata("fan_speed", "value:GAUGE:U:U"), + DCGM_FI_DEV_COMPUTE_MODE: + 
CollectdMetadata("compute_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_PERSISTENCE_MODE: + CollectdMetadata("persistance_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_MIG_MODE: + CollectdMetadata("mig_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR: + None, + DCGM_FI_DEV_MIG_MAX_SLICES: + CollectdMetadata("mig_max_slices", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_CURRENT: + CollectdMetadata("ecc", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_PENDING: + CollectdMetadata("ecc_pending", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_TOTAL: + CollectdMetadata("ecc_sbe_volatile_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_DBE_VOL_TOTAL: + CollectdMetadata("ecc_dbe_volatile_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_SBE_AGG_TOTAL: + CollectdMetadata("ecc_sbe_aggregate_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_DBE_AGG_TOTAL: + CollectdMetadata("ecc_dbe_aggregate_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_SBE_VOL_L1: + CollectdMetadata("ecc_sbe_volatile_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_L1: + CollectdMetadata("ecc_dbe_volatile_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_L2: + CollectdMetadata("ecc_sbe_volatile_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_L2: + CollectdMetadata("ecc_dbe_volatile_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_DEV: + CollectdMetadata("ecc_sbe_volatile_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_DEV: + CollectdMetadata("ecc_dbe_volatile_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_REG: + CollectdMetadata("ecc_sbe_volatile_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_REG: + CollectdMetadata("ecc_dbe_volatile_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_TEX: + CollectdMetadata("ecc_sbe_volatile_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_TEX: + CollectdMetadata("ecc_dbe_volatile_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_L1: + CollectdMetadata("ecc_sbe_aggregate_l1", "value:GAUGE:U:U"), + 
DCGM_FI_DEV_ECC_DBE_AGG_L1: + CollectdMetadata("ecc_dbe_aggregate_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_L2: + CollectdMetadata("ecc_sbe_aggregate_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_L2: + CollectdMetadata("ecc_dbe_aggregate_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_DEV: + CollectdMetadata("ecc_sbe_aggregate_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_DEV: + CollectdMetadata("ecc_dbe_aggregate_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_REG: + CollectdMetadata("ecc_sbe_aggregate_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_REG: + CollectdMetadata("ecc_dbe_aggregate_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_TEX: + CollectdMetadata("ecc_sbe_aggregate_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_TEX: + CollectdMetadata("ecc_dbe_aggregate_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_UTIL: + CollectdMetadata("gpu_utilization", "value:GAUGE:0.0:1.0", True), + DCGM_FI_DEV_MEM_COPY_UTIL: + CollectdMetadata("mem_copy_utilization", "value:GAUGE:0:100", True), + DCGM_FI_DEV_ENC_UTIL: + CollectdMetadata("enc_utilization", "value:GAUGE:0:100"), + DCGM_FI_DEV_DEC_UTIL: + CollectdMetadata("dec_utilization", "value:GAUGE:0:100"), + DCGM_FI_DEV_VBIOS_VERSION: + None, + DCGM_FI_DEV_BAR1_TOTAL: + CollectdMetadata("bar1_total", "value:GAUGE:U:U"), + DCGM_FI_DEV_BAR1_USED: + CollectdMetadata("bar1_used", "value:GAUGE:U:U"), + DCGM_FI_DEV_BAR1_FREE: + CollectdMetadata("bar1_free", "value:GAUGE:U:U"), + DCGM_FI_DEV_FB_TOTAL: + CollectdMetadata("fb_total", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_FREE: + CollectdMetadata("fb_free", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_USED: + CollectdMetadata("fb_used", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_RESERVED: + CollectdMetadata("fb_resv", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_VIRTUAL_MODE: + CollectdMetadata("virtualization_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_INSTANCE_IDS: + None, + DCGM_FI_DEV_VGPU_UTILIZATIONS: + 
None, + DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION: + None, + DCGM_FI_DEV_VGPU_VM_ID: + None, + DCGM_FI_DEV_VGPU_VM_NAME: + None, + DCGM_FI_DEV_VGPU_TYPE: + CollectdMetadata("vgpu_instance_type", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_UUID: + None, + DCGM_FI_DEV_VGPU_DRIVER_VERSION: + None, + DCGM_FI_DEV_VGPU_MEMORY_USAGE: + CollectdMetadata("vgpu_instance_memory_usage", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE: + CollectdMetadata("vgpu_instance_license_state", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_LICENSE_STATUS: + CollectdMetadata("vgpu_instance_license_status", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT: + CollectdMetadata("vgpu_instance_frame_rate_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_PCI_ID: + CollectdMetadata("vgpu_instance_pci_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_ENC_STATS: + None, + DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO: + None, + DCGM_FI_DEV_VGPU_FBC_STATS: + None, + DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO: + None, + DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID: + None, + DCGM_FI_DEV_SUPPORTED_TYPE_INFO: + None, + DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS: + None, + DCGM_FI_DEV_VGPU_TYPE_INFO: + None, + DCGM_FI_DEV_VGPU_TYPE_NAME: + None, + DCGM_FI_DEV_VGPU_TYPE_CLASS: + None, + DCGM_FI_DEV_VGPU_TYPE_LICENSE: + None, + DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS: + None, + DCGM_FI_DEV_ENC_STATS: + None, + DCGM_FI_DEV_FBC_STATS: + None, + DCGM_FI_DEV_FBC_SESSIONS_INFO: + None, + DCGM_FI_DEV_ACCOUNTING_DATA: + None, + DCGM_FI_DEV_RETIRED_SBE: + CollectdMetadata("retired_pages_sbe", "value:COUNTER:0:U", True), + DCGM_FI_DEV_RETIRED_DBE: + CollectdMetadata("retired_pages_dbe", "value:COUNTER:0:U", True), + DCGM_FI_DEV_GRAPHICS_PIDS: + None, + DCGM_FI_DEV_COMPUTE_PIDS: + None, + DCGM_FI_DEV_SUPPORTED_CLOCKS: + None, + DCGM_FI_SYNC_BOOST: + None, + DCGM_FI_DEV_RETIRED_PENDING: + CollectdMetadata("retired_pages_pending", "value:GAUGE:0:1", + True), # boolean 1 = yes, 0 = no + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS: + 
CollectdMetadata("uncorrectable_remapped_rows", "value:GAUGE:U:U"), + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS: + CollectdMetadata("correctable_remapped_rows", "value:GAUGE:U:U"), + DCGM_FI_DEV_ROW_REMAP_FAILURE: + CollectdMetadata("row_remap_failure", "value:GAUGE:U:U"), + DCGM_FI_DEV_ROW_REMAP_PENDING: + CollectdMetadata("row_remap_pending", "value:GAUGE:U:U"), + DCGM_FI_DEV_INFOROM_CONFIG_VALID: + CollectdMetadata("inforom_config_valid", "value:GAUGE:U:U"), + DCGM_FI_DEV_XID_ERRORS: + CollectdMetadata("xid_errors", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_MAX_LINK_GEN: + CollectdMetadata("pcie_max_link_gen", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH: + CollectdMetadata("pcie_max_link_width", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_LINK_GEN: + CollectdMetadata("pcie_link_gen", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_LINK_WIDTH: + CollectdMetadata("pcie_link_width", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_VIOLATION: + CollectdMetadata("power_violation", "value:COUNTER:0:U", True), + DCGM_FI_DEV_THERMAL_VIOLATION: + CollectdMetadata("thermal_violation", "value:COUNTER:0:U", True), + DCGM_FI_GPU_TOPOLOGY_PCI: + None, + DCGM_FI_GPU_TOPOLOGY_NVLINK: + None, + DCGM_FI_GPU_TOPOLOGY_AFFINITY: + None, + DCGM_FI_DEV_SYNC_BOOST_VIOLATION: + CollectdMetadata("sync_boost_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION: + CollectdMetadata("board_limit_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_LOW_UTIL_VIOLATION: + CollectdMetadata("low_util_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_RELIABILITY_VIOLATION: + CollectdMetadata("reliability_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION: + CollectdMetadata("app_clock_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION: + CollectdMetadata("base_clock_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES: + CollectdMetadata("mem_util_samples", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_UTIL_SAMPLES: + CollectdMetadata("gpu_util_samples", 
"value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0: + CollectdMetadata("nvlink_flit_crc_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1: + CollectdMetadata("nvlink_flit_crc_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2: + CollectdMetadata("nvlink_flit_crc_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3: + CollectdMetadata("nvlink_flit_crc_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4: + CollectdMetadata("nvlink_flit_crc_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5: + CollectdMetadata("nvlink_flit_crc_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_flit_crc_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0: + CollectdMetadata("nvlink_data_crc_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1: + CollectdMetadata("nvlink_data_crc_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2: + CollectdMetadata("nvlink_data_crc_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3: + CollectdMetadata("nvlink_data_crc_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4: + CollectdMetadata("nvlink_data_crc_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5: + CollectdMetadata("nvlink_data_crc_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_data_crc_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0: + CollectdMetadata("nvlink_replay_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1: + CollectdMetadata("nvlink_replay_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2: + 
CollectdMetadata("nvlink_replay_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3: + CollectdMetadata("nvlink_replay_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4: + CollectdMetadata("nvlink_replay_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5: + CollectdMetadata("nvlink_replay_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_replay_error_count_total", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0: + CollectdMetadata("nvlink_recovery_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1: + CollectdMetadata("nvlink_recovery_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2: + CollectdMetadata("nvlink_recovery_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3: + CollectdMetadata("nvlink_recovery_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4: + CollectdMetadata("nvlink_recovery_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5: + CollectdMetadata("nvlink_recovery_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_recovery_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L0: + CollectdMetadata("nvlink_bandwidth_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L1: + CollectdMetadata("nvlink_bandwidth_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L2: + CollectdMetadata("nvlink_bandwidth_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L3: + CollectdMetadata("nvlink_bandwidth_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L4: + CollectdMetadata("nvlink_bandwidth_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L5: + CollectdMetadata("nvlink_bandwidth_l5", "value:GAUGE:U:U"), + 
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL: + CollectdMetadata("nvlink_bandwidth_total", "value:GAUGE:0:U", True), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6: + CollectdMetadata("nvlink_flit_crc_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7: + CollectdMetadata("nvlink_flit_crc_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8: + CollectdMetadata("nvlink_flit_crc_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9: + CollectdMetadata("nvlink_flit_crc_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10: + CollectdMetadata("nvlink_flit_crc_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11: + CollectdMetadata("nvlink_flit_crc_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6: + CollectdMetadata("nvlink_data_crc_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7: + CollectdMetadata("nvlink_data_crc_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8: + CollectdMetadata("nvlink_data_crc_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9: + CollectdMetadata("nvlink_data_crc_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10: + CollectdMetadata("nvlink_data_crc_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11: + CollectdMetadata("nvlink_data_crc_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6: + CollectdMetadata("nvlink_replay_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7: + CollectdMetadata("nvlink_replay_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8: + CollectdMetadata("nvlink_replay_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9: + CollectdMetadata("nvlink_replay_error_count_l9", 
"value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10: + CollectdMetadata("nvlink_replay_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11: + CollectdMetadata("nvlink_replay_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6: + CollectdMetadata("nvlink_recovery_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7: + CollectdMetadata("nvlink_recovery_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8: + CollectdMetadata("nvlink_recovery_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9: + CollectdMetadata("nvlink_recovery_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10: + CollectdMetadata("nvlink_recovery_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11: + CollectdMetadata("nvlink_recovery_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L6: + CollectdMetadata("nvlink_bandwidth_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L7: + CollectdMetadata("nvlink_bandwidth_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L8: + CollectdMetadata("nvlink_bandwidth_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L9: + CollectdMetadata("nvlink_bandwidth_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L10: + CollectdMetadata("nvlink_bandwidth_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L11: + CollectdMetadata("nvlink_bandwidth_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_MEMORY_TEMP: + CollectdMetadata("memory_temp", "value:GAUGE:U:U", True), + DCGM_FI_DEV_GPU_NVLINK_ERRORS: + CollectdMetadata("gpu_nvlink_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX: + CollectdMetadata("nvswitch_link_bandwidth_tx", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX: + CollectdMetadata("nvswitch_link_bandwidth_rx", "value:GAUGE:U:U"), + 
DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS: + CollectdMetadata("nvswitch_link_fatal_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS: + CollectdMetadata("nvswitch_link_non_fatal_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS: + CollectdMetadata("nvswitch_link_recovery_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS: + CollectdMetadata("nvswitch_link_flit_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS: + CollectdMetadata("nvswitch_link_crc_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS: + CollectdMetadata("nvswitch_link_ecc_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0: + CollectdMetadata("nvswitch_link_latency_low_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1: + CollectdMetadata("nvswitch_link_latency_low_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2: + CollectdMetadata("nvswitch_link_latency_low_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3: + CollectdMetadata("nvswitch_link_latency_low_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0: + CollectdMetadata("nvswitch_link_latency_medium_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1: + CollectdMetadata("nvswitch_link_latency_medium_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2: + CollectdMetadata("nvswitch_link_latency_medium_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3: + CollectdMetadata("nvswitch_link_latency_medium_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0: + CollectdMetadata("nvswitch_link_latency_high_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1: + CollectdMetadata("nvswitch_link_latency_high_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2: + CollectdMetadata("nvswitch_link_latency_high_vc2", "value:GAUGE:U:U"), + 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3: + CollectdMetadata("nvswitch_link_latency_high_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0: + CollectdMetadata("nvswitch_link_latency_panic_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1: + CollectdMetadata("nvswitch_link_latency_panic_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2: + CollectdMetadata("nvswitch_link_latency_panic_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3: + CollectdMetadata("nvswitch_link_latency_panic_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0: + CollectdMetadata("nvswitch_link_latency_count_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1: + CollectdMetadata("nvswitch_link_latency_count_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2: + CollectdMetadata("nvswitch_link_latency_count_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3: + CollectdMetadata("nvswitch_link_latency_count_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0: + CollectdMetadata("nvswitch_link_crc_errors_lane0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1: + CollectdMetadata("nvswitch_link_crc_errors_lane1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2: + CollectdMetadata("nvswitch_link_crc_errors_lane2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3: + CollectdMetadata("nvswitch_link_crc_errors_lane3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0: + CollectdMetadata("nvswitch_link_ecc_errors_lane0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1: + CollectdMetadata("nvswitch_link_ecc_errors_lane1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2: + CollectdMetadata("nvswitch_link_ecc_errors_lane2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3: + 
CollectdMetadata("nvswitch_link_ecc_errors_lane3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS: + CollectdMetadata("nvswitch_fatal_error", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS: + CollectdMetadata("nvswitch_non_fatal_error", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT: + CollectdMetadata("nvswitch_temperature_current", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN: + CollectdMetadata("nvswitch_temperature_limit_slowdown", + "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN: + CollectdMetadata("nvswitch_temperature_limit_shutdown", + "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX: + CollectdMetadata("nvswitch_throughput_tx", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX: + CollectdMetadata("nvswitch_throughput_rx", "value:GAUGE:U:U"), + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY: + CollectdMetadata("cuda_compute_capability", "value:GAUGE:U:U"), + DCGM_FI_PROF_GR_ENGINE_ACTIVE: + CollectdMetadata("gr_engine_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_SM_ACTIVE: + CollectdMetadata("sm_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_SM_OCCUPANCY: + CollectdMetadata("sm_occupancy", "value:GAUGE:0:U", True), + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE: + CollectdMetadata("tensor_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_DRAM_ACTIVE: + CollectdMetadata("dram_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_PIPE_FP64_ACTIVE: + CollectdMetadata("fp64_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_FP32_ACTIVE: + CollectdMetadata("fp32_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_FP16_ACTIVE: + CollectdMetadata("fp16_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PCIE_TX_BYTES: + CollectdMetadata("pcie_tx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_PCIE_RX_BYTES: + CollectdMetadata("pcie_rx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_NVLINK_TX_BYTES: + CollectdMetadata("nvlink_tx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_NVLINK_RX_BYTES: + 
CollectdMetadata("nvlink_rx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE: + CollectdMetadata("tensor_imma_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE: + CollectdMetadata("tensor_hmma_active", "value:GAUGE:0.0:1.0", True), +} + +__fieldDict = None + + +def GenerateCollectdTypesDB(): + length = max( + map(lambda x: len(x.name) if x else 0, CollectdMetadataDict.values())) + + fmt = "{0:<" + str(length) + "}" + fail = False + + for item in filter(None, CollectdMetadataDict.values()): + item_list = item.kind.split(':') + + # Some rudimentary syntax checking. A kind with no ':' has no type field at all, so it is reported below instead of raising IndexError. + + if len(item_list) != 4: + sys.stderr.write( + 'Item ' + item.name + + ' has wrong number of collectd type fields - four required.\n') + fail = True + + if len(item_list) < 2 or item_list[1] not in ['GAUGE', 'COUNTER', 'DERIVE', 'ABSOLUTE']: + sys.stderr.write( + 'Item ' + item.name + + ' should be one of GAUGE, COUNTER, DERIVE, ABSOLUTE.\n') + fail = True + + # We check this so we can enumerate all dcgm fields for possible + # inclusion, even if some are not (yet) formally supported. + + if item.used: + print(fmt.format(item.name), item.kind) + + if fail: + sys.exit("Failed on db.types table syntax errors.\n") + + +def GetFieldByName(name): + global __fieldDict + + if name.isnumeric(): + return int(name) + + if __fieldDict is None: + __fieldDict = {} + + for key in CollectdMetadataDict: + item = CollectdMetadataDict[key] + + if item is not None: + __fieldDict[item.name] = key + + if name not in __fieldDict: + return -1 + + return __fieldDict[name] + + +if __name__ == '__main__': + GenerateCollectdTypesDB() diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_internal.py b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py new file mode 100644 index 000000000..9502c959a --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +## +# Python bindings for the internal API of DCGM library (dcgm_fields_internal.hpp) +## + +from ctypes import * +from ctypes.util import find_library +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + +# Provides access to functions +dcgmFP = dcgm_structs._dcgmGetFunctionPointer + +#internal-only fields +DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 #Memory utilization samples +DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 #SM utilization samples +DCGM_FI_DEV_GRAPHICS_PIDS = 220 #Graphics processes running on the GPU. +DCGM_FI_DEV_COMPUTE_PIDS = 221 #Compute processes running on the GPU. diff --git a/model_analyzer/monitor/dcgm/dcgm_fluentd.py b/model_analyzer/monitor/dcgm/dcgm_fluentd.py new file mode 100644 index 000000000..24a345100 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fluentd.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from model_analyzer.monitor.dcgm.common.dcgm_client_main import main +from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader +from socket import socket, AF_INET, SOCK_DGRAM + +# Displayed to the user +FLUENTD_NAME = 'Fluentd' +DEFAULT_FLUENTD_PORT = 24225 + +# Fluentd Configuration +# ===================== +# In order to use this client, Fluentd needs to accept json over udp. +# The default port is 24225 + + +class DcgmFluentd(DcgmJsonReader): + ########################################################################### + def __init__(self, publish_hostname, publish_port, **kwargs): + self.m_sock = socket(AF_INET, SOCK_DGRAM) + self.m_dest = (publish_hostname, publish_port) + super(DcgmFluentd, self).__init__(**kwargs) + + ########################################################################### + def SendToFluentd(self, payload): + self.m_sock.sendto(payload, self.m_dest) + + ########################################################################### + def CustomJsonHandler(self, outJson): + self.SendToFluentd(outJson) + + +if __name__ == '__main__': # pragma: no cover + main(DcgmFluentd, FLUENTD_NAME, DEFAULT_FLUENTD_PORT, add_target_host=True) diff --git a/model_analyzer/monitor/dcgm/dcgm_prometheus.py b/model_analyzer/monitor/dcgm/dcgm_prometheus.py new file mode 100644 index 000000000..f6f69a613 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_prometheus.py @@ -0,0 +1,326 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import time +import logging +import os +import argparse +import sys +import signal + +dir_path = os.path.dirname(os.path.realpath(__file__)) +parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir)) +sys.path.insert(0, parent_dir_path) + +from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader +from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli + +if 'DCGM_TESTING_FRAMEWORK' in os.environ: + try: + from prometheus_tester_api import start_http_server, Gauge + except ImportError: + logging.critical( + "prometheus_tester_api missing, reinstall test framework.") + sys.exit(3) +else: + try: + from prometheus_client import start_http_server, Gauge + except ImportError: + # Without start_http_server and Gauge nothing can be published, so report and exit. + logging.critical( + "prometheus_client not installed, please run: \"pip install prometheus_client\"" + ) + sys.exit(3) + +DEFAULT_FIELDS = [ + dcgm_fields.DCGM_FI_DEV_PCI_BUSID, #Needed for plugin_instance + dcgm_fields.DCGM_FI_DEV_POWER_USAGE, + dcgm_fields.DCGM_FI_DEV_GPU_TEMP, + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, + dcgm_fields.DCGM_FI_DEV_GPU_UTIL, + dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, + dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, + dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_FREE, + dcgm_fields.DCGM_FI_DEV_FB_USED, + dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
+ dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, + dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, + dcgm_fields.DCGM_FI_DEV_XID_ERRORS, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, +] + + +class DcgmPrometheus(DcgmReader): + ########################################################################### + def __init__(self): + #Have DCGM update its watches twice as fast as our update interval so we don't get out of phase by our update interval + updateIntervalUsec = int( + (1000000 * g_settings['prometheusPublishInterval']) / 2) + #Add our PID to our field group name so we can have multiple instances running + fieldGroupName = 'dcgm_prometheus_' + str(os.getpid()) + + DcgmReader.__init__(self, + ignoreList=g_settings['ignoreList'], + fieldIds=g_settings['publishFieldIds'], + updateFrequency=updateIntervalUsec, + fieldGroupName=fieldGroupName, + hostname=g_settings['dcgmHostName']) + self.m_existingGauge = {} + + ########################################################################### + ''' + This function is implemented from the base class : DcgmReader. It converts each + field / value from the fvs dictionary to a gauge and publishes the gauge to the + prometheus client server. + + @params: + fvs : The fieldvalue dictionary that contains info about the values of field Ids for each gpuId. + ''' + + def CustomDataHandler(self, fvs): + if not self.m_existingGauge: + self.SetupGauges() + + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + g = self.m_existingGauge[fieldId] + + for gpuId in list(fvs.keys()): + gpuFv = fvs[gpuId] + val = gpuFv[fieldId][-1] + + #Skip blank values. 
Otherwise, we'd have to insert a placeholder blank value based on the fieldId + if val.isBlank: + continue + + gpuUuid = self.m_gpuIdToUUId[gpuId] + gpuBusId = self.m_gpuIdToBusId[gpuId] + gpuUniqueId = gpuUuid if g_settings['sendUuid'] else gpuBusId + + # pylint doesn't find the labels member for Gauge, but it exists. Ignore the warning + g.labels(gpuId, gpuUniqueId).set(val.value) # pylint: disable=no-member + + logging.debug( + 'Sent GPU %d %s %s = %s' % + (gpuId, gpuUniqueId, self.m_fieldIdToInfo[fieldId].tag, + str(val.value))) + + ############################################################################### + ''' + NOTE: even though some fields are monotonically increasing and therefore fit the mold to be + counters, all are published as gauges so that DCGM is the sole authority on the state of the + system, preventing problems around down times, driver reboots, and the unlikely event of + flashing the inforom. + For specific information about which fields monotonically increase, see the API guide or + dcgm_fields.h + ''' + + def SetupGauges(self): + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + uniqueIdName = 'GpuUuid' if g_settings[ + 'sendUuid'] else 'GpuBusID' + + fieldTag = self.m_fieldIdToInfo[fieldId].tag + self.m_existingGauge[fieldId] = Gauge("dcgm_" + fieldTag, + 'DCGM_PROMETHEUS', + ['GpuID', uniqueIdName]) + + ############################################################################### + ''' + Scrape the fieldvalue data and publish. This function calls the process function of + the base class DcgmReader. 
+ ''' + + def Scrape(self, data=None): + return self.Process() + + ############################################################################### + def LogBasicInformation(self): + # Reconnect causes everything to get initialized + self.Reconnect() + + logging.info('Started prometheus client') + + fieldTagList = '' + + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + if fieldTagList == '': + fieldTagList = self.m_fieldIdToInfo[fieldId].tag + else: + fieldTagList = fieldTagList + ", %s" % ( + self.m_fieldIdToInfo[fieldId].tag) + + logging.info("Publishing fields: '%s'" % (fieldTagList)) + + ############################################################################### + def LogError(self, msg): + logging.error(msg) + + ############################################################################### + def LogInfo(self, msg): + logging.info(msg) + + +############################################################################### +def exit_handler(signum, frame): + g_settings['shouldExit'] = True + + +############################################################################### +def main_loop(prometheus_obj, publish_interval): + try: + while True: + prometheus_obj.Scrape() # Scrape() ignores its optional data argument + time.sleep(publish_interval) + + if g_settings['shouldExit']: # set to True by exit_handler + prometheus_obj.LogInfo('Received a signal...shutting down') + break + except KeyboardInterrupt: + print("Caught CTRL-C. Exiting") + + +############################################################################### +def initialize_globals(): + ''' + Create the module-level g_settings dictionary and fill in its default values. + ''' + global g_settings + g_settings = {} + + g_settings['shouldExit'] = False + ''' + List of the ids that are present in g_settings['publishFieldIds'] but ignored for watch. + ''' + g_settings['ignoreList'] = [ + dcgm_fields.DCGM_FI_DEV_PCI_BUSID, + ] + ''' + Those are initialized by the CLI parser.
We only list them here for clarity. + ''' + for key in [ + 'dcgmHostName', + 'prometheusPort', + 'prometheusPublishInterval', + 'publishFieldIds', + ]: + g_settings[key] = None + + +############################################################################### +def parse_command_line(): + parser = cli.create_parser( + name='Prometheus', + field_ids=DEFAULT_FIELDS, + ) + + cli.add_custom_argument(parser, + '--send-uuid', + dest='send_uuid', + default=False, + action='store_true', + help='Send GPU UUID instead of bus id') + + args = cli.run_parser(parser) + field_ids = cli.get_field_ids(args) + numeric_log_level = cli.get_log_level(args) + + # Defaults to localhost, so we need to set it to None + if args.embedded: + g_settings['dcgmHostName'] = None + else: + g_settings['dcgmHostName'] = args.hostname + + g_settings['prometheusPort'] = args.publish_port + + g_settings['prometheusPublishInterval'] = args.interval + + logfile = args.logfile + + g_settings['publishFieldIds'] = field_ids + + g_settings['sendUuid'] = args.send_uuid + + if logfile != None: + logging.basicConfig(level=numeric_log_level, + filename=logfile, + filemode='w+', + format='%(asctime)s %(levelname)s: %(message)s') + else: + logging.basicConfig(level=numeric_log_level, + stream=sys.stdout, + filemode='w+', + format='%(asctime)s %(levelname)s: %(message)s') + + +############################################################################### +def initialize_signal_handlers(): + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + +############################################################################### +def main(): + initialize_globals() + + initialize_signal_handlers() + + parse_command_line() + + prometheus_obj = DcgmPrometheus() + + logging.info("Starting Prometheus server on port " + + str(g_settings['prometheusPort'])) + + #start prometheus client server. 
+ start_http_server(g_settings['prometheusPort']) + + prometheus_obj.LogBasicInformation() + + main_loop(prometheus_obj, g_settings['prometheusPublishInterval']) + + prometheus_obj.Shutdown() + + +if __name__ == '__main__': + main() diff --git a/model_analyzer/monitor/dcgm/dcgm_structs.py b/model_analyzer/monitor/dcgm/dcgm_structs.py index e401c4181..233d15564 100755 --- a/model_analyzer/monitor/dcgm/dcgm_structs.py +++ b/model_analyzer/monitor/dcgm/dcgm_structs.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,49 +11,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +## +# Python bindings for "dcgm_structs.h" +## -import json -import os -import platform -import string +from ctypes import * +from ctypes.util import find_library import sys +import os import threading -from ctypes import ( - CDLL, - POINTER, - Array, - Structure, - Union, - c_bool, - c_byte, - c_char, - c_char_p, - c_double, - c_int, - c_int32, - c_int64, - c_longlong, - c_short, - c_uint, - c_uint16, - c_uint32, - c_uint64, - c_ulong, - c_ushort, - c_void_p, - sizeof, -) - -import distro - -import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue +import string +import json +import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue +import platform +from inspect import isclass DCGM_MAX_STR_LENGTH = 256 DCGM_MAX_NUM_DEVICES = 32 # DCGM 2.0 and newer = 32. 
DCGM 1.8 and older = 16 DCGM_MAX_NUM_SWITCHES = 12 -DCGM_NVLINK_MAX_LINKS_PER_GPU = 12 +DCGM_NVLINK_MAX_LINKS_PER_GPU = 18 DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 = 6 -DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 36 +DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 = 12 +DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 = 36 # Max NvLinks per NvSwitch pre-Hopper +DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 64 +DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK = 4 DCGM_MAX_CLOCKS = 256 DCGM_MAX_NUM_GROUPS = 64 DCGM_MAX_BLOB_LENGTH = 4096 @@ -66,8 +45,7 @@ DCGM_DEVICE_UUID_BUFFER_SIZE = 80 DCGM_MAX_FBC_SESSIONS = 256 -# When more than one value is returned from a query, which order should it be -# returned in? +#When more than one value is returned from a query, which order should it be returned in? DCGM_ORDER_ASCENDING = 1 DCGM_ORDER_DESCENDING = 2 @@ -83,125 +61,72 @@ DCGM_FBC_SESSION_TYPE_VID = 3 # FB capture for a Vid buffer DCGM_FBC_SESSION_TYPE_HWENC = 4 # FB capture for a NVENC HW buffer -# C Type mappings # -# Enums +## C Type mappings ## +## Enums # Return types _dcgmReturn_t = c_uint -# Success -DCGM_ST_OK = 0 -# A bad parameter was passed to a function -DCGM_ST_BADPARAM = -1 -# A generic, unspecified error -DCGM_ST_GENERIC_ERROR = -3 -# An out of memory error occurred -DCGM_ST_MEMORY = -4 -# Setting not configured -DCGM_ST_NOT_CONFIGURED = -5 -# Feature not supported -DCGM_ST_NOT_SUPPORTED = -6 -# DCGM Init error -DCGM_ST_INIT_ERROR = -7 -# When NVML returns error. 
-DCGM_ST_NVML_ERROR = -8 -# Object is in pending state of something else -DCGM_ST_PENDING = -9 -# Object is in undefined state -DCGM_ST_UNINITIALIZED = -10 -# Requested operation timed out -DCGM_ST_TIMEOUT = -11 -# Version mismatch between received and understood API -DCGM_ST_VER_MISMATCH = -12 -# Unknown field id -DCGM_ST_UNKNOWN_FIELD = -13 -# No data is available -DCGM_ST_NO_DATA = -14 +DCGM_ST_OK = 0 # Success +DCGM_ST_BADPARAM = -1 # A bad parameter was passed to a function +DCGM_ST_GENERIC_ERROR = -3 # A generic, unspecified error +DCGM_ST_MEMORY = -4 # An out of memory error occured +DCGM_ST_NOT_CONFIGURED = -5 # Setting not configured +DCGM_ST_NOT_SUPPORTED = -6 # Feature not supported +DCGM_ST_INIT_ERROR = -7 # DCGM Init error +DCGM_ST_NVML_ERROR = -8 # When NVML returns error. +DCGM_ST_PENDING = -9 # Object is in pending state of something else +DCGM_ST_UNINITIALIZED = -10 # Object is in undefined state +DCGM_ST_TIMEOUT = -11 # Requested operation timed out +DCGM_ST_VER_MISMATCH = -12 # Version mismatch between received and understood API +DCGM_ST_UNKNOWN_FIELD = -13 # Unknown field id +DCGM_ST_NO_DATA = -14 # No data is available DCGM_ST_STALE_DATA = -15 -# The given field is not being updated by the cache manager -DCGM_ST_NOT_WATCHED = -16 -# We are not permissioned to perform the desired action -DCGM_ST_NO_PERMISSION = -17 -# GPU is no longer reachable -DCGM_ST_GPU_IS_LOST = -18 -# GPU requires a reset -DCGM_ST_RESET_REQUIRED = -19 -# Unable to find function -DCGM_ST_FUNCTION_NOT_FOUND = -20 -# Connection to the host engine is not valid any longer -DCGM_ST_CONNECTION_NOT_VALID = -21 -# This GPU is not supported by DCGM -DCGM_ST_GPU_NOT_SUPPORTED = -22 -# The GPUs of the provided group are not compatible with each other for the -# requested operation -DCGM_ST_GROUP_INCOMPATIBLE = -23 +DCGM_ST_NOT_WATCHED = -16 # The given field is not being updated by the cache manager +DCGM_ST_NO_PERMISSION = -17 # We are not permissioned to perform the desired action 
+DCGM_ST_GPU_IS_LOST = -18 # GPU is no longer reachable +DCGM_ST_RESET_REQUIRED = -19 # GPU requires a reset +DCGM_ST_FUNCTION_NOT_FOUND = -20 # Unable to find function +DCGM_ST_CONNECTION_NOT_VALID = -21 # Connection to the host engine is not valid any longer +DCGM_ST_GPU_NOT_SUPPORTED = -22 # This GPU is not supported by DCGM +DCGM_ST_GROUP_INCOMPATIBLE = -23 # The GPUs of the provided group are not compatible with each other for the requested operation DCGM_ST_MAX_LIMIT = -24 -# DCGM library could not be found -DCGM_ST_LIBRARY_NOT_FOUND = -25 -# Duplicate key passed to the function -DCGM_ST_DUPLICATE_KEY = -26 -# GPU is already a part of a sync boost group -DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 -# GPU is a not a part of sync boost group -DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 -# This operation cannot be performed when the host engine is running as -# non-root -DCGM_ST_REQUIRES_ROOT = -29 -# DCGM GPU Diagnostic was successfully executed, but reported an error. -DCGM_ST_NVVS_ERROR = -30 -# An input argument is not large enough -DCGM_ST_INSUFFICIENT_SIZE = -31 -# The given field ID is not supported by the API being called -DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 -# This request is serviced by a module of DCGM that is not currently loaded -DCGM_ST_MODULE_NOT_LOADED = -33 -# The requested operation could not be completed because the affected resource -# is in use -DCGM_ST_IN_USE = -34 -# The specified group is empty and this operation is not valid with an empty -# group -DCGM_ST_GROUP_IS_EMPTY = -35 -# Profiling is not supported for this group of GPUs or GPU -DCGM_ST_PROFILING_NOT_SUPPORTED = -36 -# The third-party Profiling module returned an unrecoverable error -DCGM_ST_PROFILING_LIBRARY_ERROR = -37 -# The requested profiling metrics cannot be collected in a single pass -DCGM_ST_PROFILING_MULTI_PASS = -38 -# A diag instance is already running, cannot run a new diag until the current -# one finishes. 
-DCGM_ST_DIAG_ALREADY_RUNNING = -39 -# The DCGM GPU Diagnostic returned JSON that cannot be parsed -DCGM_ST_DIAG_BAD_JSON = -40 -# Error while launching the DCGM GPU Diagnostic -DCGM_ST_DIAG_BAD_LAUNCH = -41 -# There is too much variance while training the diagnostic -DCGM_ST_DIAG_VARIANCE = -42 -# A field value met or exceeded the error threshold. -DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 -# The installed driver version is insufficient for this API -DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 -# The specified GPU instance does not exist -DCGM_ST_INSTANCE_NOT_FOUND = -45 -# The specified GPU compute instance does not exist -DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 -# Could not kill a child process within the retries -DCGM_ST_CHILD_NOT_KILLED = -47 -# Detected an error in a 3rd-party library -DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 -# Not enough resources available -DCGM_ST_INSUFFICIENT_RESOURCES = -49 - -# All the GPUs on the node are added to the group -DCGM_GROUP_DEFAULT = 0 -# Creates an empty group -DCGM_GROUP_EMPTY = 1 -# All NvSwitches of the node are added to the group -DCGM_GROUP_DEFAULT_NVSWITCHES = 2 -# All GPU instances of the node are added to the group -DCGM_GROUP_DEFAULT_INSTANCES = 3 -# All compute instances of the node are added to the group -DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4 -# All entities are added to this default group -DCGM_GROUP_DEFAULT_ENTITIES = 5 +DCGM_ST_LIBRARY_NOT_FOUND = -25 # DCGM library could not be found +DCGM_ST_DUPLICATE_KEY = -26 #Duplicate key passed to the function +DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 #GPU is already a part of a sync boost group +DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 #GPU is a not a part of sync boost group +DCGM_ST_REQUIRES_ROOT = -29 #This operation cannot be performed when the host engine is running as non-root +DCGM_ST_NVVS_ERROR = -30 #DCGM GPU Diagnostic was successfully executed, but reported an error. 
+DCGM_ST_INSUFFICIENT_SIZE = -31 #An input argument is not large enough +DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 #The given field ID is not supported by the API being called +DCGM_ST_MODULE_NOT_LOADED = -33 #This request is serviced by a module of DCGM that is not currently loaded +DCGM_ST_IN_USE = -34 #The requested operation could not be completed because the affected resource is in use +DCGM_ST_GROUP_IS_EMPTY = -35 # The specified group is empty and this operation is not valid with an empty group +DCGM_ST_PROFILING_NOT_SUPPORTED = -36 # Profiling is not supported for this group of GPUs or GPU +DCGM_ST_PROFILING_LIBRARY_ERROR = -37 # The third-party Profiling module returned an unrecoverable error +DCGM_ST_PROFILING_MULTI_PASS = -38 # The requested profiling metrics cannot be collected in a single pass +DCGM_ST_DIAG_ALREADY_RUNNING = -39 # A diag instance is already running, cannot run a new diag until the current one finishes. +DCGM_ST_DIAG_BAD_JSON = -40 # The DCGM GPU Diagnostic returned JSON that cannot be parsed +DCGM_ST_DIAG_BAD_LAUNCH = -41 # Error while launching the DCGM GPU Diagnostic +DCGM_ST_DIAG_UNUSED = -42 # Unused +DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 # A field value met or exceeded the error threshold. 
+DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 # The installed driver version is insufficient for this API +DCGM_ST_INSTANCE_NOT_FOUND = -45 # The specified GPU instance does not exist +DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 # The specified GPU compute instance does not exist +DCGM_ST_CHILD_NOT_KILLED = -47 # Couldn't kill a child process within the retries +DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 # Detected an error in a 3rd-party library +DCGM_ST_INSUFFICIENT_RESOURCES = -49 # Not enough resources available +DCGM_ST_PLUGIN_EXCEPTION = -50 # Exception thrown from a diagnostic plugin +DCGM_ST_NVVS_ISOLATE_ERROR = -51 # The diagnostic returned an error that indicates the need for isolation +DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 # The NVVS binary was not found in the specified location +DCGM_ST_NVVS_KILLED = -53 # The NVVS process was killed by a signal +DCGM_ST_PAUSED = -54 # The hostengine and all modules are paused + +DCGM_GROUP_DEFAULT = 0 # All the GPUs on the node are added to the group +DCGM_GROUP_EMPTY = 1 # Creates an empty group +DCGM_GROUP_DEFAULT_NVSWITCHES = 2 # All NvSwitches of the node are added to the group +DCGM_GROUP_DEFAULT_INSTANCES = 3 # All GPU instances of the node are added to the group +DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4 # All compute instances of the node are added to the group +DCGM_GROUP_DEFAULT_ENTITIES = 5 # All entities are added to this default group DCGM_GROUP_ALL_GPUS = 0x7FFFFFFF DCGM_GROUP_ALL_NVSWITCHES = 0x7FFFFFFE @@ -209,26 +134,17 @@ DCGM_GROUP_ALL_COMPUTE_INSTANCES = 0x7FFFFFFC DCGM_GROUP_ALL_ENTITIES = 0x7FFFFFFB -# Maximum number of entities per entity group -DCGM_GROUP_MAX_ENTITIES = 64 +DCGM_GROUP_MAX_ENTITIES = 64 #Maximum number of entities per entity group -# The target configuration values to be applied -DCGM_CONFIG_TARGET_STATE = 0 -# The current configuration state -DCGM_CONFIG_CURRENT_STATE = 1 +DCGM_CONFIG_TARGET_STATE = 0 # The target configuration values to be applied +DCGM_CONFIG_CURRENT_STATE = 1 # The 
current configuration state -# Represents the power cap to be applied for each member of the group -DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0 -# Represents the power budget for the entire group -DCGM_CONFIG_POWER_BUDGET_GROUP = 1 +DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0 # Represents the power cap to be applied for each member of the group +DCGM_CONFIG_POWER_BUDGET_GROUP = 1 # Represents the power budget for the entire group -# Default compute mode -- multiple contexts per device -DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0 -# Compute-prohibited mode -- no contexts per device -DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1 -# Compute-exclusive-process mode -- only one context per device, usable from -# multiple threads at a time -DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2 +DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0 # Default compute mode -- multiple contexts per device +DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1 # Compute-prohibited mode -- no contexts per device +DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2 #* Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time DCGM_TOPOLOGY_BOARD = 0x1 DCGM_TOPOLOGY_SINGLE = 0x2 @@ -249,19 +165,26 @@ DCGM_TOPOLOGY_NVLINK11 = 0x40000 DCGM_TOPOLOGY_NVLINK12 = 0x80000 -# Diagnostic per gpu tests - fixed indices for -# dcgmDiagResponsePerGpu_t.results[] +# Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] DCGM_MEMORY_INDEX = 0 DCGM_DIAGNOSTIC_INDEX = 1 DCGM_PCI_INDEX = 2 -DCGM_SM_PERF_INDEX = 3 -DCGM_TARGETED_PERF_INDEX = 4 +DCGM_SM_STRESS_INDEX = 3 +DCGM_TARGETED_STRESS_INDEX = 4 DCGM_TARGETED_POWER_INDEX = 5 DCGM_MEMORY_BANDWIDTH_INDEX = 6 -DCGM_PER_GPU_TEST_COUNT = 7 +DCGM_MEMTEST_INDEX = 7 +DCGM_PULSE_TEST_INDEX = 8 +DCGM_EUD_TEST_INDEX = 9 +DCGM_UNUSED2_TEST_INDEX = 10 +DCGM_UNUSED3_TEST_INDEX = 11 +DCGM_UNUSED4_TEST_INDEX = 12 +DCGM_UNUSED5_TEST_INDEX = 13 +DCGM_PER_GPU_TEST_COUNT_V7 = 9 +DCGM_PER_GPU_TEST_COUNT_V8 = 13 # DCGM Diag Level One test indices -DCGM_SWTEST_BLACKLIST = 0 
+DCGM_SWTEST_DENYLIST = 0 DCGM_SWTEST_NVML_LIBRARY = 1 DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2 DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3 @@ -284,70 +207,116 @@ class DCGM_INTROSPECT_STATE(object): # Lib loading dcgmLib = None libLoadLock = threading.Lock() -# Incremented on each dcgmInit and decremented on dcgmShutdown -_dcgmLib_refcount = 0 +_dcgmLib_refcount = 0 # Incremented on each dcgmInit and decremented on dcgmShutdown class DCGMError(Exception): - """ - Class to return error values for DCGM - """ - + """ Class to return error values for DCGM """ _valClassMapping = dict() # List of currently known error codes _error_code_to_string = { - DCGM_ST_OK: "Success", - DCGM_ST_BADPARAM: "Bad parameter passed to function", - DCGM_ST_GENERIC_ERROR: "Generic unspecified error", - DCGM_ST_MEMORY: "Out of memory error", - DCGM_ST_NOT_CONFIGURED: "Setting not configured", - DCGM_ST_NOT_SUPPORTED: "Feature not supported", - DCGM_ST_INIT_ERROR: "DCGM initialization error", - DCGM_ST_NVML_ERROR: "NVML error", - DCGM_ST_PENDING: "Object is in a pending state", - DCGM_ST_UNINITIALIZED: "Object is in an undefined state", - DCGM_ST_TIMEOUT: "Timeout", - DCGM_ST_VER_MISMATCH: "API version mismatch", - DCGM_ST_UNKNOWN_FIELD: "Unknown field", - DCGM_ST_NO_DATA: "No data is available", - DCGM_ST_STALE_DATA: "Data is considered stale", - DCGM_ST_NOT_WATCHED: "Field is not being updated", - DCGM_ST_NO_PERMISSION: "Not permissioned", - DCGM_ST_GPU_IS_LOST: "GPU is unreachable", - DCGM_ST_RESET_REQUIRED: "GPU requires a reset", - DCGM_ST_FUNCTION_NOT_FOUND: "Unable to find function", - DCGM_ST_CONNECTION_NOT_VALID: "The connection to the host engine is not valid any longer", - DCGM_ST_GPU_NOT_SUPPORTED: "This GPU is not supported by DCGM", - DCGM_ST_GROUP_INCOMPATIBLE: "GPUs are incompatible with each other for\ - the requested operation", - DCGM_ST_MAX_LIMIT: "Max limit reached for the object", - DCGM_ST_LIBRARY_NOT_FOUND: "DCGM library could not be found", - DCGM_ST_DUPLICATE_KEY: "Duplicate key 
passed to function", - DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: "GPU is already a part of a sync boost group", - DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: "GPU is not a part of the sync boost group", - DCGM_ST_REQUIRES_ROOT: "This operation is not supported when the host engine\ - is running as non root", - DCGM_ST_NVVS_ERROR: "DCGM GPU Diagnostic returned an error.", - DCGM_ST_INSUFFICIENT_SIZE: "An input argument is not large enough", - DCGM_ST_FIELD_UNSUPPORTED_BY_API: "The given field ID is not supported by the API being called", - DCGM_ST_MODULE_NOT_LOADED: "This request is serviced by a module of DCGM that\ - is not currently loaded", - DCGM_ST_IN_USE: "The requested operation could not be completed because\ - the affected resource is in use", - DCGM_ST_GROUP_IS_EMPTY: "The specified group is empty, and this operation\ - is incompatible with an empty group", - DCGM_ST_PROFILING_NOT_SUPPORTED: "Profiling is not supported for this group of GPUs or GPU", - DCGM_ST_PROFILING_LIBRARY_ERROR: "The third-party Profiling module returned an unrecoverable error", - DCGM_ST_PROFILING_MULTI_PASS: "The requested profiling metrics\ - cannot be collected in a single pass", - DCGM_ST_DIAG_ALREADY_RUNNING: "A diag instance is already running, cannot\ - run a new diag until the current one finishes", - DCGM_ST_DIAG_BAD_JSON: "The GPU Diagnostic returned Json that cannot be parsed.", - DCGM_ST_DIAG_BAD_LAUNCH: "Error while launching the GPU Diagnostic.", - DCGM_ST_DIAG_VARIANCE: "The results of training DCGM GPU Diagnostic cannot\ - be trusted because they vary too much from run to run", - DCGM_ST_DIAG_THRESHOLD_EXCEEDED: "A field value met or exceeded the error threshold.", - DCGM_ST_INSUFFICIENT_DRIVER_VERSION: "The installed driver version is insufficient for this API", + DCGM_ST_OK: + "Success", + DCGM_ST_BADPARAM: + "Bad parameter passed to function", + DCGM_ST_GENERIC_ERROR: + "Generic unspecified error", + DCGM_ST_MEMORY: + "Out of memory error", + DCGM_ST_NOT_CONFIGURED: + "Setting 
not configured", + DCGM_ST_NOT_SUPPORTED: + "Feature not supported", + DCGM_ST_INIT_ERROR: + "DCGM initialization error", + DCGM_ST_NVML_ERROR: + "NVML error", + DCGM_ST_PENDING: + "Object is in a pending state", + DCGM_ST_UNINITIALIZED: + "Object is in an undefined state", + DCGM_ST_TIMEOUT: + "Timeout", + DCGM_ST_VER_MISMATCH: + "API version mismatch", + DCGM_ST_UNKNOWN_FIELD: + "Unknown field", + DCGM_ST_NO_DATA: + "No data is available", + DCGM_ST_STALE_DATA: + "Data is considered stale", + DCGM_ST_NOT_WATCHED: + "Field is not being updated", + DCGM_ST_NO_PERMISSION: + "Not permissioned", + DCGM_ST_GPU_IS_LOST: + "GPU is unreachable", + DCGM_ST_RESET_REQUIRED: + "GPU requires a reset", + DCGM_ST_FUNCTION_NOT_FOUND: + "Unable to find function", + DCGM_ST_CONNECTION_NOT_VALID: + "The connection to the host engine is not valid any longer", + DCGM_ST_GPU_NOT_SUPPORTED: + "This GPU is not supported by DCGM", + DCGM_ST_GROUP_INCOMPATIBLE: + "GPUs are incompatible with each other for the requested operation", + DCGM_ST_MAX_LIMIT: + "Max limit reached for the object", + DCGM_ST_LIBRARY_NOT_FOUND: + "DCGM library could not be found", + DCGM_ST_DUPLICATE_KEY: + "Duplicate key passed to function", + DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: + "GPU is already a part of a sync boost group", + DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: + "GPU is not a part of the sync boost group", + DCGM_ST_REQUIRES_ROOT: + "This operation is not supported when the host engine is running as non root", + DCGM_ST_NVVS_ERROR: + "DCGM GPU Diagnostic returned an error.", + DCGM_ST_INSUFFICIENT_SIZE: + "An input argument is not large enough", + DCGM_ST_FIELD_UNSUPPORTED_BY_API: + "The given field ID is not supported by the API being called", + DCGM_ST_MODULE_NOT_LOADED: + "This request is serviced by a module of DCGM that is not currently loaded", + DCGM_ST_IN_USE: + "The requested operation could not be completed because the affected resource is in use", + DCGM_ST_GROUP_IS_EMPTY: + "The specified group is 
empty, and this operation is incompatible with an empty group", + DCGM_ST_PROFILING_NOT_SUPPORTED: + "Profiling is not supported for this group of GPUs or GPU", + DCGM_ST_PROFILING_LIBRARY_ERROR: + "The third-party Profiling module returned an unrecoverable error", + DCGM_ST_PROFILING_MULTI_PASS: + "The requested profiling metrics cannot be collected in a single pass", + DCGM_ST_DIAG_ALREADY_RUNNING: + "A diag instance is already running, cannot run a new diag until the current one finishes", + DCGM_ST_DIAG_BAD_JSON: + "The GPU Diagnostic returned Json that cannot be parsed.", + DCGM_ST_DIAG_BAD_LAUNCH: + "Error while launching the GPU Diagnostic.", + DCGM_ST_DIAG_UNUSED: + "Unused error code", + DCGM_ST_DIAG_THRESHOLD_EXCEEDED: + "A field value met or exceeded the error threshold.", + DCGM_ST_INSUFFICIENT_DRIVER_VERSION: + "The installed driver version is insufficient for this API", + DCGM_ST_INSTANCE_NOT_FOUND: + "The specified GPU instance does not exist", + DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND: + "The specified GPU compute instance does not exist", + DCGM_ST_CHILD_NOT_KILLED: + "Couldn't kill a child process within the retries", + DCGM_ST_3RD_PARTY_LIBRARY_ERROR: + "Detected an error in a 3rd-party library", + DCGM_ST_INSUFFICIENT_RESOURCES: + "Not enough resources available", + DCGM_ST_PLUGIN_EXCEPTION: + "Exception thrown from a diagnostic plugin", + DCGM_ST_NVVS_ISOLATE_ERROR: + "The diagnostic returned an error that indicates the need for isolation", } def __new__(typ, value): @@ -366,11 +335,9 @@ def __str__(self): try: if self.value not in DCGMError._error_code_to_string: DCGMError._error_code_to_string[self.value] = str( - _dcgmErrorString(self.value) - ) + _dcgmErrorString(self.value)) msg = DCGMError._error_code_to_string[self.value] - # Ensure we catch all exceptions, otherwise the error code will be - # hidden in a traceback + # Ensure we catch all exceptions, otherwise the error code will be hidden in a traceback except BaseException: msg = "DCGM 
Error with code %d" % self.value @@ -383,16 +350,16 @@ def __str__(self): def __eq__(self, other): return self.value == other.value + def __hash__(self): + return hash(self.value) + def SetAdditionalInfo(self, msg): """ - Sets msg as additional information returned by the string - representation of DCGMError and subclasses. Example output for - DCGMError_Uninitialized subclass, with msg set to 'more info msg - here' is "DCGMError_Uninitialized: Object is in an undefined state: - 'more info msg here'". - - Ensure that msg is a string or an object for which the __str__() - method does not throw an error + Sets msg as additional information returned by the string representation of DCGMError and subclasses. + Example output for DCGMError_Uninitialized subclass, with msg set to 'more info msg here' is + "DCGMError_Uninitialized: Object is in an undefined state: 'more info msg here'". + + Ensure that msg is a string or an object for which the __str__() method does not throw an error """ self.info = msg @@ -402,33 +369,34 @@ def dcgmExceptionClass(error_code): def _extractDCGMErrorsAsClasses(): - """ + ''' Generates a hierarchy of classes on top of DCGMLError class. - Each DCGM Error gets a new DCGMError subclass. This way try,except blocks - can filter appropriate exceptions more easily. + Each DCGM Error gets a new DCGMError subclass. This way try,except blocks can filter appropriate + exceptions more easily. DCGMError is a parent class. Each DCGM_ST_* gets it's own subclass. e.g. DCGM_ST_UNINITIALIZED will be turned into DCGMError_Uninitialized - """ - + ''' this_module = sys.modules[__name__] dcgmErrorsNames = filter(lambda x: x.startswith("DCGM_ST_"), dir(this_module)) for err_name in dcgmErrorsNames: # e.g. 
Turn DCGM_ST_UNINITIALIZED into DCGMError_Uninitialized class_name = "DCGMError_" + string.capwords( - err_name.replace("DCGM_ST_", ""), "_" - ).replace("_", "") + err_name.replace("DCGM_ST_", ""), "_").replace("_", "") err_val = getattr(this_module, err_name) def gen_new(val): + def new(typ): + # pylint: disable=E1121 obj = DCGMError.__new__(typ, val) return obj return new - new_error_class = type(class_name, (DCGMError,), {"__new__": gen_new(err_val)}) + new_error_class = type(class_name, (DCGMError,), + {'__new__': gen_new(err_val)}) new_error_class.__module__ = __name__ setattr(this_module, class_name, new_error_class) DCGMError._valClassMapping[err_val] = new_error_class @@ -445,7 +413,118 @@ class struct_c_dcgmUnit_t(Structure): _dcgmUnit_t = POINTER(struct_c_dcgmUnit_t) -class _PrintableStructure(Structure): +class _WrappedStructure(): + + def __init__(self, obj): + self.__dict__["_obj"] = obj + + def __getattr__(self, key): + value = getattr(self._obj, key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __getitem__(self, key): + value = self._obj[key] + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + if (key == '_obj'): + raise RuntimeError("Cannot set _obj") + + value = raw_value + fieldtype = find_field_type(self._obj._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + self._obj[key] = value + return value + + +class _DcgmStructure(Structure): + + def __getattribute__(self, key): + 
value = super().__getattribute__(key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + value = raw_value + fieldtype = find_field_type(self._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + return super().__setattr__(key, value) + + +class DcgmUnion(Union): + + def __getattribute__(self, key): + value = super().__getattribute__(key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + value = raw_value + fieldtype = find_field_type(self._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + return super().__setattr__(key, value) + + +class _PrintableStructure(_DcgmStructure): """ Abstract class that produces nicer __str__ output than ctypes.Structure. e.g. instead of: @@ -458,15 +537,12 @@ class _PrintableStructure(Structure): e.g. class that has _field_ 'hex_value', c_uint could be formatted with _fmt_ = {"hex_value" : "%08X"} to produce nicer output. 
- Default formatting string for all fields can be set with key "" - like: + Default fomratting string for all fields can be set with key "" like: _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz. If not set it's assumed to be just "%s" - Exact format of returned str from this class is subject to change in the - future. + Exact format of returned str from this class is subject to change in the future. """ - _fmt_ = {} def __str__(self): @@ -480,7 +556,7 @@ def __str__(self): elif "" in self._fmt_: fmt = self._fmt_[""] result.append(("%s: " + fmt) % (key, value)) - return self.__class__.__name__ + "(" + string.join(result, ", ") + ")" + return self.__class__.__name__ + "(" + ", ".join(result) + ")" def FieldsSizeof(self): size = 0 @@ -489,10 +565,8 @@ def FieldsSizeof(self): return size +#JSON serializer for DCGM structures class DcgmJSONEncoder(json.JSONEncoder): - """ - JSON serializer for DCGM structures - """ def default(self, o): # pylint: disable=method-hidden if isinstance(o, _PrintableStructure): @@ -519,21 +593,18 @@ def default(self, o): # pylint: disable=method-hidden retVal.append(subVal) return retVal - # Let the parent class handle this/fail + #Let the parent class handle this/fail return json.JSONEncoder.default(self, o) +# Creates a unique version number for each struct def make_dcgm_version(struct, ver): - """ - Creates a unique version number for each struct - """ - return sizeof(struct) | (ver << 24) -# Function access -# function pointers are cached to prevent unnecessary libLoadLock locking -_dcgmGetFunctionPointer_cache = dict() +# Function access ## +_dcgmGetFunctionPointer_cache = dict( +) # function pointers are cached to prevent unnecessary libLoadLock locking def _dcgmGetFunctionPointer(name): @@ -557,18 +628,14 @@ def _dcgmGetFunctionPointer(name): libLoadLock.release() -# -# C function wrappers -# +# C function wrappers ## def _LoadDcgmLibrary(libDcgmPath=None): """ Load the library if it isn't loaded already - :param 
libDcgmPath: Optional path to the libdcgm*.so libraries. Will use - system defaults if not specified. + :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use system defaults if not specified. :type libDcgmPath: str :return: None """ - global dcgmLib if dcgmLib is None: @@ -580,30 +647,24 @@ def _LoadDcgmLibrary(libDcgmPath=None): if dcgmLib is None: try: if sys.platform[:3] == "win": - # cdecl calling convention load nvml.dll from - # %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll + # cdecl calling convention + # load nvml.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll dcgmLib = CDLL( os.path.join( os.getenv("ProgramFiles", "C:/Program Files"), - "NVIDIA Corporation/NVSMI/dcgm.dll", - ) - ) + "NVIDIA Corporation/NVSMI/dcgm.dll")) else: - if not libDcgmPath: - ( - dist_name, - dist_version, - dist_id, - ) = distro.linux_distribution(full_distribution_name=0) - dist_name = dist_name.lower() - if dist_name in {"ubuntu", "debian"}: - libDcgmPath = "/usr/lib/{}-linux-gnu".format( - platform.machine() - ) - elif dist_name in {"fedora", "redhat", "centos", "suse"}: - libDcgmPath = "/usr/lib64" - - dcgmLib = CDLL(os.path.join(libDcgmPath, "libdcgm.so.2")) + if libDcgmPath: + lib_file = os.path.join(libDcgmPath, "libdcgm.so.3") + else: + # Try Debian-based distros + lib_file = '/usr/lib/{}-linux-gnu/libdcgm.so.3'.format( + platform.machine()) + if not os.path.isfile(lib_file): + # Presume Redhat-based distros + lib_file = '/usr/lib64/libdcgm.so.3' + + dcgmLib = CDLL(lib_file) except OSError as ose: _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND) @@ -652,20 +713,23 @@ def _dcgmErrorString(result): return str +# Represents a link object. type should be one of DCGM_FE_GPU or +# DCGM_FE_SWITCH. 
gpuId or switchID the associated gpu or switch; +# +class c_dcgm_link_t(_PrintableStructure): + _fields = [('type', c_uint8), ('index', c_uint8), ('id', c_uint16)] + + class c_dcgmConnectV2Params_v1(_PrintableStructure): - _fields_ = [("version", c_uint), ("persistAfterDisconnect", c_uint)] + _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint)] c_dcgmConnectV2Params_version1 = make_dcgm_version(c_dcgmConnectV2Params_v1, 1) class c_dcgmConnectV2Params_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("persistAfterDisconnect", c_uint), - ("timeoutMs", c_uint), - ("addressIsUnixSocket", c_uint), - ] + _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint), + ('timeoutMs', c_uint), ('addressIsUnixSocket', c_uint)] c_dcgmConnectV2Params_version2 = make_dcgm_version(c_dcgmConnectV2Params_v2, 2) @@ -683,22 +747,22 @@ class c_dcgmHostengineHealth_v1(_PrintableStructure): dcgmHostengineHealth_version = dcgmHostengineHealth_version1 -# Represents memory and proc clocks for a device +#Represents memory and proc clocks for a device class c_dcgmClockSet_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("memClock", c_uint), # Memory Clock - ("smClock", c_uint), # SM Clock + ('version', c_uint), + ('memClock', c_uint), #/* Memory Clock */ + ('smClock', c_uint) #/* SM Clock */ ] -# Represents a entityGroupId + entityId pair to uniquely identify a given -# entityId inside a group of entities +# Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside +# a group of entities # Added in DCGM 1.5.0 class c_dcgmGroupEntityPair_t(_PrintableStructure): _fields_ = [ - ("entityGroupId", c_uint32), # Entity Group ID entity belongs to - ("entityId", c_uint32), # Entity ID of the entity + ('entityGroupId', c_uint32), #Entity Group ID entity belongs to + ('entityId', c_uint32) #Entity ID of the entity ] @@ -707,12 +771,10 @@ class c_dcgmGroupEntityPair_t(_PrintableStructure): # * Added in DCGM 1.5.0 # */ 
class c_dcgmGroupInfo_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("groupName", c_char * DCGM_MAX_STR_LENGTH), - ("entityList", c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES), - ] + _fields_ = [('version', c_uint), ('count', c_uint), + ('groupName', c_char * DCGM_MAX_STR_LENGTH), + ('entityList', + c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES)] c_dcgmGroupInfo_version2 = make_dcgm_version(c_dcgmGroupInfo_v2, 2) @@ -723,19 +785,25 @@ class c_dcgmGroupInfo_v2(_PrintableStructure): DcgmMigProfileGpuInstanceSlice3 = 3 # GPU instance slice 3 DcgmMigProfileGpuInstanceSlice4 = 4 # GPU instance slice 4 DcgmMigProfileGpuInstanceSlice7 = 5 # GPU instance slice 7 +DcgmMigProfileGpuInstanceSlice8 = 6 # GPU instance slice 8 +DcgmMigProfileGpuInstanceSlice6 = 7 # GPU instance slice 6 +DcgmMigProfileGpuInstanceSlice1Rev1 = 8 # GPU instance slice 1 revision 1 +DcgmMigProfileGpuInstanceSlice2Rev1 = 9 # GPU instance slice 2 revision 1 +DcgmMigProfileGpuInstanceSlice1Rev2 = 10 # GPU instance slice 1 revision 2 DcgmMigProfileComputeInstanceSlice1 = 30 # compute instance slice 1 DcgmMigProfileComputeInstanceSlice2 = 31 # compute instance slice 2 DcgmMigProfileComputeInstanceSlice3 = 32 # compute instance slice 3 DcgmMigProfileComputeInstanceSlice4 = 33 # compute instance slice 4 DcgmMigProfileComputeInstanceSlice7 = 34 # compute instance slice 7 +DcgmMigProfileComputeInstanceSlice8 = 35 # compute instance slice 8 +DcgmMigProfileComputeInstanceSlice6 = 36 # compute instance slice 6 +DcgmMigProfileComputeInstanceSlice1Rev1 = 37 # compute instance slice 1 revision 1 +# /** +# * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy. +# */ class c_dcgmMigHierarchyInfo_t(_PrintableStructure): - """ - Represents a pair of entity pairings to uniquely identify an entity and - its place in the hierarchy. 
- """ - _fields_ = [ ("entity", c_dcgmGroupEntityPair_t), ("parent", c_dcgmGroupEntityPair_t), @@ -743,36 +811,50 @@ class c_dcgmMigHierarchyInfo_t(_PrintableStructure): ] -DCGM_MAX_INSTANCES_PER_GPU = 7 -# There can never be more compute instances per GPU than instances per GPU -# because a compute instance is part of an instance +class c_dcgmMigEntityInfo_t(_PrintableStructure): + _fields_ = [ + ('gpuUuid', c_char * 128), # GPU UUID + ('nvmlGpuIndex', c_uint), # GPU index from NVML + ('nvmlInstanceId', c_uint), # GPU instance index within GPU + ('nvmlComputeInstanceId', + c_uint), # GPU Compute instance index within GPU instance + ('nvmlMigProfileId', + c_uint), # Unique profile ID for GPU or Compute instances + ('nvmlProfileSlices', c_uint), # Number of slices in the MIG profile + ] + + +class c_dcgmMigHierarchyInfo_v2(_PrintableStructure): + _fields_ = [ + ('entity', c_dcgmGroupEntityPair_t), + ('parent', c_dcgmGroupEntityPair_t), + ('info', c_dcgmMigEntityInfo_t), + ] + + +DCGM_MAX_INSTANCES_PER_GPU = 8 +# There can never be more compute instances per GPU than instances per GPU because a compute instance +# is part of an instance DCGM_MAX_COMPUTE_INSTANCES_PER_GPU = DCGM_MAX_INSTANCES_PER_GPU -# Currently, there cannot be more than 14 instances + compute instances. There -# are always 7 compute instances and never more than 7 instances +# Currently, there cannot be more than 14 instances + compute instances. 
There are always 7 compute instances +# and never more than 7 instances DCGM_MAX_TOTAL_INSTANCES = 14 DCGM_MAX_HIERARCHY_INFO = DCGM_MAX_NUM_DEVICES * DCGM_MAX_TOTAL_INSTANCES DCGM_MAX_INSTANCES = DCGM_MAX_NUM_DEVICES * DCGM_MAX_INSTANCES_PER_GPU -# The maximum compute instances are always the same as the maximum instances -# because each compute instances is part of an instance +# The maximum compute instances are always the same as the maximum instances because each compute instances +# is part of an instance DCGM_MAX_COMPUTE_INSTANCES = DCGM_MAX_INSTANCES -# Ask the hostengine to wait to process reconfiguring the GPUs -DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1 +DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1 # Ask the hostengine to wait to process reconfiguring the GPUs -class c_dcgmMigHierarchy_v1(_PrintableStructure): - """ - Structure to store the GPU hierarchy for a system - """ - - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("entityList", c_dcgmMigHierarchyInfo_t * DCGM_MAX_HIERARCHY_INFO), - ] +class c_dcgmMigHierarchy_v2(_PrintableStructure): + _fields_ = [('version', c_uint), ('count', c_uint), + ('entityList', + c_dcgmMigHierarchyInfo_v2 * DCGM_MAX_HIERARCHY_INFO)] -c_dcgmMigHierarchy_version1 = make_dcgm_version(c_dcgmMigHierarchy_v1, 1) +c_dcgmMigHierarchy_version2 = make_dcgm_version(c_dcgmMigHierarchy_v2, 2) class c_dcgmDeleteMigEntity_v1(_PrintableStructure): @@ -786,13 +868,11 @@ class c_dcgmDeleteMigEntity_v1(_PrintableStructure): c_dcgmDeleteMigEntity_version1 = make_dcgm_version(c_dcgmDeleteMigEntity_v1, 1) -# -# Enum values for the kinds of MIG creations -# -# Create a GPU instance -DcgmMigCreateGpuInstance = 0 -# Create a compute instance -DcgmMigCreateComputeInstance = 1 +# /** +# * Enum values for the kinds of MIG creations +# */ +DcgmMigCreateGpuInstance = 0 # Create a GPU instance +DcgmMigCreateComputeInstance = 1 # Create a compute instance class c_dcgmCreateMigEntity_v1(_PrintableStructure): @@ -808,331 +888,391 @@ class 
c_dcgmCreateMigEntity_v1(_PrintableStructure): c_dcgmCreateMigEntity_version1 = make_dcgm_version(c_dcgmCreateMigEntity_v1, 1) +# /** +# * Structure to represent error attributes +# */ class c_dcgmErrorInfo_v1(_PrintableStructure): - """ - Structure to represent error attributes - """ - - _fields_ = [("gpuId", c_uint), ("fieldId", c_ushort), ("status", c_int)] + _fields_ = [('gpuId', c_uint), ('fieldId', c_ushort), ('status', c_int)] +# /** +# * Represents list of supported clocks for a device +# */ class c_dcgmDeviceSupportedClockSets_v1(_PrintableStructure): - """ - Represents list of supported clocks for a device - """ - - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("clockSet", c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS), - ] + _fields_ = [('version', c_uint), ('count', c_uint), + ('clockSet', c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS)] +# /** +# * Represents accounting information for a device and pid +# */ class c_dcgmDevicePidAccountingStats_v1(_PrintableStructure): - """ - epresents accounting information for a device and pid - """ - - _fields_ = [ - ("version", c_uint32), - ("pid", c_uint32), - ("gpuUtilization", c_uint32), - ("memoryUtilization", c_uint32), - ("maxMemoryUsage", c_uint64), - ("startTimestamp", c_uint64), - ("activeTimeUsec", c_uint64), - ] + _fields_ = [('version', c_uint32), ('pid', c_uint32), + ('gpuUtilization', c_uint32), ('memoryUtilization', c_uint32), + ('maxMemoryUsage', c_uint64), ('startTimestamp', c_uint64), + ('activeTimeUsec', c_uint64)] +# /** +# * Represents thermal information +# */ class c_dcgmDeviceThermals_v1(_PrintableStructure): - """ - Represents thermal information - """ - - _fields_ = [("version", c_uint), ("slowdownTemp", c_uint), ("shutdownTemp", c_uint)] + _fields_ = [('version', c_uint), ('slowdownTemp', c_uint), + ('shutdownTemp', c_uint)] +# /** +# * Represents various power limits +# */ class c_dcgmDevicePowerLimits_v1(_PrintableStructure): - """ - Represents various power limits - """ - - _fields_ = [ - 
("version", c_uint), - ("curPowerLimit", c_uint), - ("defaultPowerLimit", c_uint), - ("enforcedPowerLimit", c_uint), - ("minPowerLimit", c_uint), - ("maxPowerLimit", c_uint), - ] + _fields_ = [('version', c_uint), ('curPowerLimit', c_uint), + ('defaultPowerLimit', c_uint), ('enforcedPowerLimit', c_uint), + ('minPowerLimit', c_uint), ('maxPowerLimit', c_uint)] +# /** +# * Represents device identifiers +# */ class c_dcgmDeviceIdentifiers_v1(_PrintableStructure): - """ - Represents device identifiers - """ - - _fields_ = [ - ("version", c_uint), - ("brandName", c_char * DCGM_MAX_STR_LENGTH), - ("deviceName", c_char * DCGM_MAX_STR_LENGTH), - ("pciBusId", c_char * DCGM_MAX_STR_LENGTH), - ("serial", c_char * DCGM_MAX_STR_LENGTH), - ("uuid", c_char * DCGM_MAX_STR_LENGTH), - ("vbios", c_char * DCGM_MAX_STR_LENGTH), - ("inforomImageVersion", c_char * DCGM_MAX_STR_LENGTH), - ("pciDeviceId", c_uint32), - ("pciSubSystemId", c_uint32), - ("driverVersion", c_char * DCGM_MAX_STR_LENGTH), - ("virtualizationMode", c_uint32), - ] + _fields_ = [('version', c_uint), + ('brandName', c_char * DCGM_MAX_STR_LENGTH), + ('deviceName', c_char * DCGM_MAX_STR_LENGTH), + ('pciBusId', c_char * DCGM_MAX_STR_LENGTH), + ('serial', c_char * DCGM_MAX_STR_LENGTH), + ('uuid', c_char * DCGM_MAX_STR_LENGTH), + ('vbios', c_char * DCGM_MAX_STR_LENGTH), + ('inforomImageVersion', c_char * DCGM_MAX_STR_LENGTH), + ('pciDeviceId', c_uint32), ('pciSubSystemId', c_uint32), + ('driverVersion', c_char * DCGM_MAX_STR_LENGTH), + ('virtualizationMode', c_uint32)] +# /** +# * Represents memory utilization +# */ class c_dcgmDeviceMemoryUsage_v1(_PrintableStructure): - """ - Represents memory utilization - """ - - _fields_ = [ - ("version", c_uint), - ("bar1Total", c_uint), - ("fbTotal", c_uint), - ("fbUsed", c_uint), - ("fbFree", c_uint), - ] + _fields_ = [('version', c_uint), ('bar1Total', c_uint), ('fbTotal', c_uint), + ('fbUsed', c_uint), ('fbFree', c_uint)] +# /** +# * Represents utilization values of vGPUs running 
on the device +# */ class c_dcgmDeviceVgpuUtilInfo_v1(_PrintableStructure): - """ - Represents utilization values of vGPUs running on the device - """ - - _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("smUtil", c_uint), - ("memUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), - ] + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('smUtil', c_uint), + ('memUtil', c_uint), ('encUtil', c_uint), ('decUtil', c_uint)] # /** # * Utilization values for processes running within vGPU VMs using the device # */ class c_dcgmDeviceVgpuProcessUtilInfo_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("pid", c_uint), - ("processName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("smUtil", c_uint), - ("memUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), - ] + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('pid', c_uint), + ('processName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('smUtil', c_uint), ('memUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint)] # /** # * Represents current encoder statistics for the given device/vGPU instance # */ class c_dcgmDeviceEncStats_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), - ] + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] +# /** +# * Represents information about active encoder sessions on the given vGPU instance +# */ class c_dcgmDeviceVgpuEncSessions_v1(_PrintableStructure): - """ - Represents information about active encoder sessions on the given vGPU - instance - """ + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('sessionId', c_uint), + ('pid', c_uint), ('codecType', c_uint), ('hResolution', c_uint), + ('vResolution', c_uint), ('averageFps', c_uint), + ('averageLatency', c_uint)] + + +# /** +# * Represents current frame buffer capture sessions statistics for the given 
device/vGPU instance +# */ +class c_dcgmDeviceFbcStats_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] + +# /** +# * Represents information about active FBC session on the given device/vGPU instance +# */ +class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionId', c_uint), ('pid', c_uint), + ('vgpuId', c_uint), ('displayOrdinal', c_uint), + ('sessionType', c_uint), ('sessionFlags', c_uint), + ('hMaxResolution', c_uint), ('vMaxResolution', c_uint), + ('hResolution', c_uint), ('vResolution', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] + + +# /** +# * Represents all the active FBC sessions on the given device/vGPU instance +# */ +class c_dcgmDeviceFbcSessions_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('sessionInfo', + c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS)] + + +# /** +# * Represents static info related to vGPU types supported on the device +# */ +class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint), + ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE), + ('deviceId', c_uint), ('subsystemId', c_uint), + ('numDisplayHeads', c_uint), ('maxInstances', c_uint), + ('frameRateLimit', c_uint), ('maxResolutionX', c_uint), + ('maxResolutionY', c_uint), ('fbTotal', c_uint)] + + +class c_dcgmDeviceVgpuTypeInfo_v2(_PrintableStructure): + _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint), + ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE), + ('deviceId', c_uint), ('subsystemId', c_uint), + ('numDisplayHeads', c_uint), ('maxInstances', c_uint), + 
('frameRateLimit', c_uint), ('maxResolutionX', c_uint), + ('maxResolutionY', c_uint), ('fbTotal', c_uint), + ('gpuInstanceProfileId', c_uint)] + + +dcgmDeviceVgpuTypeInfo_version2 = make_dcgm_version(c_dcgmDeviceVgpuTypeInfo_v2, + 2) + + +class c_dcgmDeviceSettings_v2(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("sessionId", c_uint), - ("pid", c_uint), - ("codecType", c_uint), - ("hResolution", c_uint), - ("vResolution", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('persistenceModeEnabled', c_uint), + ('migModeEnabled', c_uint), + ('confidentialComputeMode', c_uint), ] -class c_dcgmDeviceFbcStats_v1(_PrintableStructure): - """ - Represents current frame buffer capture sessions statistics for the given - device/vGPU instance - """ +# /** +# * Represents attributes corresponding to a device +# */ +class c_dcgmDeviceAttributes_deprecated_v1(_PrintableStructure): + _fields_ = [('version', c_uint), + ('clockSets', c_dcgmDeviceSupportedClockSets_v1), + ('thermalSettings', c_dcgmDeviceThermals_v1), + ('powerLimits', c_dcgmDevicePowerLimits_v1), + ('identifiers', c_dcgmDeviceIdentifiers_v1), + ('memoryUsage', c_dcgmDeviceMemoryUsage_v1), + ('unused', c_char * 208)] + + +dcgmDeviceAttributes_deprecated_version1 = make_dcgm_version( + c_dcgmDeviceAttributes_deprecated_v1, 1) + +# /** +# * Represents attributes corresponding to a device +# */ +class c_dcgmDeviceAttributes_v3(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('clockSets', c_dcgmDeviceSupportedClockSets_v1), + ('thermalSettings', c_dcgmDeviceThermals_v1), + ('powerLimits', c_dcgmDevicePowerLimits_v1), + ('identifiers', c_dcgmDeviceIdentifiers_v1), + ('memoryUsage', c_dcgmDeviceMemoryUsage_v1), + ('settings', c_dcgmDeviceSettings_v2), ] -class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure): - """ - Represents 
information about active FBC session on the given device/vGPU - instance - """ +dcgmDeviceAttributes_version3 = make_dcgm_version(c_dcgmDeviceAttributes_v3, 3) + +# /** +# * Represents attributes info for a MIG device +# */ +class c_dcgmDeviceMigAttributesInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionId", c_uint), - ("pid", c_uint), - ("vgpuId", c_uint), - ("displayOrdinal", c_uint), - ("sessionType", c_uint), - ("sessionFlags", c_uint), - ("hMaxResolution", c_uint), - ("vMaxResolution", c_uint), - ("hResolution", c_uint), - ("vResolution", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), + ('multiprocessorCount', c_uint), + ('sharedCopyEngineCount', c_uint), + ('sharedDecoderCount', c_uint), + ('sharedEncoderCount', c_uint), + ('sharedJpegCount', c_uint), + ('sharedOfaCount', c_uint), + ('gpuInstanceSliceCount', c_uint), + ('computeInstanceSliceCount', c_uint), + ('memorySizeMB', c_uint64), ] -class c_dcgmDeviceFbcSessions_v1(_PrintableStructure): - """ - Represents all the active FBC sessions on the given device/vGPU instance - """ +dcgmDeviceMigAttributesInfo_version1 = make_dcgm_version( + c_dcgmDeviceMigAttributesInfo_v1, 1) + +# /** +# * Represents attributes for a MIG device +# */ +class c_dcgmDeviceMigAttributes_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("sessionInfo", c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS), + ('version', c_uint), + ('migDevicesCount', c_uint), + ('migAttributesInfo', c_dcgmDeviceMigAttributesInfo_v1), ] -class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure): - """ - Represents static info related to vGPU types supported on the device - """ +dcgmDeviceMigAttributes_version1 = make_dcgm_version( + c_dcgmDeviceMigAttributes_v1, 1) + +# /** +# * Represents GPU instance profile information +# */ +class 
c_dcgmGpuInstanceProfileInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vgpuTypeId", c_uint), - ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE), - ("deviceId", c_uint), - ("subsystemId", c_uint), - ("numDisplayHeads", c_uint), - ("maxInstances", c_uint), - ("frameRateLimit", c_uint), - ("maxResolutionX", c_uint), - ("maxResolutionY", c_uint), - ("fbTotal", c_uint), + ('version', c_uint), + ('id', c_uint), + ('isP2pSupported', c_uint), + ('sliceCount', c_uint), + ('instanceCount', c_uint), + ('multiprocessorCount', c_uint), + ('copyEngineCount', c_uint), + ('decoderCount', c_uint), + ('encoderCount', c_uint), + ('jpegCount', c_uint), + ('ofaCount', c_uint), + ('memorySizeMB', c_uint64), ] -class c_dcgmDeviceSettings_v1(_PrintableStructure): +dcgmGpuInstanceProfileInfo_version1 = make_dcgm_version( + c_dcgmGpuInstanceProfileInfo_v1, 1) + + +# /** +# * Represents GPU instance profiles +# */ +class c_dcgmGpuInstanceProfiles_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("persistenceModeEnabled", c_uint), - ("migModeEnabled", c_uint), + ('version', c_uint), + ('profileCount', c_uint), + ('profileInfo', c_dcgmGpuInstanceProfileInfo_v1), ] -class c_dcgmDeviceAttributes_v1(_PrintableStructure): - """ - Represents attributes corresponding to a device - """ +dcgmGpuInstanceProfiles_version1 = make_dcgm_version( + c_dcgmGpuInstanceProfiles_v1, 1) + +# /** +# * Represents Compute instance profile information +# */ +class c_dcgmComputeInstanceProfileInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("clockSets", c_dcgmDeviceSupportedClockSets_v1), - ("thermalSettings", c_dcgmDeviceThermals_v1), - ("powerLimits", c_dcgmDevicePowerLimits_v1), - ("identifiers", c_dcgmDeviceIdentifiers_v1), - ("memoryUsage", c_dcgmDeviceMemoryUsage_v1), - ("unused", c_char * 208), + ('version', c_uint), + 
('gpuInstanceId', c_uint), + ('id', c_uint), + ('sliceCount', c_uint), + ('instanceCount', c_uint), + ('multiprocessorCount', c_uint), + ('sharedCopyEngineCount', c_uint), + ('sharedDecoderCount', c_uint), + ('sharedEncoderCount', c_uint), + ('sharedJpegCount', c_uint), + ('sharedOfaCount', c_uint), ] -dcgmDeviceAttributes_version1 = make_dcgm_version(c_dcgmDeviceAttributes_v1, 1) - +dcgmComputeInstanceProfileInfo_version1 = make_dcgm_version( + c_dcgmComputeInstanceProfileInfo_v1, 1) -class c_dcgmDeviceAttributes_v2(_PrintableStructure): - """ - Represents attributes corresponding to a device - """ +# /** +# * Represents Compute instance profiles +# */ +class c_dcgmComputeInstanceProfiles_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("clockSets", c_dcgmDeviceSupportedClockSets_v1), - ("thermalSettings", c_dcgmDeviceThermals_v1), - ("powerLimits", c_dcgmDevicePowerLimits_v1), - ("identifiers", c_dcgmDeviceIdentifiers_v1), - ("memoryUsage", c_dcgmDeviceMemoryUsage_v1), - ("settings", c_dcgmDeviceSettings_v1), + ('version', c_uint), + ('profileCount', c_uint), + ('profileInfo', c_dcgmComputeInstanceProfileInfo_v1), ] -dcgmDeviceAttributes_version2 = make_dcgm_version(c_dcgmDeviceAttributes_v2, 2) +dcgmComputeInstanceProfiles_version1 = make_dcgm_version( + c_dcgmComputeInstanceProfiles_v1, 1) +# /** +# * Represents vGPU attributes corresponding to a device +# */ class c_dcgmVgpuDeviceAttributes_v6(_PrintableStructure): - """ - Represents vGPU attributes corresponding to a device - """ - _fields_ = [ - ("version", c_uint), - ("activeVgpuInstanceCount", c_uint), - ("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), - ("creatableVgpuTypeCount", c_uint), - ("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), - ("supportedVgpuTypeCount", c_uint), - ( - "supportedVgpuTypeInfo", - c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU, - ), - ("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), - 
("gpuUtil", c_uint), - ("memCopyUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), + ('version', c_uint), ('activeVgpuInstanceCount', c_uint), + ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), + ('creatableVgpuTypeCount', c_uint), + ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('supportedVgpuTypeCount', c_uint), + ('supportedVgpuTypeInfo', + c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('vgpuUtilInfo', + c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint) ] -dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v6, 1) +dcgmVgpuDeviceAttributes_version6 = make_dcgm_version( + c_dcgmVgpuDeviceAttributes_v6, 1) -class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure): - """ - Represents attributes specific to vGPU instance - """ - +class c_dcgmVgpuDeviceAttributes_v7(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vmId", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vmName", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vgpuTypeId", c_uint), - ("vgpuUuid", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vgpuDriverVersion", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("fbUsage", c_uint), - ("licenseStatus", c_uint), - ("frameRateLimit", c_uint), + ('version', c_uint), ('activeVgpuInstanceCount', c_uint), + ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), + ('creatableVgpuTypeCount', c_uint), + ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('supportedVgpuTypeCount', c_uint), + ('supportedVgpuTypeInfo', + c_dcgmDeviceVgpuTypeInfo_v2 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('vgpuUtilInfo', + c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint) ] +dcgmVgpuDeviceAttributes_version7 = make_dcgm_version( + c_dcgmVgpuDeviceAttributes_v7, 7) 
+ + +# /** +# * Represents attributes specific to vGPU instance +# */ +class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure): + _fields_ = [('version', c_uint), + ('vmId', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vmName', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vgpuTypeId', c_uint), + ('vgpuUuid', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vgpuDriverVersion', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('fbUsage', c_uint), ('licenseStatus', c_uint), + ('frameRateLimit', c_uint)] + + dcgmVgpuInstanceAttributes_version1 = make_dcgm_version( - c_dcgmVgpuInstanceAttributes_v1, 1 -) + c_dcgmVgpuInstanceAttributes_v1, 1) class c_dcgmConfigPowerLimit(_PrintableStructure): - _fields_ = [("type", c_uint), ("val", c_uint)] + _fields_ = [('type', c_uint), ('val', c_uint)] class c_dcgmConfigPerfStateSettings_t(_PrintableStructure): @@ -1146,12 +1286,12 @@ class c_dcgmConfigPerfStateSettings_t(_PrintableStructure): class c_dcgmDeviceConfig_v1(_PrintableStructure): _fields_ = [ # version must always be first - ("version", c_uint), - ("gpuId", c_uint), - ("mEccMode", c_uint), - ("mComputeMode", c_uint), - ("mPerfState", c_dcgmConfigPerfStateSettings_t), - ("mPowerLimit", c_dcgmConfigPowerLimit), + ('version', c_uint), + ('gpuId', c_uint), + ('mEccMode', c_uint), + ('mComputeMode', c_uint), + ('mPerfState', c_dcgmConfigPerfStateSettings_t), + ('mPowerLimit', c_dcgmConfigPowerLimit) ] @@ -1162,16 +1302,16 @@ class c_dcgmDeviceConfig_v1(_PrintableStructure): class c_dcgmDeviceVgpuConfig_v1(_PrintableStructure): _fields_ = [ # version must always be first - ("version", c_uint), - ("gpuId", c_uint), - ("mEccMode", c_uint), - ("mComputeMode", c_uint), - ("mPerfState", c_dcgmConfigPerfStateSettings_t), - ("mPowerLimit", c_dcgmConfigPowerLimit), + ('version', c_uint), + ('gpuId', c_uint), + ('mEccMode', c_uint), + ('mComputeMode', c_uint), + ('mPerfState', c_dcgmConfigPerfStateSettings_t), + ('mPowerLimit', c_dcgmConfigPowerLimit) ] def SetBlank(self): - # Does not set version 
or gpuId + #Does not set version or gpuId self.mEccMode = dcgmvalue.DCGM_INT32_BLANK self.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK self.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK @@ -1224,7 +1364,8 @@ class c_dcgmUnwatchFieldValue_v1(_PrintableStructure): _fields_ = [] -dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, 1) +dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, + 1) class c_dcgmUpdateAllFields_v1(_PrintableStructure): @@ -1233,9 +1374,19 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): dcgmUpdateAllFields_version1 = make_dcgm_version(c_dcgmUpdateAllFields_v1, 1) -dcgmGetMultipleValuesForField_version1 = 1 +dcgmGetMultipleValuesForFieldResponse_version1 = 1 + +# policy enums (and table indices) +DCGM_POLICY_COND_IDX_DBE = 0 +DCGM_POLICY_COND_IDX_PCI = 1 +DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED = 2 +DCGM_POLICY_COND_IDX_THERMAL = 3 +DCGM_POLICY_COND_IDX_POWER = 4 +DCGM_POLICY_COND_IDX_NVLINK = 5 +DCGM_POLICY_COND_IDX_XID = 6 +DCGM_POLICY_COND_IDX_MAX = 7 -# policy enums +# policy enum bitmasks DCGM_POLICY_COND_DBE = 0x1 DCGM_POLICY_COND_PCI = 0x2 DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4 @@ -1251,12 +1402,13 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_POLICY_ISOLATION_NONE = 0 DCGM_POLICY_ACTION_NONE = 0 -DCGM_POLICY_ACTION_GPURESET = 1 # Deprecated +DCGM_POLICY_ACTION_GPURESET = 1 #Deprecated DCGM_POLICY_VALID_NONE = 0 DCGM_POLICY_VALID_SV_SHORT = 1 DCGM_POLICY_VALID_SV_MED = 2 DCGM_POLICY_VALID_SV_LONG = 3 +DCGM_POLICY_VALID_SV_XLONG = 4 DCGM_POLICY_FAILURE_NONE = 0 @@ -1264,6 +1416,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_DIAG_LVL_SHORT = 10 DCGM_DIAG_LVL_MED = 20 DCGM_DIAG_LVL_LONG = 30 +DCGM_DIAG_LVL_XLONG = 40 DCGM_DIAG_RESULT_PASS = 0 DCGM_DIAG_RESULT_SKIP = 1 @@ -1272,7 +1425,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_DIAG_RESULT_NOT_RUN = 4 -class c_dcgmPolicyConditionParmTypes_t(Union): +class 
c_dcgmPolicyConditionParmTypes_t(DcgmUnion): _fields_ = [ ("boolean", c_bool), ("llval", c_longlong), @@ -1280,7 +1433,7 @@ class c_dcgmPolicyConditionParmTypes_t(Union): class c_dcgmPolicyConditionParms_t(_PrintableStructure): - _fields_ = [("tag", c_uint), ("val", c_dcgmPolicyConditionParmTypes_t)] + _fields_ = [('tag', c_uint), ('val', c_dcgmPolicyConditionParmTypes_t)] class c_dcgmPolicy_v1(_PrintableStructure): @@ -1303,39 +1456,39 @@ class c_dcgmPolicy_v1(_PrintableStructure): class c_dcgmPolicyConditionPci_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("counter", c_uint), # value of the PCIe replay counter + ("counter", c_uint) # value of the PCIe replay counter ] class c_dcgmPolicyConditionDbe_t(_PrintableStructure): - LOCATIONS = {"L1": 0, "L2": 1, "DEVICE": 2, "REGISTER": 3, "TEXTURE": 4} + LOCATIONS = {'L1': 0, 'L2': 1, 'DEVICE': 2, 'REGISTER': 3, 'TEXTURE': 4} _fields_ = [ - ("timestamp", c_longlong), # timestamp of the error + ("timestamp", c_longlong), # timestamp of the error ("location", c_int), # location of the error (one of self.LOCATIONS) - ("numerrors", c_uint), # number of errors + ("numerrors", c_uint) # number of errors ] class c_dcgmPolicyConditionMpr_t(_PrintableStructure): _fields_ = [ - ("timestamp", c_longlong), # timestamp of the error - ("sbepages", c_uint), # number of pending pages due to SBE - ("dbepages", c_uint), # number of pending pages due to DBE + ("timestamp", c_longlong), # timestamp of the error + ("sbepages", c_uint), # number of pending pages due to SBE + ("dbepages", c_uint) # number of pending pages due to DBE ] class c_dcgmPolicyConditionThermal_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("thermalViolation", c_uint), # Temperature reached that violated policy + ("thermalViolation", c_uint) # Temperature reached that violated policy ] class c_dcgmPolicyConditionPower_t(_PrintableStructure): _fields_ = [ ("timestamp", 
c_longlong), # timestamp of the error - ("powerViolation", c_uint), # Power value reached that violated policyy + ("powerViolation", c_uint) # Power value reached that violated policyy ] @@ -1343,59 +1496,49 @@ class c_dcgmPolicyConditionNvlink_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error ("fieldId", c_ushort), # FieldId of the nvlink error counter - ("counter", c_uint), # Error value reached that violated policyy + ("counter", c_uint) # Error value reached that violated policyy ] class c_dcgmPolicyConditionXID_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("errnum", c_uint), # XID error number + ("errnum", c_uint) # XID error number ] class c_dcgmPolicyCallbackResponse_v1(_PrintableStructure): - class Value(Union): + + class Value(DcgmUnion): # implement more of the fields when a test requires them _fields_ = [ - ("dbe", c_dcgmPolicyConditionDbe_t), # ECC DBE return structure - ("pci", c_dcgmPolicyConditionPci_t), # PCI replay error return structure - ( - "mpr", - c_dcgmPolicyConditionMpr_t, - ), # Max retired pages limit return structure - ( - "thermal", - c_dcgmPolicyConditionThermal_t, - ), # Thermal policy violations return structure - ( - "power", - c_dcgmPolicyConditionPower_t, - ), # Power policy violations return structure - ( - "nvlink", - c_dcgmPolicyConditionNvlink_t, + ("dbe", c_dcgmPolicyConditionDbe_t + ), # ECC DBE return structure + ("pci", c_dcgmPolicyConditionPci_t + ), # PCI replay error return structure + ("mpr", c_dcgmPolicyConditionMpr_t + ), # Max retired pages limit return structure + ("thermal", c_dcgmPolicyConditionThermal_t + ), # Thermal policy violations return structure + ("power", c_dcgmPolicyConditionPower_t + ), # Power policy violations return structure + ("nvlink", c_dcgmPolicyConditionNvlink_t ), # Nvlink policy violations return structure.. 
- ( - "xid", - c_dcgmPolicyConditionXID_t, - ), # XID policy violations return structure + ("xid", c_dcgmPolicyConditionXID_t + ) # XID policy violations return structure ] _fields_ = [ ("version", c_uint), ("condition", c_int), # an OR'ed list of DCGM_POLICY_COND_* - ("val", Value), + ("val", Value) ] -class c_dcgmFieldValue_v1_value(Union): - _fields_ = [ - ("i64", c_int64), - ("dbl", c_double), - ("str", c_char * DCGM_MAX_STR_LENGTH), - ("blob", c_byte * DCGM_MAX_BLOB_LENGTH), - ] +class c_dcgmFieldValue_v1_value(DcgmUnion): + _fields_ = [('i64', c_int64), ('dbl', c_double), + ('str', c_char * DCGM_MAX_STR_LENGTH), + ('blob', c_byte * DCGM_MAX_BLOB_LENGTH)] # This structure is used to represent value for the field to be queried. @@ -1414,8 +1557,7 @@ class c_dcgmFieldValue_v1(_PrintableStructure): dcgmFieldValue_version1 = make_dcgm_version(c_dcgmFieldValue_v1, 1) -# This structure is used to represent value for the field to be queried -# (version 2) +# This structure is used to represent value for the field to be queried (version 2) class c_dcgmFieldValue_v2(_PrintableStructure): _fields_ = [ # version must always be first @@ -1433,7 +1575,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure): dcgmFieldValue_version2 = make_dcgm_version(c_dcgmFieldValue_v2, 2) -# Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues() +#Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues() DCGM_FV_FLAG_LIVE_DATA = 0x00000001 DCGM_HEALTH_WATCH_PCIE = 0x1 @@ -1458,7 +1600,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure): class c_dcgmDiagErrorDetail_t(_PrintableStructure): - _fields_ = [("msg", c_char * 1024), ("code", c_uint)] + _fields_ = [('msg', c_char * 1024), ('code', c_uint)] DCGM_HEALTH_WATCH_MAX_INCIDENTS = DCGM_GROUP_MAX_ENTITIES @@ -1486,37 +1628,36 @@ class c_dcgmHealthResponse_v4(_PrintableStructure): class c_dcgmHealthSetParams_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("systems", c_uint32), - 
("updateInterval", c_int64), - ("maxKeepAge", c_double), - ] + _fields_ = [('version', c_uint32), ('groupId', c_void_p), + ('systems', c_uint32), ('updateInterval', c_int64), + ('maxKeepAge', c_double)] dcgmHealthSetParams_version2 = make_dcgm_version(c_dcgmHealthSetParams_v2, 2) -# Pid info structs +#Pid info structs class c_dcgmStatSummaryInt64_t(_PrintableStructure): - _fields_ = [("minValue", c_int64), ("maxValue", c_int64), ("average", c_int64)] + _fields_ = [('minValue', c_int64), ('maxValue', c_int64), + ('average', c_int64)] class c_dcgmStatSummaryInt32_t(_PrintableStructure): - _fields_ = [("minValue", c_int32), ("maxValue", c_int32), ("average", c_int32)] + _fields_ = [('minValue', c_int32), ('maxValue', c_int32), + ('average', c_int32)] class c_dcgmStatSummaryFp64_t(_PrintableStructure): - _fields_ = [("minValue", c_double), ("maxValue", c_double), ("average", c_double)] + _fields_ = [('minValue', c_double), ('maxValue', c_double), + ('average', c_double)] class c_dcgmProcessUtilInfo_t(_PrintableStructure): - _fields_ = [("pid", c_uint), ("smUtil", c_double), ("memUtil", c_double)] + _fields_ = [('pid', c_uint), ('smUtil', c_double), ('memUtil', c_double)] class c_dcgmHealthResponseInfo_t(_PrintableStructure): - _fields_ = [("system", c_uint), ("health", c_uint)] + _fields_ = [('system', c_uint), ('health', c_uint)] DCGM_MAX_PID_INFO_NUM = 16 @@ -1524,167 +1665,153 @@ class c_dcgmHealthResponseInfo_t(_PrintableStructure): class c_dcgmPidSingleInfo_t(_PrintableStructure): _fields_ = [ - ("gpuId", c_uint32), - ("energyConsumed", c_int64), - ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieReplays", c_int64), - ("startTime", c_int64), - ("endTime", c_int64), - ("processUtilization", c_dcgmProcessUtilInfo_t), - ("smUtilization", c_dcgmStatSummaryInt32_t), - ("memoryUtilization", c_dcgmStatSummaryInt32_t), - ("eccSingleBit", c_uint32), # Deprecated - ("eccDoubleBit", c_uint32), - ("memoryClock", 
c_dcgmStatSummaryInt32_t), - ("smClock", c_dcgmStatSummaryInt32_t), - ("numXidCriticalErrors", c_int32), - ("xidCriticalErrorsTs", c_int64 * 10), - ("numOtherComputePids", c_int32), - ("otherComputePids", c_uint32 * DCGM_MAX_PID_INFO_NUM), - ("numOtherGraphicsPids", c_int32), - ("otherGraphicsPids", c_uint32 * DCGM_MAX_PID_INFO_NUM), - ("maxGpuMemoryUsed", c_int64), - ("powerViolationTime", c_int64), - ("thermalViolationTime", c_int64), - ("reliabilityViolationTime", c_int64), - ("boardLimitViolationTime", c_int64), - ("lowUtilizationTime", c_int64), - ("syncBoostTime", c_int64), - ("overallHealth", c_uint), - ("incidentCount", c_uint), - ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1), + ('gpuId', c_uint32), + ('energyConsumed', c_int64), + ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieReplays', c_int64), + ('startTime', c_int64), + ('endTime', c_int64), + ('processUtilization', c_dcgmProcessUtilInfo_t), + ('smUtilization', c_dcgmStatSummaryInt32_t), + ('memoryUtilization', c_dcgmStatSummaryInt32_t), + ('eccSingleBit', c_uint32), #Deprecated + ('eccDoubleBit', c_uint32), + ('memoryClock', c_dcgmStatSummaryInt32_t), + ('smClock', c_dcgmStatSummaryInt32_t), + ('numXidCriticalErrors', c_int32), + ('xidCriticalErrorsTs', c_int64 * 10), + ('numOtherComputePids', c_int32), + ('otherComputePids', c_uint32 * DCGM_MAX_PID_INFO_NUM), + ('numOtherGraphicsPids', c_int32), + ('otherGraphicsPids', c_uint32 * DCGM_MAX_PID_INFO_NUM), + ('maxGpuMemoryUsed', c_int64), + ('powerViolationTime', c_int64), + ('thermalViolationTime', c_int64), + ('reliabilityViolationTime', c_int64), + ('boardLimitViolationTime', c_int64), + ('lowUtilizationTime', c_int64), + ('syncBoostTime', c_int64), + ('overallHealth', c_uint), + ('incidentCount', c_uint), + ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1) ] class c_dcgmPidInfo_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("pid", 
c_uint32), - ("unused", c_uint32), - ("numGpus", c_int32), - ("summary", c_dcgmPidSingleInfo_t), - ("gpus", c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES), - ] + _fields_ = [('version', c_uint32), ('pid', c_uint32), ('unused', c_uint32), + ('numGpus', c_int32), ('summary', c_dcgmPidSingleInfo_t), + ('gpus', c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES)] dcgmPidInfo_version2 = make_dcgm_version(c_dcgmPidInfo_v2, 2) class c_dcgmRunningProcess_v1(_PrintableStructure): - _fields_ = [("version", c_uint32), ("pid", c_uint32), ("memoryUsed", c_uint64)] + _fields_ = [('version', c_uint32), ('pid', c_uint32), + ('memoryUsed', c_uint64)] dcgmRunningProcess_version1 = make_dcgm_version(c_dcgmRunningProcess_v1, 1) +c_dcgmRunningProcess_t = c_dcgmRunningProcess_v1 + class c_dcgmGpuUsageInfo_t(_PrintableStructure): _fields_ = [ - ("gpuId", c_uint32), - ("energyConsumed", c_int64), - ("powerUsage", c_dcgmStatSummaryFp64_t), - ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieReplays", c_int64), - ("startTime", c_int64), - ("endTime", c_int64), - ("smUtilization", c_dcgmStatSummaryInt32_t), - ("memoryUtilization", c_dcgmStatSummaryInt32_t), - ("eccSingleBit", c_uint32), # Deprecated - ("eccDoubleBit", c_uint32), - ("memoryClock", c_dcgmStatSummaryInt32_t), - ("smClock", c_dcgmStatSummaryInt32_t), - ("numXidCriticalErrors", c_int32), - ("xidCriticalErrorsTs", c_int64 * 10), - ("numComputePids", c_int32), - ("computePids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), - ("numGraphicsPids", c_int32), - ("graphicsPids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), - ("maxGpuMemoryUsed", c_int64), - ("powerViolationTime", c_int64), - ("thermalViolationTime", c_int64), - ("reliabilityViolationTime", c_int64), - ("boardLimitViolationTime", c_int64), - ("lowUtilizationTime", c_int64), - ("syncBoostTime", c_int64), - ("overallHealth", c_uint), - ("incidentCount", c_uint), - ("systems", c_dcgmHealthResponseInfo_t * 
DCGM_HEALTH_WATCH_COUNT_V1), + ('gpuId', c_uint32), + ('energyConsumed', c_int64), + ('powerUsage', c_dcgmStatSummaryFp64_t), + ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieReplays', c_int64), + ('startTime', c_int64), + ('endTime', c_int64), + ('smUtilization', c_dcgmStatSummaryInt32_t), + ('memoryUtilization', c_dcgmStatSummaryInt32_t), + ('eccSingleBit', c_uint32), #Deprecated + ('eccDoubleBit', c_uint32), + ('memoryClock', c_dcgmStatSummaryInt32_t), + ('smClock', c_dcgmStatSummaryInt32_t), + ('numXidCriticalErrors', c_int32), + ('xidCriticalErrorsTs', c_int64 * 10), + ('numComputePids', c_int32), + ('computePids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), + ('numGraphicsPids', c_int32), + ('graphicsPids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), + ('maxGpuMemoryUsed', c_int64), + ('powerViolationTime', c_int64), + ('thermalViolationTime', c_int64), + ('reliabilityViolationTime', c_int64), + ('boardLimitViolationTime', c_int64), + ('lowUtilizationTime', c_int64), + ('syncBoostTime', c_int64), + ('overallHealth', c_uint), + ('incidentCount', c_uint), + ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1) ] class c_dcgmJobInfo_v3(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numGpus", c_int32), - ("summary", c_dcgmGpuUsageInfo_t), - ("gpus", c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES), - ] + _fields_ = [('version', c_uint32), ('numGpus', c_int32), + ('summary', c_dcgmGpuUsageInfo_t), + ('gpus', c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES)] dcgmJobInfo_version3 = make_dcgm_version(c_dcgmJobInfo_v3, 3) class c_dcgmDiagTestResult_v2(_PrintableStructure): - _fields_ = [ - ("result", c_uint), - ("error", c_dcgmDiagErrorDetail_t), - ("info", c_char * 1024), - ] + _fields_ = [('result', c_uint), ('error', c_dcgmDiagErrorDetail_t), + ('info', c_char * 1024)] -class c_dcgmDiagResponsePerGpu_v2(_PrintableStructure): - _fields_ = [ - ("gpuId", c_uint), - 
("hwDiagnosticReturn", c_uint), - ("results", c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT), - ] +class c_dcgmDiagResponsePerGpu_v4(_PrintableStructure): + _fields_ = [('gpuId', c_uint), ('hwDiagnosticReturn', c_uint), + ('results', + c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT_V8)] DCGM_SWTEST_COUNT = 10 LEVEL_ONE_MAX_RESULTS = 16 -class c_dcgmDiagResponse_v6(_PrintableStructure): +class c_dcgmDiagResponse_v8(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("gpuCount", c_uint), - ("levelOneTestCount", c_uint), - ("levelOneResults", c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS), - ("perGpuResponses", c_dcgmDiagResponsePerGpu_v2 * DCGM_MAX_NUM_DEVICES), - ("systemError", c_dcgmDiagErrorDetail_t), - ("trainingMsg", c_char * 1024), + ('version', c_uint), ('gpuCount', c_uint), + ('levelOneTestCount', c_uint), + ('levelOneResults', c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS), + ('perGpuResponses', c_dcgmDiagResponsePerGpu_v4 * DCGM_MAX_NUM_DEVICES), + ('systemError', c_dcgmDiagErrorDetail_t), ('_unused', c_char * 1024) ] -dcgmDiagResponse_version6 = make_dcgm_version(c_dcgmDiagResponse_v6, 6) +dcgmDiagResponse_version8 = make_dcgm_version(c_dcgmDiagResponse_v8, 8) DCGM_AFFINITY_BITMASK_ARRAY_SIZE = 8 class c_dcgmDeviceTopologyPath_t(_PrintableStructure): - _fields_ = [("gpuId", c_uint32), ("path", c_uint32), ("localNvLinkIds", c_uint32)] + _fields_ = [('gpuId', c_uint32), ('path', c_uint32), + ('localNvLinkIds', c_uint32)] class c_dcgmDeviceTopology_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("cpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), - ("numGpus", c_uint32), - ("gpuPaths", c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1)), - ] + _fields_ = [('version', c_uint32), + ('cpuAffinityMask', c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), + ('numGpus', c_uint32), + ('gpuPaths', + c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1))] dcgmDeviceTopology_version1 = 
make_dcgm_version(c_dcgmDeviceTopology_v1, 1) class c_dcgmGroupTopology_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("groupCpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), - ("numaOptimalFlag", c_uint32), - ("slowestPath", c_uint32), - ] + _fields_ = [('version', c_uint32), + ('groupCpuAffinityMask', + c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), + ('numaOptimalFlag', c_uint32), ('slowestPath', c_uint32)] dcgmGroupTopology_version1 = make_dcgm_version(c_dcgmGroupTopology_v1, 1) @@ -1697,202 +1824,50 @@ class c_dcgmGroupTopology_v1(_PrintableStructure): class c_dcgmFieldGroupInfo_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numFieldIds", c_uint32), - ("fieldGroupId", c_void_p), - ("fieldGroupName", c_char * DCGM_MAX_STR_LENGTH), - ("fieldIds", c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP), - ] + _fields_ = [('version', c_uint32), ('numFieldIds', c_uint32), + ('fieldGroupId', c_void_p), + ('fieldGroupName', c_char * DCGM_MAX_STR_LENGTH), + ('fieldIds', c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP)] dcgmFieldGroupInfo_version1 = make_dcgm_version(c_dcgmFieldGroupInfo_v1, 1) class c_dcgmAllFieldGroup_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numFieldGroups", c_uint32), - ("fieldGroups", c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS), - ] + _fields_ = [('version', c_uint32), ('numFieldGroups', c_uint32), + ('fieldGroups', + c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS)] dcgmAllFieldGroup_version1 = make_dcgm_version(c_dcgmAllFieldGroup_v1, 1) -class DCGM_INTROSPECT_LVL(object): - """ - Identifies a level to retrieve field introspection info for - """ - - INVALID = 0 - FIELD = 1 - FIELD_GROUP = 2 - ALL_FIELDS = 3 - - -class c_dcgmIntrospectContext_v1(_PrintableStructure): - """ - Identifies the retrieval context for introspection API calls. - """ - - _fields_ = [ - ("version", c_uint32), - # one of DCGM_INTROSPECT_LVL_? 
- ("introspectLvl", c_int), - # Only needed if \ref introspectLvl is FIELD_GROUP - ("fieldGroupId", c_void_p), - ] - - -dcgmIntrospectContext_version1 = make_dcgm_version(c_dcgmIntrospectContext_v1, 1) - - class c_dcgmIntrospectMemory_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ( - # The total number of bytes being used to store all of the fields - # being watched - "bytesUsed", - c_longlong, - ), + ('version', c_uint32), + ('bytesUsed', c_longlong + ) # The total number of bytes being used to store all of the fields being watched ] dcgmIntrospectMemory_version1 = make_dcgm_version(c_dcgmIntrospectMemory_v1, 1) -class c_dcgmIntrospectFieldsExecTime_v1(_PrintableStructure): - _fields_ = [ - ( - # version number (dcgmIntrospectFieldsExecTime_version) - "version", - c_uint32, - ), - ( - # the mean update frequency of all fields - "meanUpdateFreqUsec", - c_longlong, - ), - ( - # the sum of every field's most recent execution time after they - # have been normalized to \ref meanUpdateFreqUsec. 
- # This is roughly how long it takes to update fields every \ref - # meanUpdateFreqUsec - "recentUpdateUsec", - c_double, - ), - ( - # The total amount of time, ever, that has been spent updating all - # the fields - "totalEverUpdateUsec", - c_longlong, - ), - ] - - -dcgmIntrospectFieldsExecTime_version1 = make_dcgm_version( - c_dcgmIntrospectFieldsExecTime_v1, 1 -) - - -class c_dcgmIntrospectFullFieldsExecTime_v2(_PrintableStructure): - """ - Full introspection info for field execution time - """ - - _fields_ = [ - ("version", c_uint32), - ( - "aggregateInfo", - c_dcgmIntrospectFieldsExecTime_v1, - ), # info that includes global and device scope - ( - "hasGlobalInfo", - c_int, - ), # 0 means \ref globalInfo is populated, !0 means it's not - ( - "globalInfo", - c_dcgmIntrospectFieldsExecTime_v1, - ), # info that only includes global field scope - ( - "gpuInfoCount", - c_uint, - ), # count of how many entries in \ref gpuInfo are populated - ( - "gpuIdsForGpuInfo", - c_uint * DCGM_MAX_NUM_DEVICES, - ), # the GPU ID at a given index identifies which gpu - # the corresponding entry in \ref gpuInfo is from - ( - "gpuInfo", - c_dcgmIntrospectFieldsExecTime_v1 * DCGM_MAX_NUM_DEVICES, - ), # info that is separated by the - # GPU ID that the watches were for - ] - - -dcgmIntrospectFullFieldsExecTime_version2 = make_dcgm_version( - c_dcgmIntrospectFullFieldsExecTime_v2, 2 -) - - -class c_dcgmIntrospectFullMemory_v1(_PrintableStructure): - """ - Full introspection info for field memory - """ - - _fields_ = [ - ("version", c_uint32), - ( - "aggregateInfo", - c_dcgmIntrospectMemory_v1, - ), # info that includes global and device scope - ( - "hasGlobalInfo", - c_int, - ), # 0 means \ref globalInfo is populated, !0 means it's not - ( - "globalInfo", - c_dcgmIntrospectMemory_v1, - ), # info that only includes global field scope - ( - "gpuInfoCount", - c_uint, - ), # count of how many entries in \ref gpuInfo are populated - ( - "gpuIdsForGpuInfo", - c_uint * DCGM_MAX_NUM_DEVICES, 
- ), # the GPU ID at a given index identifies which gpu - # the corresponding entry in \ref gpuInfo is from - ( - "gpuInfo", - c_dcgmIntrospectMemory_v1 * DCGM_MAX_NUM_DEVICES, - ), # info that is separated by the - # GPU ID that the watches were for - ] - - -dcgmIntrospectFullMemory_version1 = make_dcgm_version(c_dcgmIntrospectFullMemory_v1, 1) - - class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint32), # version number (dcgmIntrospectCpuUtil_version) - ("total", c_double), # fraction of device's CPU resources that were used - ( - "kernel", - c_double, - ), # fraction of device's CPU resources that were used in kernel mode - ( - "user", - c_double, - ), # fraction of device's CPU resources that were used in user mode + ('version', c_uint32 + ), #!< version number (dcgmIntrospectCpuUtil_version) + ('total', c_double + ), #!< fraction of device's CPU resources that were used + ('kernel', c_double + ), #!< fraction of device's CPU resources that were used in kernel mode + ('user', c_double + ), #!< fraction of device's CPU resources that were used in user mode ] -dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, 1) +dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, + 1) DCGM_MAX_CONFIG_FILE_LEN = 10000 DCGM_MAX_TEST_NAMES = 20 @@ -1907,229 +1882,173 @@ class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure): # Flags options for running the GPU diagnostic DCGM_RUN_FLAGS_VERBOSE = 0x0001 DCGM_RUN_FLAGS_STATSONFAIL = 0x0002 +# UNUSED DCGM_RUN_FLAGS_TRAIN = 0x0004 +# UNUSED DCGM_RUN_FLAGS_FORCE_TRAIN = 0x0008 -# Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, -# and Diagnostic tests -DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010 +DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010 # Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests -class c_dcgmRunDiag_v6(_PrintableStructure): +class c_dcgmRunDiag_v7(_PrintableStructure): _fields_ 
= [ - ("version", c_uint), # version of this message - ( - # flags specifying binary options for running it. Currently verbose - # and stats on fail - "flags", - c_uint, - ), - ( - "debugLevel", - c_uint, + ('version', c_uint), # version of this message + ('flags', c_uint + ), # flags specifying binary options for running it. Currently verbose and stats on fail + ('debugLevel', c_uint ), # 0-5 for the debug level the GPU diagnostic will use for logging - ( - # group of GPUs to verify. Cannot be specified together with - # gpuList. - "groupId", - c_void_p, - ), - ("validate", c_uint), # 0-3 for which tests to run. Optional. - ( - "testNames", - c_char * DCGM_MAX_TEST_NAMES * DCGM_MAX_TEST_NAMES_LEN, - ), # Specified list of test names. Optional. - ( - # Parameters to set for specified tests in the format: - # testName.parameterName=parameterValue. Optional. - "testParms", - c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN, - ), - ( - # Comma-separated list of gpus. Cannot be specified with the - # groupId. - "gpuList", - c_char * DCGM_GPU_LIST_LEN, - ), - ( - "debugLogFile", - c_char * DCGM_PATH_LEN, + ('groupId', c_void_p + ), # group of GPUs to verify. Cannot be specified together with gpuList. + ('validate', c_uint), # 0-3 for which tests to run. Optional. + ('testNames', c_char * DCGM_MAX_TEST_NAMES * + DCGM_MAX_TEST_NAMES_LEN), # Specifed list of test names. Optional. + ('testParms', c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN + ), # Parameters to set for specified tests in the format: testName.parameterName=parameterValue. Optional. + ('fakeGpuList', c_char * DCGM_GPU_LIST_LEN + ), # Comma-separated list of fake gpus. Cannot be specified with the groupId or gpuList. + ('gpuList', c_char * DCGM_GPU_LIST_LEN + ), # Comma-separated list of gpus. Cannot be specified with the groupId. 
+ ('debugLogFile', c_char * DCGM_PATH_LEN ), # Alternate name for the debug log file that should be used - ( - "statsPath", - c_char * DCGM_PATH_LEN, + ('statsPath', c_char * DCGM_PATH_LEN ), # Path that the plugin's statistics files should be written to - ( - "configFileContents", - c_char * DCGM_MAX_CONFIG_FILE_LEN, + ('configFileContents', c_char * DCGM_MAX_CONFIG_FILE_LEN ), # Contents of nvvs config file (likely yaml) - ( - # Throttle reasons to ignore as either integer mask or csv list of - # reasons - "throttleMask", - c_char * DCGM_THROTTLE_MASK_LEN, - ), - ("pluginPath", c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins - ("trainingValues", c_uint), # Number of iterations for training. - ( - # Acceptable training variance as a percentage of the value. - # (0-100) - "trainingVariance", - c_uint, - ), - ( - # Acceptable training tolerance as a percentage of the value. - # (0-100) - "trainingTolerance", - c_uint, - ), - ( - "goldenValuesFile", - c_char * DCGM_PATH_LEN, - ), # The path where the golden values should be recorded - ( - # How often the fail early checks should occur when - # DCGM_RUN_FLAGS_FAIL_EARLY is set. - "failCheckInterval", - c_uint, - ), + ('throttleMask', c_char * DCGM_THROTTLE_MASK_LEN + ), # Throttle reasons to ignore as either integer mask or csv list of reasons + ('pluginPath', + c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins + ('_unusedInt1', c_uint), # Unused + ('_unusedInt2', c_uint), # Unused + ('_unusedInt3', c_uint), # Unused + ('_unusedBuf', c_char * DCGM_PATH_LEN), # Unused + ('failCheckInterval', c_uint + ), # How often the fail early checks should occur when DCGM_RUN_FLAGS_FAIL_EARLY is set. 
] -dcgmRunDiag_version6 = make_dcgm_version(c_dcgmRunDiag_v6, 6) +dcgmRunDiag_version7 = make_dcgm_version(c_dcgmRunDiag_v7, 7) # Latest c_dcgmRunDiag class -c_dcgmRunDiag_t = c_dcgmRunDiag_v6 +c_dcgmRunDiag_t = c_dcgmRunDiag_v7 # Latest version for dcgmRunDiag_t -dcgmRunDiag_version = dcgmRunDiag_version6 +dcgmRunDiag_version = dcgmRunDiag_version7 -# Flags for dcgmGetEntityGroupEntities's flags parameter -# Only return entities that are supported by DCGM. -DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001 +#Flags for dcgmGetEntityGroupEntities's flags parameter +DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001 #Only return entities that are supported by DCGM. -# Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS -# NVLink link recovery error occurred -DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 -# NVLink link fatal error occurred -DCGM_GPU_NVLINK_ERROR_FATAL = 2 +#Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS +DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 # NVLink link recovery error occurred +DCGM_GPU_NVLINK_ERROR_FATAL = 2 # NVLink link fatal error occurred # Topology hints for dcgmSelectGpusByTopology() DCGM_TOPO_HINT_F_NONE = 0x00000000 # No hints specified -# Ignore the health of the GPUs when picking GPUs for job execution. -DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001 +DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001 # Ignore the health of the GPUs when picking GPUs for job execution. # By default, only healthy GPUs are considered. 
class c_dcgmTopoSchedHint_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), # version of this message - ("inputGpuIds", c_uint64), # bitmask of the GPU ids to choose from - ("numGpus", c_uint32), # the number of GPUs that DCGM should choose - ( - "hintFlags", - c_uint64, - ), # Hints to ignore certain factors for the scheduling hint + ('version', c_uint), # version of this message + ('inputGpuIds', c_uint64), # bitmask of the GPU ids to choose from + ('numGpus', c_uint32), # the number of GPUs that DCGM should chooose + ('hintFlags', + c_uint64), # Hints to ignore certain factors for the scheduling hint ] dcgmTopoSchedHint_version1 = make_dcgm_version(c_dcgmTopoSchedHint_v1, 1) -# DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and -# c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field -# NvLink is unsupported by this GPU (Default for GPUs) -DcgmNvLinkLinkStateNotSupported = 0 -# NvLink is supported for this link but this link is disabled (Default for -# NvSwitches) -DcgmNvLinkLinkStateDisabled = 1 -# This NvLink link is down (inactive) -DcgmNvLinkLinkStateDown = 2 -# This NvLink link is up (active) -DcgmNvLinkLinkStateUp = 3 +#DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field +DcgmNvLinkLinkStateNotSupported = 0 # NvLink is unsupported by this GPU (Default for GPUs) +DcgmNvLinkLinkStateDisabled = 1 # NvLink is supported for this link but this link is disabled (Default for NvSwitches) +DcgmNvLinkLinkStateDown = 2 # This NvLink link is down (inactive) +DcgmNvLinkLinkStateUp = 3 # This NvLink link is up (active) # State of NvLink links for a GPU class c_dcgmNvLinkGpuLinkStatus_v1(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the GPU (gpuId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1, - ), # Link state of each link of this GPU + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * 
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 + ), #Link state of each link of this GPU ] # State of NvLink links for a GPU class c_dcgmNvLinkGpuLinkStatus_v2(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the GPU (gpuId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU, - ), # Link state of each link of this GPU + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 + ), #Link state of each link of this GPU + ] + + +class c_dcgmNvLinkGpuLinkStatus_v3(_PrintableStructure): + _fields_ = [ + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * + DCGM_NVLINK_MAX_LINKS_PER_GPU), #Link state of each link of this GPU ] -# State of NvLink links for a NvSwitch -class c_dcgmNvLinkNvSwitchLinkStatus_t(_PrintableStructure): +#State of NvLink links for a NvSwitch +class c_dcgmNvLinkNvSwitchLinkStatus_v1(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the NvSwitch (physicalId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH, - ), # Link state of each link of this NvSwitch + ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 + ) #Link state of each link of this NvSwitch ] -class c_dcgmNvLinkStatus_v1(_PrintableStructure): +class c_dcgmNvLinkStatus_v2(_PrintableStructure): """ NvSwitch link status for all GPUs and NvSwitches in the system """ _fields_ = [ - ( - "version", - c_uint32, + ('version', c_uint32 ), # version of this message. 
Should be dcgmNvLinkStatus_version1 - ("numGpus", c_uint32), # Number of GPUs populated in gpus[] - ( - "gpus", - c_dcgmNvLinkGpuLinkStatus_v1 * DCGM_MAX_NUM_DEVICES, - ), # Per-GPU NvLink link statuses - ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[] - ( - "nvSwitches", - c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES, - ), # Per-NvSwitch NvLink link statuses + ('numGpus', c_uint32), # Number of GPUs populated in gpus[] + ('gpus', c_dcgmNvLinkGpuLinkStatus_v2 * + DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses + ('numNvSwitches', + c_uint32), # Number of NvSwitches populated in nvSwitches[] + ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v1 * DCGM_MAX_NUM_SWITCHES + ) #Per-NvSwitch NvLink link statuses ] -dcgmNvLinkStatus_version1 = make_dcgm_version(c_dcgmNvLinkStatus_v1, 1) +dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2) -class c_dcgmNvLinkStatus_v2(_PrintableStructure): - """ - NvSwitch link status for all GPUs and NvSwitches in the system - """ +#State of NvLink links for a NvSwitch +class c_dcgmNvLinkNvSwitchLinkStatus_v2(_PrintableStructure): + _fields_ = [ + ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH + ) #Link state of each link of this NvSwitch + ] + +class c_dcgmNvLinkStatus_v3(_PrintableStructure): + ''' + NvSwitch link status for all GPUs and NvSwitches in the system + ''' _fields_ = [ - ( - "version", - c_uint32, + ('version', c_uint32 ), # version of this message. 
Should be dcgmNvLinkStatus_version1 - ("numGpus", c_uint32), # Number of GPUs populated in gpus[] - ( - "gpus", - c_dcgmNvLinkGpuLinkStatus_v2 * DCGM_MAX_NUM_DEVICES, - ), # Per-GPU NvLink link statuses - ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[] - ( - "nvSwitches", - c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES, - ), # Per-NvSwitch NvLink link statuses + ('numGpus', c_uint32), # Number of GPUs populated in gpus[] + ('gpus', c_dcgmNvLinkGpuLinkStatus_v3 * + DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses + ('numNvSwitches', + c_uint32), # Number of NvSwitches populated in nvSwitches[] + ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v2 * DCGM_MAX_NUM_SWITCHES + ) #Per-NvSwitch NvLink link statuses ] -dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2) +dcgmNvLinkStatus_version3 = make_dcgm_version(c_dcgmNvLinkStatus_v3, 3) # Bitmask values for dcgmGetFieldIdSummary DCGM_SUMMARY_MIN = 0x00000001 @@ -2143,7 +2062,8 @@ class c_dcgmNvLinkStatus_v2(_PrintableStructure): class c_dcgmSummaryResponse_t(_PrintableStructure): - class ResponseValue(Union): + + class ResponseValue(DcgmUnion): _fields_ = [ ("i64", c_int64), ("dbl", c_double), @@ -2169,7 +2089,8 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure): ] -dcgmFieldSummaryRequest_version1 = make_dcgm_version(c_dcgmFieldSummaryRequest_v1, 1) +dcgmFieldSummaryRequest_version1 = make_dcgm_version( + c_dcgmFieldSummaryRequest_v1, 1) # Module IDs DcgmModuleIdCore = 0 # Core DCGM @@ -2184,90 +2105,61 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure): DcgmModuleIdCount = 9 # 1 greater than largest ID above # Module Status -# Module has not been loaded yet -DcgmModuleStatusNotLoaded = 0 -# Module has been blacklisted from being loaded -DcgmModuleStatusBlacklisted = 1 -# Loading the module failed -DcgmModuleStatusFailed = 2 -# Module has been loaded -DcgmModuleStatusLoaded = 3 +DcgmModuleStatusNotLoaded = 0 # Module has not been loaded yet 
+DcgmModuleStatusDenylisted = 1 # Module has been added to the denylist so it can't be loaded +DcgmModuleStatusFailed = 2 # Loading the module failed +DcgmModuleStatusLoaded = 3 # Module has been loaded +DcgmModuleStatusUnloaded = 4 # Module has been unloaded +DcgmModuleStatusPaused = 5 # Module has been paused. Implies it's been loaded DCGM_MODULE_STATUSES_CAPACITY = 16 class c_dcgmModuleGetStatusesModule_t(_PrintableStructure): _fields_ = [ - ("id", c_uint32), # One of DcgmModuleId* - ("status", c_uint32), # One of DcgmModuleStatus* + ('id', c_uint32), #One of DcgmModuleId* + ('status', c_uint32), #One of DcgmModuleStatus* ] class c_dcgmModuleGetStatuses_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("numStatuses", c_uint32), - ("statuses", c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY), - ] - - -dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, 1) - -# Maximum number of metric ID groups that can exist in DCGM -DCGM_PROF_MAX_NUM_GROUPS = 10 -# Maximum number of field IDs that can be in a single DCGM profiling metric -# group -DCGM_PROF_MAX_FIELD_IDS_PER_GROUP = 8 - - -class c_dcgmProfMetricGroupInfo_t(_PrintableStructure): - _fields_ = [ - ("majorId", c_ushort), - ("minorId", c_ushort), - ("numFieldIds", c_uint32), - ("fieldIds", c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP), + ('version', c_uint), + ('numStatuses', c_uint32), + ('statuses', + c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY), ] -class c_dcgmProfGetMetricGroups_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("unused", c_uint32), - ("groupId", c_void_p), - ("numMetricGroups", c_uint32), - ("unused1", c_uint32), - ("metricGroups", c_dcgmProfMetricGroupInfo_t * DCGM_PROF_MAX_NUM_GROUPS), - ] +dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, + 1) +DCGM_PROF_MAX_NUM_GROUPS_V2 = 10 # Maximum number of metric ID groups that can exist in DCGM 
+DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 = 64 # Maximum number of field IDs that can be in a single DCGM profiling metric group -dcgmProfGetMetricGroups_version1 = make_dcgm_version(c_dcgmProfGetMetricGroups_v2, 2) - -class c_dcgmProfWatchFields_v1(_PrintableStructure): +class c_dcgmProfMetricGroupInfo_v2(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("numFieldIds", c_uint32), - ("fieldIds", c_ushort * 16), - ("updateFreq", c_int64), - ("maxKeepAge", c_double), - ("maxKeepSamples", c_int32), - ("flags", c_uint32), + ('majorId', c_ushort), + ('minorId', c_ushort), + ('numFieldIds', c_uint32), + ('fieldIds', c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2), ] -dcgmProfWatchFields_version1 = make_dcgm_version(c_dcgmProfWatchFields_v1, 1) - - -class c_dcgmProfUnwatchFields_v1(_PrintableStructure): +class c_dcgmProfGetMetricGroups_v3(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("flags", c_uint32), + ('version', c_uint32), + ('unused', c_uint32), + ('gpuId', c_uint32), + ('numMetricGroups', c_uint32), + ('metricGroups', + c_dcgmProfMetricGroupInfo_v2 * DCGM_PROF_MAX_NUM_GROUPS_V2), ] -dcgmProfUnwatchFields_version1 = make_dcgm_version(c_dcgmProfUnwatchFields_v1, 1) +dcgmProfGetMetricGroups_version3 = make_dcgm_version( + c_dcgmProfGetMetricGroups_v3, 3) class c_dcgmVersionInfo_v2(_PrintableStructure): diff --git a/model_analyzer/monitor/dcgm/dcgm_telegraf.py b/model_analyzer/monitor/dcgm/dcgm_telegraf.py new file mode 100644 index 000000000..63563662e --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_telegraf.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from model_analyzer.monitor.dcgm.common.dcgm_client_main import main +from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader +from socket import socket, AF_INET, SOCK_DGRAM + +# Displayed to the user +TELEGRAF_NAME = 'Telegraf' +DEFAULT_TELEGRAF_PORT = 8094 + +# Telegraf Configuration +# ====================== +# +# In order for Telegraf to understand the format of the data sent by this +# module, it needs to be configured with the input plugin below +# +# If you modify the list of published fields, you will need to add non-numeric +# ones as tag_keys for Telegraf to store them +# +# [[inputs.socket_listener]] +# name_override = "dcgm" +# service_address = "udp://:8094" +# data_format = "json" +# tag_keys = [ +# "compute_pids", +# "driver_version", +# "gpu_uuid", +# "nvml_version", +# "process_name", +# "xid_errors" +# ] + + +class DcgmTelegraf(DcgmJsonReader): + ########################################################################### + def __init__(self, publish_hostname, publish_port, **kwargs): + self.m_sock = socket(AF_INET, SOCK_DGRAM) + self.m_dest = (publish_hostname, publish_port) + super(DcgmTelegraf, self).__init__(**kwargs) + + ########################################################################### + def SendToTelegraf(self, payload): + self.m_sock.sendto(payload, self.m_dest) + + ########################################################################### + def CustomJsonHandler(self, outJson): + self.SendToTelegraf(outJson) + + +if __name__ == '__main__': # pragma: no cover + main(DcgmTelegraf, + TELEGRAF_NAME, + 
DEFAULT_TELEGRAF_PORT, + add_target_host=True) diff --git a/model_analyzer/monitor/dcgm/dcgmvalue.py b/model_analyzer/monitor/dcgm/dcgmvalue.py new file mode 100644 index 000000000..d26625d50 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgmvalue.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Base value for integer blank. can be used as an unspecified blank +DCGM_INT32_BLANK = 0x7ffffff0 +DCGM_INT64_BLANK = 0x7ffffffffffffff0 + +# Base value for double blank. 2 ** 47. 
FP 64 has 52 bits of mantissa, +#so 47 bits can still increment by 1 and represent each value from 0-15 +DCGM_FP64_BLANK = 140737488355328.0 + +DCGM_STR_BLANK = "<<>>" + +# Represents an error where data was not found +DCGM_INT32_NOT_FOUND = (DCGM_INT32_BLANK + 1) +DCGM_INT64_NOT_FOUND = (DCGM_INT64_BLANK + 1) +DCGM_FP64_NOT_FOUND = (DCGM_FP64_BLANK + 1.0) +DCGM_STR_NOT_FOUND = "<<>>" + +# Represents an error where fetching the value is not supported +DCGM_INT32_NOT_SUPPORTED = (DCGM_INT32_BLANK + 2) +DCGM_INT64_NOT_SUPPORTED = (DCGM_INT64_BLANK + 2) +DCGM_FP64_NOT_SUPPORTED = (DCGM_FP64_BLANK + 2.0) +DCGM_STR_NOT_SUPPORTED = "<<>>" + +# Represents and error where fetching the value is not allowed with our current credentials +DCGM_INT32_NOT_PERMISSIONED = (DCGM_INT32_BLANK + 3) +DCGM_INT64_NOT_PERMISSIONED = (DCGM_INT64_BLANK + 3) +DCGM_FP64_NOT_PERMISSIONED = (DCGM_FP64_BLANK + 3.0) +DCGM_STR_NOT_PERMISSIONED = "<<>>" + + +############################################################################### +# Functions to check if a value is blank or not +def DCGM_INT32_IS_BLANK(val): + if val >= DCGM_INT32_BLANK: + return True + else: + return False + + +def DCGM_INT64_IS_BLANK(val): + if val >= DCGM_INT64_BLANK: + return True + else: + return False + + +def DCGM_FP64_IS_BLANK(val): + if val >= DCGM_FP64_BLANK: + return True + else: + return False + + +#Looks for <<< at first position and >>> inside string +def DCGM_STR_IS_BLANK(val): + if 0 != val.find("<<<"): + return False + elif 0 > val.find(">>>"): + return False + return True + + +############################################################################### +class DcgmValue: + + def __init__(self, value): + self.value = value #Contains either an integer (int64), string, or double of the actual value + + ########################################################################### + def SetFromInt32(self, i32Value): + ''' + Handle the special case where our source data was an int32 but is currently + stored in a 
python int (int64), dealing with blanks + ''' + value = int(i32Value) + + if not DCGM_INT32_IS_BLANK(i32Value): + self.value = value + return + + if value == DCGM_INT32_NOT_FOUND: + self.value = DCGM_INT64_NOT_FOUND + elif value == DCGM_INT32_NOT_SUPPORTED: + self.value = DCGM_INT64_NOT_SUPPORTED + elif value == DCGM_INT32_NOT_PERMISSIONED: + self.value = DCGM_INT64_NOT_PERMISSIONED + else: + self.value = DCGM_INT64_BLANK + + ########################################################################### + def IsBlank(self): + ''' + Returns True if the currently-stored value is a blank value. False if not + ''' + if self.value is None: + return True + elif type(self.value) == int or type(self.value) == int: + return DCGM_INT64_IS_BLANK(self.value) + elif type(self.value) == float: + return DCGM_FP64_IS_BLANK(self.value) + elif type(self.value) == str: + return DCGM_STR_IS_BLANK(self.value) + else: + raise Exception("Unknown type: %s") % str(type(self.value)) + + ########################################################################### + def __str__(self): + return str(self.value) + + ########################################################################### + + +############################################################################### +def self_test(): + + v = DcgmValue(1.0) + assert (not v.IsBlank()) + assert (v.value == 1.0) + + v = DcgmValue(100) + assert (not v.IsBlank()) + assert (v.value == 100) + + v = DcgmValue(DCGM_INT64_NOT_FOUND) + assert (v.IsBlank()) + + v = DcgmValue(DCGM_FP64_NOT_FOUND) + assert (v.IsBlank()) + + v.SetFromInt32(DCGM_INT32_NOT_SUPPORTED) + assert (v.IsBlank()) + assert (v.value == DCGM_INT64_NOT_SUPPORTED) + + print("Tests passed") + return + + +############################################################################### +if __name__ == "__main__": + self_test() + +############################################################################### diff --git a/model_analyzer/monitor/dcgm/denylist_recommendations.py 
b/model_analyzer/monitor/dcgm/denylist_recommendations.py new file mode 100644 index 000000000..38dafc624 --- /dev/null +++ b/model_analyzer/monitor/dcgm/denylist_recommendations.py @@ -0,0 +1,573 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import sys +import logging +import json +import os + +try: + import model_analyzer.monitor.dcgm.pydcgm as pydcgm + import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors + import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields + import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem +except: + # If we don't find the bindings, add the default path and try again + if 'PYTHONPATH' in os.environ: + os.environ['PYTHONPATH'] = os.environ[ + 'PYTHONPATH'] + ":/usr/local/dcgm/bindings" + else: + os.environ['PYTHONPATH'] = '/usr/local/dcgm/bindings' + + import model_analyzer.monitor.dcgm.pydcgm as pydcgm + import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields + import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem + +BR_ST_HEALTHY = 0x0000 +BR_ST_NOT_DETECTED = 0x0001 +BR_ST_FAILED_PASSIVE_HEALTH = 0x0002 +BR_ST_FAILED_ACTIVE_HEALTH = 0x0004 + 
def addParamString(runDiagInfo, paramIndex, paramStr):
    """
    Copy paramStr into slot paramIndex of runDiagInfo.testParms.

    runDiagInfo -- object exposing an indexable testParms buffer; each slot is
                   itself an indexable, fixed-length character buffer.
                   NOTE(review): assumed to be a ctypes array of c_char arrays
                   (dcgm_structs is not visible here) -- confirm.
    paramIndex  -- index of the slot to fill
    paramStr    -- parameter text, e.g. "diagnostic.test_duration=30"

    Raises ValueError if the encoded parameter does not fit in the slot with
    room left for a terminating NUL byte.
    """
    slot = runDiagInfo.testParms[paramIndex]
    encoded = paramStr.encode('utf-8')

    # Same ">= capacity" rule used for test names in this file: the last byte
    # of the slot is left untouched so the buffer stays NUL-terminated.
    if len(encoded) >= len(slot):
        raise ValueError(
            "Parameter '%s' exceeds max length %d" % (paramStr, len(slot)))

    # Store integer byte values: a c_char element accepts an int (or a
    # one-byte bytes object) but rejects a one-character str on Python 3.
    for strIndex, byte in enumerate(encoded):
        slot[strIndex] = byte
def initialize_run_diag_info(settings):
    """
    Build the dcgmRunDiag (v7) request for the currently healthy GPUs.

    settings -- dict; reads 'testNames' (either a level '1'-'3' or a
                comma-separated list of test names) and, when present,
                'timePercentage', which is forwarded to setTestDurations.

    Returns a (runDiagInfo, activeGpuIds) tuple, where activeGpuIds is the
    list of entity ids of every entry in g_gpus that is still healthy.

    Raises ValueError for a numeric level outside 1-3, for too many test
    names, or for a test name that is too long.
    """
    runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
    runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
    runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    testNamesStr = settings['testNames']
    if testNamesStr in ('1', '2', '3'):
        runDiagInfo.validate = int(testNamesStr)
    else:
        # Make sure no number other that 1-3 were submitted
        if testNamesStr.isdigit():
            raise ValueError("'%s' is not a valid test name" % testNamesStr)

        # Copy to the testNames portion of the object
        names = testNamesStr.split(',')
        if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
            err = 'Aborting DCGM Diag because %d test names were specified exceeding the limit of %d' %\
                  (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
            raise ValueError(err)

        for testIndex, testName in enumerate(names):
            encoded = testName.encode('utf-8')
            if len(encoded) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
                err = 'Aborting DCGM Diag because test name %s exceeds max length %d' % \
                      (testName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
                raise ValueError(err)

            # NOTE(review): testNames is assumed to be a ctypes array of
            # c_char arrays -- confirm against dcgm_structs. Such elements
            # accept integer byte values but reject one-character str
            # objects on Python 3, so the encoded bytes are stored.
            for testNameIndex, byte in enumerate(encoded):
                runDiagInfo.testNames[testIndex][testNameIndex] = byte

    if 'timePercentage' in settings:
        setTestDurations(runDiagInfo, settings['timePercentage'])

    activeGpuIds = [
        gpuObj.GetEntityId() for gpuObj in g_gpus if gpuObj.IsHealthy()
    ]

    # Assign the comma-separated list once instead of reading the struct
    # field back and appending to it for every GPU. The field keeps its
    # default when no GPU is healthy, exactly as before.
    if activeGpuIds:
        runDiagInfo.gpuList = ','.join(str(gpuId) for gpuId in activeGpuIds)

    return runDiagInfo, activeGpuIds
def query_passive_health(handleObj, desired_watches):
    """
    Run a health check on the default group of handleObj's system.

    The group's current health watches are read first; when they differ from
    desired_watches they are replaced by desired_watches before the check.

    Returns whatever the group's health.Check() call returns.
    """
    group = handleObj.GetSystem().GetDefaultGroup()

    # Only touch the watch configuration when it is not already what we want
    if group.health.Get() != desired_watches:
        group.health.Set(desired_watches)

    return group.health.Check()
g_switches.append(switchObj) + i = i + 1 + + +# Process command line arguments +def __process_command_line__(settings): + parser = argparse.ArgumentParser() + parser.add_argument('-g', + '--num-gpus', + dest='num_gpus', + type=int, + help='The expected number of GPUs.') + parser.add_argument('-s', + '--num-switches', + dest='num_switches', + type=int, + help='The expected number of NvSwitches.') + parser.add_argument( + '-n', + '--hostname', + dest='hostname', + type=str, + help='The hostname of the nv-hostengine we want to query.') + parser.add_argument( + '-d', + '--detect', + dest='detect', + action='store_true', + help='Run on whatever GPUs can be detected. Do not check counts.') + parser.add_argument( + '-l', + '--log-file', + dest='logfileName', + type=str, + help= + 'The name of the log file where details should be stored. Default is stdout' + ) + parser.add_argument( + '-u', + '--unsupported-too', + dest='unsupported', + action='store_true', + help='Get unsupported devices in addition to the ones DCGM supports') + parser.add_argument('-f', + '--full-report', + dest='fullReport', + action='store_true', + help='Print a health status for each GPU') + parser.add_argument( + '-c', + '--csv', + dest='outfmtCSV', + action='store_true', + help='Write output in csv format. By default, output is in json format.' + ) + parser.add_argument( + '-w', + '--watches', + dest='watches', + type=str, + help= + 'Specify which health watches to monitor. By default, all are watched. 
Any list of the following may be specified:\n\ta = All watches\n\tp = PCIE\n\tm = Memory\n\ti = Inforom\n\tt = Thermal and Power\n\tn = NVLINK' + ) + + group = parser.add_mutually_exclusive_group() + group.add_argument( + '-r', + '--specified-test', + dest='testNames', + type=str, + help='Option to specify what tests are run in dcgmi diag.') + group.add_argument( + '-i', + '--instantaneous', + dest='instant', + action='store_true', + help='Specify to skip the longer tests and run instantaneously') + group.add_argument( + '-t', + '--time-limit', + dest='timeLimit', + type=int, + help= + 'The time limit in seconds that all the tests should not exceed. Diagnostics will be reduced in their time to meet this boundary.' + ) + + parser.set_defaults(instant=False, detect=False, fullReport=False) + args = parser.parse_args() + + if args.num_gpus is not None and args.num_switches is not None: + settings['numGpus'] = args.num_gpus + settings['numSwitches'] = args.num_switches + elif args.detect == False: + raise ValueError( + 'Must specify either a number of gpus and switches with -g and -s or auto-detect with -d' + ) + + if args.hostname: + settings['hostname'] = args.hostname + else: + settings['hostname'] = 'localhost' + + if args.unsupported: + settings['entity_get_flags'] = 0 + else: + settings[ + 'entity_get_flags'] = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED + + settings['instant'] = args.instant + settings['fullReport'] = args.fullReport + + if args.testNames: + settings['testNames'] = args.testNames + else: + settings['testNames'] = '3' + + if args.timeLimit: + settings['timePercentage'] = float(args.timeLimit) / 840.0 + + if args.logfileName: + logging.basicConfig(filename=args.logfileName) + + if args.outfmtCSV: + settings['outfmtCSV'] = 1 + + if args.watches: + health_watches = 0 + for c in args.watches: + if c == 'p': + health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_PCIE + elif c == 'm': + health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_MEM + elif c == 'i': + 
def get_entity_id_list(entities):
    """
    Return the ids of the given entities as a comma-separated string.

    entities -- iterable of objects providing GetEntityId()

    Returns "" for an empty iterable, otherwise e.g. "0,1,2". Every id is
    formatted with str(), so the first and the following ids are rendered
    the same way.
    """
    return ",".join(str(entity.GetEntityId()) for entity in entities)
+ dcgm_structs.DCGM_OPERATION_MODE_AUTO) + + check_health(handleObj, settings, error_list) + except dcgm_structs.DCGMError as e: + # Catch any exceptions from DCGM and add them to the error_list so they'll be printed as JSON + error_list.append(str(e)) + except ValueError as e: + error_list.append(str(e)) + + if 'outfmtCSV' in settings: # show all health, then all un-healthy + for gpuObj in g_gpus: + if gpuObj.IsHealthy() == True: + print("healthy,%s,%s" % (gpuObj.GetBDF(), gpuObj.GetUUID())) + for gpuObj in g_gpus: + if gpuObj.IsHealthy() == False: + print("unhealthy,%s,%s,\"%s\"" % + (gpuObj.GetBDF(), gpuObj.GetUUID(), + gpuObj.WhyUnhealthy())) + + else: # build obj that can be output in json + denylistGpus = {} + healthyGpus = {} + for gpuObj in g_gpus: + if gpuObj.IsHealthy() == False: + details = {} + details['UUID'] = gpuObj.GetUUID() + details['BDF'] = gpuObj.GetBDF() + details['Failure Explanation'] = gpuObj.WhyUnhealthy() + denylistGpus[gpuObj.GetEntityId()] = details + elif settings['fullReport']: + details = {} + details['UUID'] = gpuObj.GetUUID() + details['BDF'] = gpuObj.GetBDF() + healthyGpus[gpuObj.GetEntityId()] = details + + jsonTop['denylistedGpus'] = denylistGpus + if settings['fullReport']: + jsonTop['Healthy GPUs'] = healthyGpus + + if len(error_list): # had error processing the command line + exitCode = 1 + if 'outfmtCSV' in settings: # json output + if len(error_list): + for errObj in error_list: + print("errors,\"%s\"" % (errObj)) + else: + jsonTop['errors'] = error_list + + if 'outfmtCSV' in settings: # show all health, then all un-healthy + pass + else: + print(json.dumps(jsonTop, indent=4, separators=(',', ': '))) + + sys.exit(exitCode) + + +if __name__ == '__main__': + main() diff --git a/model_analyzer/monitor/dcgm/pydcgm.py b/model_analyzer/monitor/dcgm/pydcgm.py new file mode 100644 index 000000000..da6157471 --- /dev/null +++ b/model_analyzer/monitor/dcgm/pydcgm.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & 
def _python_version_check(version_info=None):
    """
    Exit with status 1 when running on a Python older than 3.

    version_info -- optional (major, minor, micro, ...) sequence to check
                    instead of the running interpreter's sys.version_info.

    Returns None when the version is acceptable; otherwise prints an error
    message and calls sys.exit(1).
    """
    import sys

    if version_info is None:
        version_info = sys.version_info
        python_version = sys.version.split(None, 1)[0]
    else:
        python_version = '.'.join(str(part) for part in version_info[:3])

    # Compare the numeric major version. Comparing version *strings* is
    # lexicographic, so e.g. '10.0' < '3' would wrongly be treated as too old.
    if version_info[0] < 3:
        print(
            '[ERROR] Detected Python version {}. These bindings are for Python 3.5+. Please load the Python 2 bindings found at /usr/local/dcgm/bindings'
            .format(python_version))
        sys.exit(1)