-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Change to new graphic test strategy (BugFix) (#586)
* Change the GPU test strategy to prime/reverse-prime GPU offload without depending on index. For Nvidia GPUs, prime/reverse-prime offload is not supported before driver version 435.17. Therefore, this new strategy is only for 22.04+. For backward compatibility, this PR adds new test plans for 22.04+ as follows: graphics-gpu-cert-full graphics-gpu-cert-automated graphics-gpu-cert-manual after-suspend-graphics-gpu-cert-full after-suspend-graphics-gpu-cert-automated after-suspend-graphics-gpu-cert-manual monitor-gpu-cert-full monitor-gpu-cert-automated monitor-gpu-cert-manual after-suspend-monitor-gpu-cert-full after-suspend-monitor-gpu-cert-automated after-suspend-monitor-gpu-cert-manual It also adds a new Python script "prime_offload_tester.py" to execute a command with the prime/reverse-prime setting for the new test jobs as follows: Auto test: graphics/{index}_auto_glxgears_{product_slug} graphics/{index}_auto_glxgears_fullscreen_{product_slug} Manual: graphics/{index}_valid_glxgears_{product_slug} graphics/{index}_valid_glxgears_fullscreen_{product_slug} * Add more unit tests for graphics_card_resource.py and prime_offload_tester.py * Add one more unit test * Move argument parsing to a single function for unit testing * Fix flake8 error * 1. Refactor to be more Pythonic 2. Add an extra method to avoid check failures caused by a 6.5 kernel bug * Fix flake8 error * Add executable permission * 1. Move changes of job and test-plan to another PR 2. The fix for the 6.5 kernel bug is released in proposed kernel 6.5.0.16 and has been tested. Therefore, remove the workaround. * 1. Move changes of jobs and test plan to another PR 2. 
Add more unit tests * Fix PCI BDF format check error ref: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/arch/x86/pci/early.c?id=refs/tags/v3.12.7#n65 https://wiki.xenproject.org/wiki/Bus:Device.Function_(BDF)_Notation * Update providers/base/bin/prime_offload_tester.py Co-authored-by: kissiel <[email protected]> * Update providers/base/bin/prime_offload_tester.py Co-authored-by: kissiel <[email protected]> * Update providers/base/bin/prime_offload_tester.py Co-authored-by: kissiel <[email protected]> * 1. Move the client lookup from check_offload to get_clients 2. Fix docstring error 3. Change the default to 20s and the logic in check_offload 4. Change RuntimeError to SystemExit --------- Co-authored-by: kissiel <[email protected]>
- Loading branch information
Showing
4 changed files
with
859 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,302 @@ | ||
#!/usr/bin/env python3 | ||
# This file is part of Checkbox. | ||
# | ||
# Copyright 2023 Canonical Ltd. | ||
# Written by: | ||
# Hanhsuan Lee <[email protected]> | ||
# | ||
# Checkbox is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 3, | ||
# as published by the Free Software Foundation. | ||
# | ||
# Checkbox is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with Checkbox. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
import sys | ||
import threading | ||
import subprocess | ||
import time | ||
import re | ||
import json | ||
import argparse | ||
import logging | ||
import os | ||
|
||
|
||
class PrimeOffloader:
    """
    Run a command on a specific GPU through prime offload and verify,
    via the DRM debugfs tree, that the process landed on that GPU.
    Have to run this as root.

    :attr logger: console logger (the root logger)
    :type logger: RootLogger
    :attr check_result:
        outcome of the offload check. NOTE: True means the check
        FAILED (the process was not found on the target GPU);
        False means no failure has been recorded.
    :type check_result: bool
    """

    logger = logging.getLogger()
    check_result = False

    # <domain>:<bus>:<device>.<function>; every field is hexadecimal and
    # the function is a single digit in the range 0-7.
    # Always applied with fullmatch() so trailing garbage is rejected.
    _PCI_NAME_RE = re.compile(
        r"[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]")

    def find_card_id(self, pci_name: str) -> str:
        """
        use pci name to find card id under /sys/kernel/debug/dri

        :param pci_name: pci device name in NNNN:NN:NN.N format
        :returns: card id
        :raises SystemExit:
            if pci_name is malformed, grep fails, or the path printed by
            grep is shorter than expected
        """
        if not self._PCI_NAME_RE.fullmatch(pci_name.lower()):
            raise SystemExit("pci name format error")

        cmd = ["grep",
               "-lr",
               "--include=name",
               pci_name,
               "/sys/kernel/debug/dri"]
        try:
            card_path = subprocess.check_output(cmd,
                                                universal_newlines=True)
            # grep prints /sys/kernel/debug/dri/<card id>/name, so the
            # card id is the sixth '/'-separated field.
            return card_path.split('/')[5]
        except IndexError as e:
            raise SystemExit("return value format error {}".format(repr(e)))
        except subprocess.CalledProcessError as e:
            raise SystemExit("run command failed {}".format(repr(e)))

    def find_card_name(self, pci_name: str) -> str:
        """
        use pci name to find card name by lshw

        :param pci_name: pci device name in NNNN:NN:NN.N format
        :returns: card name (the "product" field reported by lshw)
        :raises SystemExit:
            if lshw fails, its output cannot be interpreted, or no
            display device matches pci_name
        """
        cmd = ["lshw", "-c", "display", "-json"]
        try:
            card_infos = subprocess.check_output(cmd,
                                                 universal_newlines=True)
            infos = json.loads(card_infos)
            # Tolerate a single JSON object as well as a list of objects.
            if isinstance(infos, dict):
                infos = [infos]
            for info in infos:
                # An entry without "businfo" must not abort the search;
                # a later entry may still be the one we are looking for.
                if pci_name in info.get('businfo', ''):
                    return info['product']
            raise SystemExit("Card name not found")
        except (KeyError, TypeError, AttributeError,
                json.decoder.JSONDecodeError) as e:
            raise SystemExit("return value format error {}".format(e))
        except subprocess.CalledProcessError as e:
            raise SystemExit("run command failed {}".format(repr(e)))

    def get_clients(self, card_id: str) -> "str | None":
        """
        Use to get clients that running on specific GPU
        by reading debugfs.

        .. note::
            While setting prime offload environment such as DRI_PRIME,
            the process will be listed under kernel debug interface.
            The location of kernel debug interface is
            /sys/kernel/debug/dri/<card id>,
            and the process could be found in
            /sys/kernel/debug/dri/<card id>/clients

        :param card_id: card id of dri device
        :returns:
            content of the clients file, or None when it can't be read
        """
        read_clients_cmd = ["cat",
                            "/sys/kernel/debug/dri/{}/clients"
                            .format(card_id)]
        try:
            return subprocess.check_output(read_clients_cmd,
                                           universal_newlines=True)
        except subprocess.CalledProcessError:
            self.logger.info("Couldn't get clients on specific GPU{}"
                             .format(card_id))
            return None

    def check_offload(self, cmd: list, card_id: str,
                      card_name: str, timeout: int):
        """
        Use to check provided command is executed on specific GPU.
        Polls the clients list ten times over the timeout period and
        records the outcome in self.check_result (True == failed).

        :param cmd: command (argv list) that running under prime offload
        :param card_id: card id of dri device
        :param card_name: card name of dri device
        :param timeout: how long to keep looking for the process, in seconds
        """
        delay = timeout / 10
        deadline = time.time() + timeout

        while time.time() < deadline:
            time.sleep(delay)
            clients = self.get_clients(card_id)
            if clients and cmd[0] in clients:
                self.logger.info("Checking success:")
                self.logger.info("  Offload process:[{}]".format(cmd))
                self.logger.info("  Card ID:[{}]".format(card_id))
                self.logger.info("  Device Name:[{}]".format(card_name))
                # clear any failure left over from a previous run
                self.check_result = False
                return
        self.logger.info("Checking fail:")
        self.logger.info("  Couldn't find process [{}]".format(cmd))
        self.check_result = True

    def check_nv_offload_env(self):
        """
        prime offload of nvidia driver is limited.
        Only on-demand mode is supported.

        :raises SystemExit:
            if the system isn't in on-demand mode, nvidia-smi reports
            anything for nvlink, or one of the commands fails
        """
        # nvidia-smi ship with NVIDIA GPU display drivers on Linux
        # https://developer.nvidia.com/nvidia-system-management-interface
        # check prime-select to make sure the nv driver is included.
        # If there is no nv driver, prime offload is fine for other drivers.
        try:
            if "on-demand" not in subprocess.check_output(
                    ["prime-select", "query"], universal_newlines=True):
                raise SystemExit("System isn't on-demand mode")

            # prime offload couldn't running on nvlink active or inactive
            # Therefore, only return empty string is supported environment.
            nvlink = subprocess.check_output(["nvidia-smi", "nvlink", "-s"],
                                             universal_newlines=True)
            if nvlink:
                if 'error' in nvlink.lower():
                    raise SystemExit("nvidia driver error")
                raise SystemExit("NVLINK detected")
        except FileNotFoundError:
            self.logger.info(
                "No prime-select, it should be ok to run prime offload")
        except subprocess.CalledProcessError as e:
            # same handling as every other external command in this class
            raise SystemExit("run command failed {}".format(repr(e)))

    def run_offload_cmd(self, cmd: str, pci_name: str,
                        driver: str, timeout: int):
        """
        run offload command and check it runs on correct GPU

        :param cmd: command that running under prime offload
        :param pci_name: pci device name in NNNN:NN:NN.N format
        :param driver: GPU driver, such as i915, amdgpu, nvidia
        :param timeout:
            timeout for offloaded command, in seconds. A value <= 0 runs
            the command without a time limit and checks for 20 seconds.
        :raises SystemExit:
            if the inputs are invalid, the command can't be started, or
            the process is not found on the requested GPU
        """
        card_id = self.find_card_id(pci_name)
        card_name = self.find_card_name(pci_name)

        # pci name with ':' and '.' replaced by '_', used for DRI_PRIME
        dri_pci_name_format = re.sub("[:.]", "_", pci_name)

        if "timeout" in cmd:
            raise SystemExit("Put timeout in command isn't allowed")

        cmd = cmd.split()
        if not cmd:
            # nothing to launch and nothing to look for in the clients list
            raise SystemExit("No command to offload")

        if timeout > 0:
            offload_cmd = ["timeout", str(timeout)] + cmd
        else:
            # if timeout <=0 will make check_offload failed.
            # Set the timeout to the default value
            log_str = ("Timeout {}s is invalid,"
                       " remove the timeout setting"
                       " and change check_offload to run 20s".format(timeout))
            self.logger.info(log_str)
            timeout = 20
            offload_cmd = cmd

        env = os.environ.copy()
        if driver in ('nvidia', 'pcieport'):
            offload_env = {"__NV_PRIME_RENDER_OFFLOAD": "1",
                           "__GLX_VENDOR_LIBRARY_NAME": "nvidia"}
        else:
            offload_env = {"DRI_PRIME": "pci-{}".format(dri_pci_name_format)}

        env.update(offload_env)
        self.logger.info("prime offload env: {}".format(offload_env))

        # if nv driver under nvidia mode, prime/reverse prime couldn't work.
        self.check_nv_offload_env()

        # use other thread to check offload is correctly or not
        check_thread = threading.Thread(target=self.check_offload,
                                        args=(cmd, card_id,
                                              card_name,
                                              timeout))
        try:
            with subprocess.Popen(offload_cmd, env=env,
                                  stdout=subprocess.PIPE,
                                  universal_newlines=True) as offload:
                # Only start polling once the command really launched, so
                # a failed launch doesn't leave a checker thread behind.
                check_thread.start()
                self.logger.info("offload command:[{}]".format(offload_cmd))

                # redirect offload command output real time; iterating
                # until EOF also drains what is still buffered when the
                # process exits.
                for line in offload.stdout:
                    self.logger.info(line.strip())
        except (OSError, subprocess.CalledProcessError) as e:
            # Popen reports a launch failure (e.g. missing executable)
            # as OSError
            raise SystemExit("run offload command failed {}".format(repr(e)))

        check_thread.join()
        if self.check_result:
            raise SystemExit("offload to specific GPU failed")

    def parse_args(self, args=None):
        """
        command line arguments parsing

        :param args:
            list of argument strings; None (the default) makes argparse
            read sys.argv[1:] at call time
        :type args: list or None
        :returns: namespace with command, pci, driver and timeout
        """
        parser = argparse.ArgumentParser(
            prog="Prime offload tester",
            description="Test prime offload feature",
        )

        parser.add_argument(
            "-c", "--command", type=str, default='glxgears',
            help='command to offload to specific GPU (default: %(default)s)'
        )
        parser.add_argument(
            "-p", "--pci", type=str, default='0000:00:02.0',
            help='pci name in NNNN:NN:NN.N format (default: %(default)s)'
        )
        parser.add_argument(
            "-d", "--driver", type=str, default='i915',
            help='Type of GPU driver (default: %(default)s)'
        )
        parser.add_argument(
            "-t", "--timeout", type=int, default=20,
            help='executing command duration in second (default: %(default)s).'
        )
        return parser.parse_args(args)

    def main(self):
        """
        Script entry point: parse the command line, send log messages to
        the console and run the offload test.
        """
        args = self.parse_args()

        # create self.logger.formatter
        log_formatter = logging.Formatter(fmt='%(message)s')

        # create logger
        self.logger.setLevel(logging.INFO)

        # create console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(log_formatter)

        # Add console handler to logger
        self.logger.addHandler(console_handler)

        # run_offload_cmd("glxgears", "0000:00:02.0", "i915", 0)
        self.run_offload_cmd(args.command,
                             args.pci,
                             args.driver,
                             args.timeout)
|
||
|
||
# Run the tester only when executed as a script, not when imported.
if __name__ == "__main__":
    offloader = PrimeOffloader()
    offloader.main()
Oops, something went wrong.