diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4c2d5f0b5c..9f1e0586c3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -57,11 +57,12 @@ jobs:
       - name: Run core tests
         if: ${{ env.BACKEND == 'XPU'}}
         run: |
-          cd python/test/unit/language
-          python3 -m pytest --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
-
+          cd python/test/unit
+          python3 -m pytest -n 8 --verbose --device xpu language/ --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          # run runtime tests serially to avoid race condition with cache handling.
+          python3 -m pytest runtime/
           # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
-          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu test_line_info.py
+          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py

       - name: Run assert/print tests
         if: ${{ env.BACKEND == 'XPU'}}
diff --git a/.github/workflows/build_and_test_2.yaml b/.github/workflows/build_and_test_2.yaml
index b2f04777d6..4ae26086a5 100644
--- a/.github/workflows/build_and_test_2.yaml
+++ b/.github/workflows/build_and_test_2.yaml
@@ -150,10 +150,12 @@ jobs:
         run: |
           pip install pytest pytest-xdist
           pip install torch==1.13.0a0+git6c9b55e intel_extension_for_pytorch==1.13.120+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
-          cd python/test/unit/language
-          python3 -m pytest -n auto --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
+          cd python/test/unit
+          python3 -m pytest -n 8 --verbose --device xpu language/ --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          # run runtime tests serially to avoid race condition with cache handling.
+          python3 -m pytest runtime/
           # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
-          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -n auto test_line_info.py
+          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py

       - name: Run assert/print tests
         run: |
diff --git a/python/test/unit/runtime/test_autotuner.py b/python/test/unit/runtime/test_autotuner.py
index ef5d16005d..1c789a9758 100644
--- a/python/test/unit/runtime/test_autotuner.py
+++ b/python/test/unit/runtime/test_autotuner.py
@@ -4,11 +4,14 @@ import triton.language as tl

 import pytest

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+

 def test_kwargs():
     N = 1024
-    src = torch.empty(N, device='cuda')
-    dst = torch.empty(N, device='cuda')
+    src = torch.empty(N, device='xpu')
+    dst = torch.empty(N, device='xpu')

     configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
@@ -26,7 +29,7 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):

 def test_restore():
     N = 1024
-    src = torch.zeros(N, device='cuda')
+    src = torch.zeros(N, device='xpu')

     configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
@@ -45,8 +48,8 @@ def _kernel(src, N, BLOCK_SIZE: tl.constexpr):
 @pytest.mark.parametrize('with_perf_model', [False, True])
 def test_prune_configs(with_perf_model: bool):
     N = 1024
-    src = torch.empty(N, device='cuda')
-    dst = torch.empty(N, device='cuda')
+    src = torch.empty(N, device='xpu')
+    dst = torch.empty(N, device='xpu')
     records = {}

     def early_config_prune(configs, named_args):
diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py
index cd589fa920..eff49a5a0d 100644
--- a/python/test/unit/runtime/test_cache.py
+++ b/python/test/unit/runtime/test_cache.py
@@ -10,6 +10,9 @@ import triton.language as tl
 from triton.runtime.jit import JITFunction

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+
 tmpdir = ".tmp"

@@ -111,7 +114,7 @@ def inc_counter(*args, **kwargs):
     JITFunction.cache_hook = inc_counter
     reset_tmp_dir()
-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     for i in range(10):
         kernel[(1, )](x, 1, BLOCK=1024)
     assert counter == 1
@@ -127,7 +130,7 @@ def inc_counter(*args, **kwargs):
     JITFunction.cache_hook = inc_counter
     reset_tmp_dir()
-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     function = {'enable': kernel, 'disable': kernel_nospec}[mode]
     target = {'enable': 4, 'disable': 1}[mode]
     for i in [1, 2, 4, 8, 16, 32]:
@@ -141,9 +144,9 @@ def test_annotation():
     def kernel(X, i: tl.int32):
         tl.store(X, i)

-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     kernel[(1, )](x, 1)
     kernel[(1, )](x, 8)
     kernel[(1, )](x, 16)
@@ -157,7 +160,7 @@ def test_constexpr_not_callable() -> None:
     def kernel(X, c: tl.constexpr):
         tl.store(X, 2)

-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     error = False
     try:
         kernel[(1, )](x, c="str")
@@ -180,12 +183,12 @@ def kernel_add(a, b, o, N: tl.constexpr):
         tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))

     args = [
-        torch.randn(32, dtype=torch.float32, device="cuda"),
-        torch.randn(32, dtype=torch.float32, device="cuda"),
-        torch.randn(32, dtype=torch.float32, device="cuda"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
         32,
     ]
-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add.cache[device]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add.cache[device]) == 1
@@ -203,7 +206,7 @@ def kernel_add(a, b, o, N: tl.constexpr):
         tl.device_assert(idx < 32, "idx < 32")
         tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add.cache[device]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add.cache[device]) == 1
@@ -229,7 +232,7 @@ def test_jit_noinline() -> None:
     def kernel_add_device(a, b, o, N: tl.constexpr):
         add_fn(a, b, o, N)

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add_device.cache[device]) == 0
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add_device.cache[device]) == 1
@@ -257,3 +260,5 @@ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
         x0 = xindex
         tmp0 = tl.load(in_ptr0 + (x0), xmask)
         tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
+
+    reset_tmp_dir()
diff --git a/python/test/unit/runtime/test_launch.py b/python/test/unit/runtime/test_launch.py
index 00009f230f..ecd1aa5059 100644
--- a/python/test/unit/runtime/test_launch.py
+++ b/python/test/unit/runtime/test_launch.py
@@ -29,8 +29,8 @@ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):

     tracemalloc.start()
     try:
-        inp = torch.randn(10, device='cuda')
-        out = torch.randn(10, device='cuda')
+        inp = torch.randn(10, device='xpu')
+        out = torch.randn(10, device='xpu')
         kernel[(10, )](inp, out, 10, XBLOCK=16)
         gc.collect()
         begin, _ = tracemalloc.get_traced_memory()
diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py
index 63401f28e4..e5ef955930 100644
--- a/python/test/unit/runtime/test_subproc.py
+++ b/python/test/unit/runtime/test_subproc.py
@@ -2,12 +2,16 @@
 import os
 import shutil

+import pytest
 import torch

 import triton
 import triton.language as tl
 from triton.compiler import ASTSource

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+
 tmpdir = ".tmp"

@@ -30,10 +34,12 @@ def kernel_sub(a, b, o, N: tl.constexpr):
         signature={0: "*fp32", 1: "*fp32", 2: "*fp32"},
         attrs=attrs,
     )
-    triton.compile(src=src, target=("cuda", capability))
+    triton.compile(src=src, target=("xpu", capability))


 def test_compile_in_subproc() -> None:
+    pytest.skip("FIXME: Port get_device_capability to XPU")
+
     major, minor = torch.cuda.get_device_capability(0)
     cc = major * 10 + minor
     config = triton.compiler.AttrsDescriptor(tuple(range(4)), (), (), ())
@@ -55,10 +61,12 @@ def kernel_dot(Z):
         tl.store(Z + offs, z)

     src = ASTSource(fn=kernel_dot, signature={0: "*fp32"}, attrs=attrs, constants=dict())
-    triton.compile(src=src, target=("cuda", capability))
+    triton.compile(src=src, target=("xpu", capability))


 def test_compile_in_forked_subproc() -> None:
+    pytest.skip("FIXME: Port get_device_capability to XPU")
+
     reset_tmp_dir()
     major, minor = torch.cuda.get_device_capability(0)
     capability = major * 10 + minor
diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh
index 054ee2bfff..339ded848f 100755
--- a/scripts/test-triton.sh
+++ b/scripts/test-triton.sh
@@ -96,11 +96,12 @@ function run_core_tests {
     echo "***************************************************"
     echo "****** Running Triton Core tests ******"
     echo "***************************************************"
-    CORE_TEST_DIR=$TRITON_PROJ/python/test/unit/language
+    CORE_TEST_DIR=$TRITON_PROJ/python/test/unit
     if [ ! -d "${CORE_TEST_DIR}" ]; then
         echo "Not found '${CORE_TEST_DIR}'. Build Triton please" ; exit 3
     fi
-    cd $CORE_TEST_DIR
+
+    cd $CORE_TEST_DIR/language
     TRITON_DISABLE_LINE_INFO=1 python3 -m pytest --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
     if [ $? -ne 0 ]; then
         echo "FAILED: return code $?" ; exit $?
@@ -108,6 +109,9 @@ function run_core_tests {

     # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
     TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu test_line_info.py
+    if [ $? -ne 0 ]; then
+        echo "FAILED: return code $?" ; exit $?
+    fi

     python3 assert_helper.py device_assert
     if [ $? -ne 0 ]; then
@@ -117,6 +121,12 @@ function run_core_tests {
     if [ $? -ne 0 ]; then
         echo "FAILED: return code $?" ; exit $?
     fi
+
+    cd $CORE_TEST_DIR/runtime
+    TRITON_DISABLE_LINE_INFO=1 python3 -m pytest --verbose
+    if [ $? -ne 0 ]; then
+        echo "FAILED: return code $?" ; exit $?
+    fi
 }

 function run_tutorial_test {
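Note (not part of the patch): the commands below are a minimal sketch of how the CI invocations added above could be reproduced locally, assuming a checkout that already contains the python/test/unit layout from this patch, an XPU-capable environment, and pytest-xdist installed.

# Sketch: mirror the CI steps added in the workflow files above.
cd python/test/unit
# language tests run in parallel across 8 workers; test_line_info.py and
# test_subprocess.py are excluded here and handled separately.
python3 -m pytest -n 8 --verbose --device xpu language/ \
    --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
# runtime tests run serially to avoid the cache-handling race mentioned above.
python3 -m pytest runtime/
# test_line_info.py needs line info enabled, hence TRITON_DISABLE_LINE_INFO=0.
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py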