diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4c2d5f0b5c..9f1e0586c3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -57,11 +57,12 @@ jobs:
       - name: Run core tests
         if: ${{ env.BACKEND == 'XPU'}}
         run: |
-          cd python/test/unit/language
-          python3 -m pytest --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
-
+          cd python/test/unit
+          python3 -m pytest -n 8 --verbose --device xpu language/ --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          # run runtime tests serially to avoid race condition with cache handling.
+          python3 -m pytest runtime/
           # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
-          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu test_line_info.py
+          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py

       - name: Run assert/print tests
         if: ${{ env.BACKEND == 'XPU'}}
diff --git a/.github/workflows/build_and_test_2.yaml b/.github/workflows/build_and_test_2.yaml
index b2f04777d6..4ae26086a5 100644
--- a/.github/workflows/build_and_test_2.yaml
+++ b/.github/workflows/build_and_test_2.yaml
@@ -150,10 +150,12 @@ jobs:
         run: |
           pip install pytest pytest-xdist
           pip install torch==1.13.0a0+git6c9b55e intel_extension_for_pytorch==1.13.120+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
-          cd python/test/unit/language
-          python3 -m pytest -n auto --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
+          cd python/test/unit
+          python3 -m pytest -n 8 --verbose --device xpu language/ --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          # run runtime tests serially to avoid race condition with cache handling.
+          python3 -m pytest runtime/
           # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
-          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -n auto test_line_info.py
+          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py

       - name: Run assert/print tests
         run: |
diff --git a/python/test/unit/runtime/test_autotuner.py b/python/test/unit/runtime/test_autotuner.py
index ef5d16005d..1c789a9758 100644
--- a/python/test/unit/runtime/test_autotuner.py
+++ b/python/test/unit/runtime/test_autotuner.py
@@ -4,11 +4,14 @@ import triton.language as tl

 import pytest

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+

 def test_kwargs():
     N = 1024
-    src = torch.empty(N, device='cuda')
-    dst = torch.empty(N, device='cuda')
+    src = torch.empty(N, device='xpu')
+    dst = torch.empty(N, device='xpu')

     configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
@@ -26,7 +29,7 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):

 def test_restore():
     N = 1024
-    src = torch.zeros(N, device='cuda')
+    src = torch.zeros(N, device='xpu')

     configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
@@ -45,8 +48,8 @@ def _kernel(src, N, BLOCK_SIZE: tl.constexpr):
 @pytest.mark.parametrize('with_perf_model', [False, True])
 def test_prune_configs(with_perf_model: bool):
     N = 1024
-    src = torch.empty(N, device='cuda')
-    dst = torch.empty(N, device='cuda')
+    src = torch.empty(N, device='xpu')
+    dst = torch.empty(N, device='xpu')
     records = {}

     def early_config_prune(configs, named_args):
diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py
index cd589fa920..eff49a5a0d 100644
--- a/python/test/unit/runtime/test_cache.py
+++ b/python/test/unit/runtime/test_cache.py
@@ -10,6 +10,9 @@ import triton.language as tl
 from triton.runtime.jit import JITFunction

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+
 tmpdir = ".tmp"

@@ -111,7 +114,7 @@ def inc_counter(*args, **kwargs):
     JITFunction.cache_hook = inc_counter
     reset_tmp_dir()
-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     for i in range(10):
         kernel[(1, )](x, 1, BLOCK=1024)
     assert counter == 1
@@ -127,7 +130,7 @@ def inc_counter(*args, **kwargs):
     JITFunction.cache_hook = inc_counter
     reset_tmp_dir()
-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     function = {'enable': kernel, 'disable': kernel_nospec}[mode]
     target = {'enable': 4, 'disable': 1}[mode]
     for i in [1, 2, 4, 8, 16, 32]:
@@ -141,9 +144,9 @@ def test_annotation():
     def kernel(X, i: tl.int32):
         tl.store(X, i)

-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     kernel[(1, )](x, 1)
     kernel[(1, )](x, 8)
     kernel[(1, )](x, 16)
@@ -157,7 +160,7 @@ def test_constexpr_not_callable() -> None:
     def kernel(X, c: tl.constexpr):
         tl.store(X, 2)

-    x = torch.empty(1, dtype=torch.int32, device='cuda')
+    x = torch.empty(1, dtype=torch.int32, device='xpu')
     error = False
     try:
         kernel[(1, )](x, c="str")
@@ -180,12 +183,12 @@ def kernel_add(a, b, o, N: tl.constexpr):
         tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))

     args = [
-        torch.randn(32, dtype=torch.float32, device="cuda"),
-        torch.randn(32, dtype=torch.float32, device="cuda"),
-        torch.randn(32, dtype=torch.float32, device="cuda"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
+        torch.randn(32, dtype=torch.float32, device="xpu"),
         32,
     ]
-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add.cache[device]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add.cache[device]) == 1
@@ -203,7 +206,7 @@ def kernel_add(a, b, o, N: tl.constexpr):
         tl.device_assert(idx < 32, "idx < 32")
         tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add.cache[device]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add.cache[device]) == 1
@@ -229,7 +232,7 @@ def test_jit_noinline() -> None:
     def kernel_add_device(a, b, o, N: tl.constexpr):
         add_fn(a, b, o, N)

-    device = torch.cuda.current_device()
+    device = torch.xpu.current_device()
     assert len(kernel_add_device.cache[device]) == 0
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
     assert len(kernel_add_device.cache[device]) == 1
@@ -257,3 +260,5 @@ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
         x0 = xindex
         tmp0 = tl.load(in_ptr0 + (x0), xmask)
         tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)
+
+    reset_tmp_dir()
diff --git a/python/test/unit/runtime/test_launch.py b/python/test/unit/runtime/test_launch.py
index 00009f230f..ecd1aa5059 100644
--- a/python/test/unit/runtime/test_launch.py
+++ b/python/test/unit/runtime/test_launch.py
@@ -29,8 +29,8 @@ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):

     tracemalloc.start()
     try:
-        inp = torch.randn(10, device='cuda')
-        out = torch.randn(10, device='cuda')
+        inp = torch.randn(10, device='xpu')
+        out = torch.randn(10, device='xpu')
         kernel[(10, )](inp, out, 10, XBLOCK=16)
         gc.collect()
         begin, _ = tracemalloc.get_traced_memory()
diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py
index 63401f28e4..e5ef955930 100644
--- a/python/test/unit/runtime/test_subproc.py
+++ b/python/test/unit/runtime/test_subproc.py
@@ -2,12 +2,16 @@
 import os
 import shutil

+import pytest
 import torch

 import triton
 import triton.language as tl
 from triton.compiler import ASTSource

+# FIXME remove this once Triton L0 queue and IPEX SYCL queue can be synchronized through events
+torch.xpu.enable_sync_mode()
+
 tmpdir = ".tmp"

@@ -30,10 +34,12 @@ def kernel_sub(a, b, o, N: tl.constexpr):
         signature={0: "*fp32", 1: "*fp32", 2: "*fp32"},
         attrs=attrs,
     )
-    triton.compile(src=src, target=("cuda", capability))
+    triton.compile(src=src, target=("xpu", capability))


 def test_compile_in_subproc() -> None:
+    pytest.skip("FIXME: Port get_device_capability to XPU")
+
     major, minor = torch.cuda.get_device_capability(0)
     cc = major * 10 + minor
     config = triton.compiler.AttrsDescriptor(tuple(range(4)), (), (), ())
@@ -55,10 +61,12 @@ def kernel_dot(Z):
         tl.store(Z + offs, z)

     src = ASTSource(fn=kernel_dot, signature={0: "*fp32"}, attrs=attrs, constants=dict())
-    triton.compile(src=src, target=("cuda", capability))
+    triton.compile(src=src, target=("xpu", capability))


 def test_compile_in_forked_subproc() -> None:
+    pytest.skip("FIXME: Port get_device_capability to XPU")
+
     reset_tmp_dir()
     major, minor = torch.cuda.get_device_capability(0)
     capability = major * 10 + minor
diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh
index 054ee2bfff..339ded848f 100755
--- a/scripts/test-triton.sh
+++ b/scripts/test-triton.sh
@@ -96,11 +96,12 @@ function run_core_tests {
     echo "***************************************************"
     echo "****** Running Triton Core tests ******"
     echo "***************************************************"
-    CORE_TEST_DIR=$TRITON_PROJ/python/test/unit/language
+    CORE_TEST_DIR=$TRITON_PROJ/python/test/unit
     if [ ! -d "${CORE_TEST_DIR}" ]; then
         echo "Not found '${CORE_TEST_DIR}'. Build Triton please" ; exit 3
     fi
-    cd $CORE_TEST_DIR
+
+    cd $CORE_TEST_DIR/language
     TRITON_DISABLE_LINE_INFO=1 python3 -m pytest --verbose --device xpu --ignore=test_line_info.py --ignore=test_subprocess.py
     if [ $? -ne 0 ]; then
         echo "FAILED: return code $?" ; exit $?
@@ -108,6 +109,9 @@ function run_core_tests {

     # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
     TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu test_line_info.py
+    if [ $? -ne 0 ]; then
+        echo "FAILED: return code $?" ; exit $?
+    fi

     python3 assert_helper.py device_assert
     if [ $? -ne 0 ]; then
@@ -117,6 +121,12 @@ function run_core_tests {
     if [ $? -ne 0 ]; then
         echo "FAILED: return code $?" ; exit $?
     fi
+
+    cd $CORE_TEST_DIR/runtime
+    TRITON_DISABLE_LINE_INFO=1 python3 -m pytest --verbose
+    if [ $? -ne 0 ]; then
+        echo "FAILED: return code $?" ; exit $?
+    fi
 }

 function run_tutorial_test {
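Note (not part of the patch): the commands below are a minimal sketch of how the CI invocations added above could be reproduced locally, assuming a checkout that already contains the python/test/unit layout from this patch, an XPU-capable environment, and pytest-xdist installed.

# Sketch: mirror the CI steps added in the workflow files above.
cd python/test/unit
# language tests run in parallel across 8 workers; test_line_info.py and
# test_subprocess.py are excluded here and handled separately.
python3 -m pytest -n 8 --verbose --device xpu language/ \
    --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
# runtime tests run serially to avoid the cache-handling race mentioned above.
python3 -m pytest runtime/
# test_line_info.py needs line info enabled, hence TRITON_DISABLE_LINE_INFO=0.
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest --verbose --device xpu language/test_line_info.py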