From 47e703580715edfa638299f15e2c638cc49e129b Mon Sep 17 00:00:00 2001 From: "Wang, Quintin" Date: Thu, 18 Jan 2024 01:40:36 +0800 Subject: [PATCH] [UPDATE]: update to oneapi toolkit 2024 and torch version 2.1.0 (#239) Update to oneapi toolkit 2024 and update to torch 2.1.0. They should be updated at the same time because ipex 1.13 package has dynamic link to libraries in oneapi 2023. --------- Co-authored-by: Pavel Chekin Co-authored-by: Ettore Tiotto --- .github/dockerfiles/runner-base/Dockerfile | 11 +--- .github/workflows/build_and_test.yml | 11 +++- .github/workflows/docker-runner-base.yaml | 2 +- .../third_party_backends/test_xpu_backend.py | 6 +- python/test/unit/language/assert_helper.py | 1 + python/test/unit/language/print_helper.py | 1 + python/test/unit/language/test_annotations.py | 1 + .../test/unit/language/test_block_pointer.py | 1 + python/test/unit/language/test_conversions.py | 1 + python/test/unit/language/test_core.py | 1 + python/test/unit/language/test_line_info.py | 23 +++---- python/test/unit/language/test_random.py | 1 + .../test/unit/operators/test_blocksparse.py | 1 + .../test/unit/operators/test_cross_entropy.py | 1 + .../unit/operators/test_flash_attention.py | 1 + python/test/unit/operators/test_inductor.py | 1 + python/test/unit/operators/test_matmul.py | 1 + python/test/unit/runtime/test_autotuner.py | 1 + python/test/unit/runtime/test_cache.py | 1 + python/test/unit/runtime/test_driver.py | 1 + python/test/unit/runtime/test_launch.py | 1 + python/test/unit/runtime/test_subproc.py | 1 + scripts/test-triton.sh | 2 +- third_party/xpu/backend/driver.c | 62 +++++++++++++------ 24 files changed, 88 insertions(+), 46 deletions(-) diff --git a/.github/dockerfiles/runner-base/Dockerfile b/.github/dockerfiles/runner-base/Dockerfile index ec5a02f037..d06a770e9c 100644 --- a/.github/dockerfiles/runner-base/Dockerfile +++ b/.github/dockerfiles/runner-base/Dockerfile @@ -1,4 +1,4 @@ -ARG 
INSTALLER_IMAGE=docker-registry.docker-registry.svc.cluster.local:5000/oneapi-basekit:2023.2.0 +ARG INSTALLER_IMAGE=docker-registry.docker-registry.svc.cluster.local:5000/oneapi-basekit:2024.0.1 FROM $INSTALLER_IMAGE as installer @@ -8,17 +8,12 @@ USER root RUN set -ex; \ export DEBIAN_FRONTEND=noninteractive; \ - echo 'deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy max' > /etc/apt/sources.list.d/intel-graphics.list; \ - curl -s https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg; \ + echo 'deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified' > /etc/apt/sources.list.d/intel-gpu-jammy.list; \ + curl -s https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg; \ apt-get update -y; \ apt-get install -y --no-install-recommends --fix-missing \ intel-opencl-icd \ clinfo \ - ; \ - echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/intel-oneapi.list; \ - curl -s https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor --output /usr/share/keyrings/oneapi-archive-keyring.gpg; \ - apt-get update -y; \ - apt-get install -y --no-install-recommends --fix-missing \ intel-level-zero-gpu \ level-zero \ level-zero-dev \ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a4c8c0855e..30d4ba31db 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -73,7 +73,7 @@ jobs: runs-on: - glados - spr - - pvc + - oneapi-2024.0.1 strategy: matrix: python: @@ -156,7 +156,14 @@ jobs: - name: Run core tests run: | pip install pytest pytest-xdist - pip install torch==1.13.0a0+git6c9b55e 
intel_extension_for_pytorch==1.13.120+xpu -f https://developer.intel.com/ipex-whl-stable-xpu + pip install torch==2.1.0a0+cxx11.abi intel_extension_for_pytorch==2.1.10+xpu -f https://developer.intel.com/ipex-whl-stable-xpu + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.14828.8/intel-igc-core_1.0.14828.8_amd64.deb + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.14828.8/intel-igc-opencl_1.0.14828.8_amd64.deb + sudo dpkg -i ./intel-igc-core_1.0.14828.8_amd64.deb ./intel-igc-opencl_1.0.14828.8_amd64.deb + rm ./intel-igc-core_1.0.14828.8_amd64.deb ./intel-igc-opencl_1.0.14828.8_amd64.deb + echo 'deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified' | sudo tee -a /etc/apt/sources.list.d/intel-gpu-jammy.list + sudo apt update -y + sudo apt-get install -y --no-install-recommends --allow-downgrades --fix-missing libigc1=1.0.14828.26-736~22.04 cd python/test/unit python3 -m pytest -n 8 --verbose --device xpu language/ --ignore=language/test_line_info.py --ignore=language/test_subprocess.py # run runtime tests serially to avoid race condition with cache handling. 
diff --git a/.github/workflows/docker-runner-base.yaml b/.github/workflows/docker-runner-base.yaml index 0e95564cd7..39ca40f3f1 100644 --- a/.github/workflows/docker-runner-base.yaml +++ b/.github/workflows/docker-runner-base.yaml @@ -20,7 +20,7 @@ jobs: run: | docker build .github/dockerfiles/runner-base/ \ --tag $REGISTRY/$TAG \ - --build-arg INSTALLER_IMAGE=$REGISTRY/oneapi-basekit:2023.2.0 + --build-arg INSTALLER_IMAGE=$REGISTRY/oneapi-basekit:2024.0.1 - name: Push image run: | diff --git a/python/test/backend/third_party_backends/test_xpu_backend.py b/python/test/backend/third_party_backends/test_xpu_backend.py index e6850efdd8..ededb0a07e 100644 --- a/python/test/backend/third_party_backends/test_xpu_backend.py +++ b/python/test/backend/third_party_backends/test_xpu_backend.py @@ -1,8 +1,5 @@ import torch -import triton -import triton.language as tl - def test_xpu_backend(cmdopt): if cmdopt == "xpu": @@ -14,6 +11,9 @@ def test_xpu_backend(cmdopt): except Exception: has_ipex = False + import triton + import triton.language as tl + @triton.jit() def kernel(x_ptr, y_ptr, out_ptr): pid = tl.program_id(axis=0) diff --git a/python/test/unit/language/assert_helper.py b/python/test/unit/language/assert_helper.py index aa7465cf3a..c62f7fc952 100644 --- a/python/test/unit/language/assert_helper.py +++ b/python/test/unit/language/assert_helper.py @@ -1,6 +1,7 @@ import sys import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 from torch.testing import assert_close import triton diff --git a/python/test/unit/language/print_helper.py b/python/test/unit/language/print_helper.py index 86bf02e665..c87367eee7 100644 --- a/python/test/unit/language/print_helper.py +++ b/python/test/unit/language/print_helper.py @@ -2,6 +2,7 @@ import uuid import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 from torch.testing import assert_close import triton diff --git a/python/test/unit/language/test_annotations.py 
b/python/test/unit/language/test_annotations.py index 26bb406649..9eed8da096 100644 --- a/python/test/unit/language/test_annotations.py +++ b/python/test/unit/language/test_annotations.py @@ -1,6 +1,7 @@ from __future__ import annotations import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/language/test_block_pointer.py b/python/test/unit/language/test_block_pointer.py index 86c660af4a..70869e9b83 100644 --- a/python/test/unit/language/test_block_pointer.py +++ b/python/test/unit/language/test_block_pointer.py @@ -1,5 +1,6 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/language/test_conversions.py b/python/test/unit/language/test_conversions.py index 061dc69cfe..1ad89aba3a 100644 --- a/python/test/unit/language/test_conversions.py +++ b/python/test/unit/language/test_conversions.py @@ -3,6 +3,7 @@ import numpy as np import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import pytest import triton import triton.language as tl diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 6415749616..86a2671d1f 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -6,6 +6,7 @@ import numpy as np import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 from numpy.random import RandomState import triton diff --git a/python/test/unit/language/test_line_info.py b/python/test/unit/language/test_line_info.py index 1d35d1afdc..cdb6cf57e3 100644 --- a/python/test/unit/language/test_line_info.py +++ b/python/test/unit/language/test_line_info.py @@ -3,6 +3,7 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl @@ -152,27 +153,27 @@ def 
test_line_info(func: str): file_lines = extract_file_lines(kernel_info.asm["spv"]) if func == "single": - assert (check_file_lines(file_lines, "test_line_info.py", 16)) assert (check_file_lines(file_lines, "test_line_info.py", 17)) + assert (check_file_lines(file_lines, "test_line_info.py", 18)) elif func == "call": - assert (check_file_lines(file_lines, "test_line_info.py", 29)) - assert (check_file_lines(file_lines, "test_line_info.py", 22)) - assert (check_file_lines(file_lines, "test_line_info.py", 31)) + assert (check_file_lines(file_lines, "test_line_info.py", 30)) + assert (check_file_lines(file_lines, "test_line_info.py", 23)) + assert (check_file_lines(file_lines, "test_line_info.py", 32)) elif func == "call_noinline": - assert (check_file_lines(file_lines, "test_line_info.py", 43)) - assert (check_file_lines(file_lines, "test_line_info.py", 36)) + assert (check_file_lines(file_lines, "test_line_info.py", 44)) assert (check_file_lines(file_lines, "test_line_info.py", 37)) assert (check_file_lines(file_lines, "test_line_info.py", 38)) + assert (check_file_lines(file_lines, "test_line_info.py", 39)) elif func == "multi_files": - assert (check_file_lines(file_lines, "test_line_info.py", 48)) - assert (check_file_lines(file_lines, "test_line_info.py", 50)) + assert (check_file_lines(file_lines, "test_line_info.py", 49)) + assert (check_file_lines(file_lines, "test_line_info.py", 51)) assert (check_file_lines(file_lines, "standard.py", 33)) assert (check_file_lines(file_lines, "standard.py", 34)) assert (check_file_lines(file_lines, "standard.py", 36)) elif func == "autotune": - assert (check_file_lines(file_lines, "test_line_info.py", 61)) assert (check_file_lines(file_lines, "test_line_info.py", 62)) assert (check_file_lines(file_lines, "test_line_info.py", 63)) + assert (check_file_lines(file_lines, "test_line_info.py", 64)) elif func == "dot_combine": - assert (check_file_lines(file_lines, "test_line_info.py", 73)) - assert (check_file_lines(file_lines, 
"test_line_info.py", 74, should_contain=False)) + assert (check_file_lines(file_lines, "test_line_info.py", 74)) + assert (check_file_lines(file_lines, "test_line_info.py", 75, should_contain=False)) diff --git a/python/test/unit/language/test_random.py b/python/test/unit/language/test_random.py index 76e04f8390..47c4ae9a4f 100644 --- a/python/test/unit/language/test_random.py +++ b/python/test/unit/language/test_random.py @@ -2,6 +2,7 @@ import pytest import scipy.stats import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/operators/test_blocksparse.py b/python/test/unit/operators/test_blocksparse.py index 72316832f0..644d49151a 100644 --- a/python/test/unit/operators/test_blocksparse.py +++ b/python/test/unit/operators/test_blocksparse.py @@ -1,5 +1,6 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.ops diff --git a/python/test/unit/operators/test_cross_entropy.py b/python/test/unit/operators/test_cross_entropy.py index 144494f481..bc1bfbd751 100644 --- a/python/test/unit/operators/test_cross_entropy.py +++ b/python/test/unit/operators/test_cross_entropy.py @@ -1,5 +1,6 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.ops diff --git a/python/test/unit/operators/test_flash_attention.py b/python/test/unit/operators/test_flash_attention.py index e53074d992..d1e60a925c 100644 --- a/python/test/unit/operators/test_flash_attention.py +++ b/python/test/unit/operators/test_flash_attention.py @@ -1,5 +1,6 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.ops diff --git a/python/test/unit/operators/test_inductor.py b/python/test/unit/operators/test_inductor.py index de64bf4953..207e0277dd 100644 --- a/python/test/unit/operators/test_inductor.py +++ 
b/python/test/unit/operators/test_inductor.py @@ -1,5 +1,6 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/operators/test_matmul.py b/python/test/unit/operators/test_matmul.py index 62d874280c..e1216303dc 100644 --- a/python/test/unit/operators/test_matmul.py +++ b/python/test/unit/operators/test_matmul.py @@ -2,6 +2,7 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/runtime/test_autotuner.py b/python/test/unit/runtime/test_autotuner.py index 1c789a9758..8a83223f13 100644 --- a/python/test/unit/runtime/test_autotuner.py +++ b/python/test/unit/runtime/test_autotuner.py @@ -1,4 +1,5 @@ import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py index d700e7a96d..851da21e09 100644 --- a/python/test/unit/runtime/test_cache.py +++ b/python/test/unit/runtime/test_cache.py @@ -5,6 +5,7 @@ import pytest import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/python/test/unit/runtime/test_driver.py b/python/test/unit/runtime/test_driver.py index 103b2ef520..4e6a4e2561 100644 --- a/python/test/unit/runtime/test_driver.py +++ b/python/test/unit/runtime/test_driver.py @@ -1,4 +1,5 @@ import sys +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton diff --git a/python/test/unit/runtime/test_launch.py b/python/test/unit/runtime/test_launch.py index ecd1aa5059..ee1120027c 100644 --- a/python/test/unit/runtime/test_launch.py +++ b/python/test/unit/runtime/test_launch.py @@ -8,6 +8,7 @@ import tracemalloc import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton 
import triton.language as tl diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py index 5633fba0cc..c96ec58c14 100644 --- a/python/test/unit/runtime/test_subproc.py +++ b/python/test/unit/runtime/test_subproc.py @@ -3,6 +3,7 @@ import shutil import torch +import intel_extension_for_pytorch # type: ignore # noqa: F401 import triton import triton.language as tl diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 6626193f46..16a8c4ef4e 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -47,7 +47,7 @@ export TRITON_PROJ_BUILD=$TRITON_PROJ/python/build python3 -m pip install lit python3 -m pip install pytest -python3 -m pip install torch==1.13.0a0+git6c9b55e intel_extension_for_pytorch==1.13.120+xpu -f https://developer.intel.com/ipex-whl-stable-xpu +python3 -m pip install torch==2.1.0a0+cxx11.abi intel_extension_for_pytorch==2.1.10+xpu -f https://developer.intel.com/ipex-whl-stable-xpu if [ $? -ne 0 ]; then echo "FAILED: return code $?" exit $? 
diff --git a/third_party/xpu/backend/driver.c b/third_party/xpu/backend/driver.c index b456b6acd5..07a2ff6e24 100644 --- a/third_party/xpu/backend/driver.c +++ b/third_party/xpu/backend/driver.c @@ -144,15 +144,15 @@ bool update(sycl::queue sycl_queue) { // Get l0-context auto sycl_context = sycl_queue.get_context(); ze_context_handle_t hCtxt = - get_native<sycl::backend::level_zero>(sycl_context); + get_native<sycl::backend::ext_oneapi_level_zero>(sycl_context); // Get l0-device std::vector<sycl::device> sycl_devices = sycl_context.get_devices(); ze_device_handle_t hDev = - get_native<sycl::backend::level_zero>(sycl_devices[0]); + get_native<sycl::backend::ext_oneapi_level_zero>(sycl_devices[0]); // Get l0-queue bool immediate_cmd_list = false; std::variant<ze_command_queue_handle_t, ze_command_list_handle_t> queue_var = - get_native<sycl::backend::level_zero>(sycl_queue); + get_native<sycl::backend::ext_oneapi_level_zero>(sycl_queue); auto l0_queue = std::get_if<ze_command_queue_handle_t>(&queue_var); if (l0_queue == nullptr) { auto imm_cmd_list = std::get_if<ze_command_list_handle_t>(&queue_var); @@ -170,15 +170,19 @@ bool update(sycl::queue sycl_queue) { context = sycl_queue_map[sycl_queue].context; uint32_t deviceCount = std::min(sycl_devices.size(), devices.size()); for (uint32_t i = 0; i < deviceCount; ++i) { - devices[i] = sycl::get_native<sycl::backend::level_zero>(sycl_devices[i]); + devices[i] = + sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_devices[i]); } return true; } static PyObject *initContext(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); if (sycl_queue_map.find(*sycl_queue) == sycl_queue_map.end()) { @@ -202,8 +206,11 @@ static PyObject *initEventPool(PyObject *self, PyObject *args) { } static PyObject *initDevices(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); @@ -215,16 
+222,19 @@ static PyObject *initDevices(PyObject *self, PyObject *args) { // Retrieve devices uint32_t deviceCount = sycl_devices.size(); for (uint32_t i = 0; i < deviceCount; ++i) { - devices.push_back( - sycl::get_native<sycl::backend::level_zero>(sycl_devices[i])); + devices.push_back(sycl::get_native<sycl::backend::ext_oneapi_level_zero>( + sycl_devices[i])); } return Py_BuildValue("(i)", deviceCount); } static PyObject *getL0ImmCommandList(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); @@ -234,8 +244,11 @@ static PyObject *getL0ImmCommandList(PyObject *self, PyObject *args) { return Py_BuildValue("(K)", (uint64_t)(sycl_queue_map[*sycl_queue].cmd_list)); } static PyObject *getL0Queue(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); if (sycl_queue_map.find(*sycl_queue) == sycl_queue_map.end()) { @@ -244,8 +257,11 @@ static PyObject *getL0Queue(PyObject *self, PyObject *args) { return Py_BuildValue("(K)", (uint64_t)(sycl_queue_map[*sycl_queue].queue)); } static PyObject *getL0DevPtr(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); if (sycl_queue_map.find(*sycl_queue) == sycl_queue_map.end()) { @@ -254,8 +270,11 @@ static PyObject *getL0DevPtr(PyObject *self, PyObject *args) { return Py_BuildValue("(K)", 
(uint64_t)(sycl_queue_map[*sycl_queue].device)); } static PyObject *getL0CtxtPtr(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); if (sycl_queue_map.find(*sycl_queue) == sycl_queue_map.end()) { @@ -264,8 +283,11 @@ static PyObject *getL0CtxtPtr(PyObject *self, PyObject *args) { return Py_BuildValue("(K)", (uint64_t)(sycl_queue_map[*sycl_queue].context)); } static PyObject *isUsingICL(PyObject *self, PyObject *args) { - void *queue; - if (!PyArg_ParseTuple(args, "K", &queue)) + PyObject *cap; + void *queue = NULL; + if (!PyArg_ParseTuple(args, "O", &cap)) + return NULL; + if (!(queue = PyCapsule_GetPointer(cap, PyCapsule_GetName(cap)))) return NULL; sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue); if (sycl_queue_map.find(*sycl_queue) == sycl_queue_map.end()) {