Merge commit '6d3ed0b91116e1e238a56f8b1d0d7cdaa2141911'
whitneywhtsang committed Nov 27, 2024
2 parents 4fe1e6f + 6d3ed0b commit 637f0c3
Showing 149 changed files with 6,779 additions and 6,784 deletions.
6 changes: 3 additions & 3 deletions bin/triton-tensor-layout.cpp
@@ -22,16 +22,16 @@ using namespace mlir;
// clang-format off
// Example usage:
//
-// triton-tensor-layout -l "#triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>" -t "tensor<128x256xf16>"
+// triton-tensor-layout -l "#ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>" -t "tensor<128x256xf16>"
//
// triton-tensor-layout -i input.mlir -t "tensor<1x128x128xf16>" -o output.txt
//
// triton-tensor-layout -i input.mlir -t "tensor<1x128x128xf16>" -o output.txt -alias-names="blocked,mma" -use-hw-view
//
// An input file usually looks like:
// '''
-// #mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
-// #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
+// #mma = #ttg.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
+// #blocked = #ttg.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
// '''
// clang-format on

30 changes: 15 additions & 15 deletions docs/ARCHITECTURE.md
@@ -314,7 +314,7 @@ Example 1, a row-major coalesced layout may partition a 16x16 tensor over 2 warp
```
for
```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -341,7 +341,7 @@ Example 2, a row-major coalesced layout may partition a 32x32 tensor over 2 warp
```
for
```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -373,7 +373,7 @@ CTA [1,0] CTA [1,1]
```
for
```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -403,25 +403,25 @@ A single dot operator is likely to be mapped to multiple MMA instructions. For N
### Layout conversion
To produce the desired memory behavior described in the previous section, triton GPU introduces layouts conversion (by means of ConvertLayoutOp). An input tensor represented in a blocked layout is sliced and inserted into a shared layout, e.g.:
```
-%61 = triton_gpu.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
-triton_gpu.async_commit_group
+%61 = ttg.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
+ttg.async_commit_group
```

The main loop of the GEMM would then extract a slice (a reimplementation of tensor.extract_slice [25]) from the shared memory, converting arguments to the dot layout and producing mma layout with the dot operator, e.g.:
<pre><code>
-%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
-%126 = <b>triton_gpu.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
-%127 = <b>triton_gpu.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
-%128 = <b>triton_gpu.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
-%129 = <b>triton_gpu.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
-%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
-%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
+%126 = <b>ttg.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
+%127 = <b>ttg.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
+%128 = <b>ttg.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
+%129 = <b>ttg.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
+%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
...
</code></pre>

The result of the processing is then converted back to blocked layout to be stored to the main GPU memory, e.g.:
```
-%125 = triton_gpu.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
+%125 = ttg.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
tt.store %117, %125, %124 {cache = 1 : i32, evict = 1 : i32} : tensor<64x128xf16, #blocked1>
```
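
For orientation, here is a minimal, standalone C++ sketch (illustrative only, not part of the change above) that reproduces the element-to-thread mapping of the 16x16 blocked-layout example earlier in this file. It assumes the usual blocked-layout semantics, with dimension 1 (columns) as the fastest-varying axis (order = [1, 0]) and 32 threads per warp.

```
// Illustrative sketch: print which hardware thread owns each element of a
// 16x16 tensor under sizePerThread = {2, 2}, threadsPerWarp = {8, 4},
// warpsPerCTA = {1, 2}, order = [1, 0].
#include <cstdio>

int main() {
  const int sizePerThread[2] = {2, 2};
  const int threadsPerWarp[2] = {8, 4};
  const int warpsPerCTA[2] = {1, 2};
  const int shape[2] = {16, 16};

  for (int i = 0; i < shape[0]; ++i) {
    for (int j = 0; j < shape[1]; ++j) {
      // Thread coordinates inside the warp.
      int tRow = (i / sizePerThread[0]) % threadsPerWarp[0];
      int tCol = (j / sizePerThread[1]) % threadsPerWarp[1];
      int lane = tRow * threadsPerWarp[1] + tCol;
      // Warp coordinates inside the CTA.
      int wRow = (i / (sizePerThread[0] * threadsPerWarp[0])) % warpsPerCTA[0];
      int wCol = (j / (sizePerThread[1] * threadsPerWarp[1])) % warpsPerCTA[1];
      int warp = wRow * warpsPerCTA[1] + wCol;
      // Flat thread id (32 threads per warp), matching the grids shown above.
      std::printf("%3d", warp * 32 + lane);
    }
    std::printf("\n");
  }
  return 0;
}
```

With these parameters the first printed row is 0 0 1 1 2 2 3 3 32 32 33 33 34 34 35 35: each thread owns a 2x2 tile, eight lane rows cover rows 0-15 in pairs, four lane columns cover columns 0-7, and the second warp covers columns 8-15.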

@@ -454,12 +454,12 @@ Is translated to:
```
%a: tensor<128x32xf16, #enc>
%a_tmp = tensor.extract_slice %a[0, 0] [128, 16]
-%a_prefetch = triton_gpu.convert_layout %a_tmp
+%a_prefetch = ttg.convert_layout %a_tmp
scf.for %iv = ... iter_args(%a_buf = %a, ..., %a_prefetch_arg = %a_prefetch)
{
%x = tt.dot %a_arg, %b, %c
%a_tmp_rem = tensor.extract_slice %a_buf[0, 16] [128, 16]
-%a_prefetch_next = triton_gpu.convert_layout %a_tmp_rem
+%a_prefetch_next = ttg.convert_layout %a_tmp_rem
...
scf.yield %next_a, ..., %a_prefetch_next
}
30 changes: 15 additions & 15 deletions docs/getting-started/architecture.rst
@@ -355,7 +355,7 @@ for

.. code-block:: none
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -385,7 +385,7 @@ for

.. code-block:: none
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -420,7 +420,7 @@ for

.. code-block:: none
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -458,30 +458,30 @@ To produce the desired memory behavior described in the previous section, Triton

.. code-block:: none
-%61 = triton_gpu.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
-triton_gpu.async_commit_group
+%61 = ttg.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
+ttg.async_commit_group
The main loop of the GEMM would then extract a slice (a reimplementation of tensor.extract_slice [c25]_) from the shared memory, converting arguments to the dot layout and producing mma layout with the dot operator, for example:

.. raw:: html

<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>
-%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
-%126 = <b>triton_gpu.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
-%127 = <b>triton_gpu.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
-%128 = <b>triton_gpu.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
-%129 = <b>triton_gpu.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
-%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
-%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
+%126 = <b>ttg.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
+%127 = <b>ttg.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
+%128 = <b>ttg.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
+%129 = <b>ttg.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
+%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
...
</pre></div></div>

The result of the processing is then converted back to blocked layout to be stored to the main GPU memory, for example:

.. code-block:: none
-%125 = triton_gpu.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
+%125 = ttg.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
tt.store %117, %125, %124 {cache = 1 : i32, evict = 1 : i32} : tensor<64x128xf16, #blocked1>
@@ -520,12 +520,12 @@ Is translated to:
%a: tensor<128x32xf16, #enc>
%a_tmp = tensor.extract_slice %a[0, 0] [128, 16]
-%a_prefetch = triton_gpu.convert_layout %a_tmp
+%a_prefetch = ttg.convert_layout %a_tmp
scf.for %iv = ... iter_args(%a_buf = %a, ..., %a_prefetch_arg = %a_prefetch)
{
%x = tt.dot %a_arg, %b, %c
%a_tmp_rem = tensor.extract_slice %a_buf[0, 16] [128, 16]
-%a_prefetch_next = triton_gpu.convert_layout %a_tmp_rem
+%a_prefetch_next = ttg.convert_layout %a_tmp_rem
...
scf.yield %next_a, ..., %a_prefetch_next
}
4 changes: 2 additions & 2 deletions include/triton/Analysis/Allocation.h
@@ -180,8 +180,8 @@ class Allocation {
private:
/// A class that represents a shared memory buffer
struct BufferT {
-/// Explicit: triton_gpu.local_alloc
-/// Scratch: triton_gpu.convert_layout
+/// Explicit: ttg.local_alloc
+/// Scratch: ttg.convert_layout
/// Virtual: triton.call
enum class BufferKind { Explicit, Scratch, Virtual };

2 changes: 1 addition & 1 deletion include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -400,7 +400,7 @@ inline Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,

ModuleOp mod = funcOp.getOperation()->getParentOfType<ModuleOp>();
auto allocSizeAttr = mod.getOperation()->getAttrOfType<mlir::IntegerAttr>(
-"triton_gpu.global_scratch_memory_size");
+"ttg.global_scratch_memory_size");
if (!allocSizeAttr) {
return gmemBase;
}
@@ -12,11 +12,11 @@ template <typename T> class OperationPass;

namespace triton {

-constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps";
-constexpr static char AttrNumCTAsName[] = "triton_gpu.num-ctas";
-constexpr static char AttrTargetName[] = "triton_gpu.target";
+constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
+constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
+constexpr static char AttrTargetName[] = "ttg.target";

-constexpr static char AttrNumThreadsPerWarp[] = "triton_gpu.threads-per-warp";
+constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";

// Create the pass with numWarps passed from cl::opt.
std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonToTritonGPUPass();
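
For illustration only, here is a small C++ sketch of how module-level attributes with these renamed names might be attached and queried. It assumes the standard MLIR C++ API (OpBuilder, ModuleOp, IntegerAttr); the chosen values (4 warps, 32 threads per warp) are placeholders for the example, not taken from this change.

```
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"

// Sketch: attach the renamed "ttg.*" module attributes to a fresh module
// and read one of them back.
int main() {
  mlir::MLIRContext ctx;
  mlir::OpBuilder b(&ctx);
  mlir::ModuleOp mod = mlir::ModuleOp::create(b.getUnknownLoc());

  mod->setAttr("ttg.num-warps", b.getI32IntegerAttr(4));
  mod->setAttr("ttg.threads-per-warp", b.getI32IntegerAttr(32));

  if (auto numWarps = mod->getAttrOfType<mlir::IntegerAttr>("ttg.num-warps"))
    llvm::outs() << "ttg.num-warps = " << numWarps.getInt() << "\n";

  mod->erase(); // clean up the manually created module
  return 0;
}
```

The dialect helpers in TritonGPUDialect.td further down in this diff read back exactly these attribute names (ttg.num-warps, ttg.num-ctas, ttg.threads-per-warp).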
8 changes: 4 additions & 4 deletions include/triton/Dialect/TritonGPU/IR/CMakeLists.txt
@@ -1,12 +1,12 @@
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
-mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu)
-mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu)
+mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttg)
+mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttg)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
-mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu)
-mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu)
+mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=ttg)
+mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=ttg)
add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonGPUTableGen)
6 changes: 3 additions & 3 deletions include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -616,7 +616,7 @@ Example 1, a row-major coalesced layout may partition a 16x16 tensor over 2 warp

for

-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -642,7 +642,7 @@ Example 2, a row-major coalesced layout may partition a 32x32 tensor over 2 warp
[ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ]
for

-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
@@ -672,7 +672,7 @@ CTA [1,0] CTA [1,1]
[ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ] [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ]
for

-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
sizePerThread = {2, 2}
threadsPerWarp = {8, 4}
warpsPerCTA = {1, 2}
18 changes: 9 additions & 9 deletions include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
@@ -4,7 +4,7 @@
include "mlir/IR/OpBase.td"

def TritonGPU_Dialect : Dialect {
-let name = "triton_gpu";
+let name = "ttg";

let cppNamespace = "::mlir::triton::gpu";

@@ -21,24 +21,24 @@ def TritonGPU_Dialect : Dialect {
];

let extraClassDeclaration = [{
-static std::string getNumWarpsAttrName() { return "triton_gpu.num-warps"; }
+static std::string getNumWarpsAttrName() { return "ttg.num-warps"; }
static int getNumWarps(ModuleOp mod) {
-if (!mod->hasAttr("triton_gpu.num-warps"))
+if (!mod->hasAttr("ttg.num-warps"))
llvm::report_fatal_error(
-"TritonGPU module should contain a triton_gpu.num-warps attribute");
-return cast<IntegerAttr>(mod->getAttr("triton_gpu.num-warps")).getInt();
+"TritonGPU module should contain a ttg.num-warps attribute");
+return cast<IntegerAttr>(mod->getAttr("ttg.num-warps")).getInt();
}
static int getNumCTAs(ModuleOp mod) {
-if (!mod->hasAttr("triton_gpu.num-ctas"))
+if (!mod->hasAttr("ttg.num-ctas"))
return 1;
-return cast<IntegerAttr>(mod->getAttr("triton_gpu.num-ctas")).getInt();
+return cast<IntegerAttr>(mod->getAttr("ttg.num-ctas")).getInt();
}
void registerTypes();

-static std::string getThreadsPerWarpAttrName() { return "triton_gpu.threads-per-warp"; }
+static std::string getThreadsPerWarpAttrName() { return "ttg.threads-per-warp"; }

static int getThreadsPerWarp(ModuleOp mod) {
-Attribute threadsPerWarp = mod->getDiscardableAttr("triton_gpu.threads-per-warp");
+Attribute threadsPerWarp = mod->getDiscardableAttr("ttg.threads-per-warp");
if(!threadsPerWarp) {
return 32;
}