From ff50d6f9830e9190d3813a554da1b5374f1d9e59 Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Wed, 8 Jan 2025 17:41:32 +0000
Subject: [PATCH 1/4] Propagate alignment of memory accesses to simd_selection

Currently the alignment is always 8 bytes (Arch.size_int), but passing
it as an argument will let us take alignment into account when selecting
new vector sequences.
---
 backend/amd64/simd_selection.ml | 19 +++++++++++++++----
 backend/arm64/simd_selection.ml |  3 ++-
 backend/cfg/vectorize.ml        | 14 +++++++++++++-
 backend/vectorize_utils.ml      |  6 ++++++
 backend/vectorize_utils.mli     |  4 ++++
 5 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/backend/amd64/simd_selection.ml b/backend/amd64/simd_selection.ml
index 526d37d3f11..306c2ff4082 100644
--- a/backend/amd64/simd_selection.ml
+++ b/backend/amd64/simd_selection.ml
@@ -474,12 +474,24 @@ let vector_width_in_bits = 128
 
 (* CR-soon gyorsh: [vectorize_operation] is too long, refactor / split up. *)
 let vectorize_operation (width_type : Vectorize_utils.Width_in_bits.t)
-    ~arg_count ~res_count (cfg_ops : Operation.t list) :
+    ~arg_count ~res_count ~alignment_in_bytes (cfg_ops : Operation.t list) :
     Vectorize_utils.Vectorized_instruction.t list option =
   (* Assumes cfg_ops are isomorphic *)
   let width_in_bits = Vectorize_utils.Width_in_bits.to_int width_type in
   let length = List.length cfg_ops in
   assert (length * width_in_bits = vector_width_in_bits);
+  let vector_width_in_bytes = vector_width_in_bits / 8 in
+  let is_aligned_to_vector_width () =
+    match alignment_in_bytes with
+    | None -> Misc.fatal_error "Unexpected memory operation"
+    | Some alignment_in_bytes ->
+      Int.compare alignment_in_bytes vector_width_in_bytes >= 0
+  in
+  let vec128_chunk () : Cmm.memory_chunk =
+    if is_aligned_to_vector_width ()
+    then Onetwentyeight_aligned
+    else Onetwentyeight_unaligned
+  in
   let same_width memory_chunk =
     Vectorize_utils.Width_in_bits.equal width_type
       (Vectorize_utils.Width_in_bits.of_memory_chunk memory_chunk)
@@ -650,7 +662,7 @@ let vectorize_operation (width_type : Vectorize_utils.Width_in_bits.t)
       assert (arg_count = num_args_addressing && res_count = 1);
       let operation =
         Operation.Load
-          { memory_chunk = Onetwentyeight_unaligned;
+          { memory_chunk = vec128_chunk ();
             addressing_mode;
             mutability;
             is_atomic
@@ -670,8 +682,7 @@ let vectorize_operation (width_type : Vectorize_utils.Width_in_bits.t)
       let num_args_addressing = Arch.num_args_addressing addressing_mode in
       assert (arg_count = num_args_addressing + 1 && res_count = 0);
       let operation =
-        Operation.Store
-          (Onetwentyeight_unaligned, addressing_mode, is_assignment)
+        Operation.Store (vec128_chunk (), addressing_mode, is_assignment)
       in
       Some
         [ { operation;
diff --git a/backend/arm64/simd_selection.ml b/backend/arm64/simd_selection.ml
index 3e18e247129..87a2a7eff3d 100644
--- a/backend/arm64/simd_selection.ml
+++ b/backend/arm64/simd_selection.ml
@@ -24,6 +24,7 @@ let pseudoregs_for_operation _ arg res = arg, res
 
 let vector_width_in_bits = 128
 
-let vectorize_operation _ ~arg_count:_ ~res_count:_ (_ : Operation.t list) :
+let vectorize_operation _ ~arg_count:_ ~res_count:_ ~alignment_in_bytes:_
+    (_ : Operation.t list) :
     Vectorize_utils.Vectorized_instruction.t list option =
   None
diff --git a/backend/cfg/vectorize.ml b/backend/cfg/vectorize.ml
index 77703302f84..6b79d8ec7f8 100644
--- a/backend/cfg/vectorize.ml
+++ b/backend/cfg/vectorize.ml
@@ -638,6 +638,8 @@ module Dependencies : sig
       type t
 
       val first_memory_arg_index : t -> int
+
+      val alignment_in_bytes : t -> int
     end
   end
 
@@ -821,6 +823,8 @@ end = struct
       type t
 
       val first_memory_arg_index : t -> int
+
+      val alignment_in_bytes : t -> int
     end
 
     module Dependencies : sig
@@ -918,6 +922,8 @@ end = struct
 
       val first_memory_arg_index : t -> int
 
+      val alignment_in_bytes : t -> int
+
       val get_instruction_id : t -> Instruction.Id.t
 
       (** [is_adjacent t1 t2] assumes that [t1] and [t2] have isomorphic operations,
@@ -956,6 +962,9 @@ end = struct
       let first_memory_arg_index t =
         Memory_access.first_memory_arg_index t.memory_access
 
+      let alignment_in_bytes t =
+        Vectorize_utils.Memory_access.alignment_in_bytes t.memory_access
+
       let get_instruction_id t = Instruction.id t.instruction
 
       let memory_access (instruction : Instruction.t) : Memory_access.t option =
@@ -2134,12 +2143,15 @@ end = struct
              && can_vectorize_memory_accesses mem_op instructions deps)
         then None
         else
+          let alignment_in_bytes =
+            Option.map Dependencies.Memory.Operation.alignment_in_bytes mem_op
+          in
           let cfg_ops =
             List.map (fun i -> i |> Instruction.op |> Option.get) instructions
           in
           let vector_instructions =
             Simd_selection.vectorize_operation width_in_bits ~arg_count
-              ~res_count cfg_ops
+              ~res_count ~alignment_in_bytes cfg_ops
           in
           match vector_instructions with
           | None -> None
diff --git a/backend/vectorize_utils.ml b/backend/vectorize_utils.ml
index f119306bbe8..82cf933595d 100644
--- a/backend/vectorize_utils.ml
+++ b/backend/vectorize_utils.ml
@@ -72,6 +72,12 @@ module Memory_access = struct
   let desc t = t.desc
 
   let first_memory_arg_index t = t.first_memory_arg_index
+
+  let alignment_in_bytes t =
+    (* CR-someday gyorsh: propagate alignment of base address (such as
+       bigarray). Can be used to emit more efficient vector sequences, for
+       example, arithmetic operations with memory arguments (not stack). *)
+    Arch.size_int
 end
 
 module Vectorized_instruction = struct
diff --git a/backend/vectorize_utils.mli b/backend/vectorize_utils.mli
index 43e6961f35a..7487a69978d 100644
--- a/backend/vectorize_utils.mli
+++ b/backend/vectorize_utils.mli
@@ -53,6 +53,10 @@ module Memory_access : sig
   val desc : t -> desc
 
   val first_memory_arg_index : t -> int
+
+  (** Base address of memory access [t] is guaranteed to be aligned to
+      at least [alignment_in_bytes t]. *)
+  val alignment_in_bytes : t -> int
 end
 
 module Vectorized_instruction : sig

From 9e238e25ec7d57fadf777bf15492264a5c51a1a0 Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Tue, 14 Jan 2025 13:16:24 +0000
Subject: [PATCH 2/4] Enable warnings on the new files

---
 backend/amd64/vectorize_specific.ml | 2 ++
 backend/arm64/vectorize_specific.ml | 2 ++
 backend/vectorize_utils.ml          | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/backend/amd64/vectorize_specific.ml b/backend/amd64/vectorize_specific.ml
index 2f535ca2834..e6bcf4cfd58 100644
--- a/backend/amd64/vectorize_specific.ml
+++ b/backend/amd64/vectorize_specific.ml
@@ -1,3 +1,5 @@
+[@@@ocaml.warning "+a-40-42"]
+
 (* Keep in sync with [Arch.operation_is_pure], [Arch.operation_can_raise],
    [Arch.operation_allocates]. *)
 module Memory_access = Vectorize_utils.Memory_access
diff --git a/backend/arm64/vectorize_specific.ml b/backend/arm64/vectorize_specific.ml
index 5eb1ff3886e..550f505c778 100644
--- a/backend/arm64/vectorize_specific.ml
+++ b/backend/arm64/vectorize_specific.ml
@@ -1,3 +1,5 @@
+[@@@ocaml.warning "+a-40-42"]
+
 (* Keep in sync with [Arch.operation_is_pure], [Arch.operation_can_raise],
    [Arch.operation_allocates]. *)
 module Memory_access = Vectorize_utils.Memory_access
diff --git a/backend/vectorize_utils.ml b/backend/vectorize_utils.ml
index 82cf933595d..848b2e91148 100644
--- a/backend/vectorize_utils.ml
+++ b/backend/vectorize_utils.ml
@@ -1,3 +1,5 @@
+[@@@ocaml.warning "+a-40-42"]
+
 open Arch
 
 module Width_in_bits = struct

From ce9d4f28a496b67d383216e30a7d11c0205bdde8 Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Tue, 14 Jan 2025 13:17:11 +0000
Subject: [PATCH 3/4] Fix warnings

---
 backend/vectorize_utils.ml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/vectorize_utils.ml b/backend/vectorize_utils.ml
index 848b2e91148..87a79c49011 100644
--- a/backend/vectorize_utils.ml
+++ b/backend/vectorize_utils.ml
@@ -75,7 +75,7 @@ module Memory_access = struct
 
   let first_memory_arg_index t = t.first_memory_arg_index
 
-  let alignment_in_bytes t =
+  let alignment_in_bytes _t =
     (* CR-someday gyorsh: propagate alignment of base address (such as
        bigarray). Can be used to emit more efficient vector sequences, for
        example, arithmetic operations with memory arguments (not stack). *)

From ac2955bd9b594735a1bf11a927fa5d7cd75c75de Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Tue, 14 Jan 2025 13:26:34 +0000
Subject: [PATCH 4/4] Improve alignment check to be more robust

---
 backend/amd64/simd_selection.ml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/amd64/simd_selection.ml b/backend/amd64/simd_selection.ml
index 306c2ff4082..ed10039d1c3 100644
--- a/backend/amd64/simd_selection.ml
+++ b/backend/amd64/simd_selection.ml
@@ -485,7 +485,8 @@ let vectorize_operation (width_type : Vectorize_utils.Width_in_bits.t)
     match alignment_in_bytes with
     | None -> Misc.fatal_error "Unexpected memory operation"
     | Some alignment_in_bytes ->
-      Int.compare alignment_in_bytes vector_width_in_bytes >= 0
+      alignment_in_bytes > 0
+      && alignment_in_bytes mod vector_width_in_bytes = 0
   in
   let vec128_chunk () : Cmm.memory_chunk =
     if is_aligned_to_vector_width ()