From 2c72dca725a99ff787009b8f8c85a0a38f11eb75 Mon Sep 17 00:00:00 2001 From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:19:25 +0000 Subject: [PATCH 1/2] Add tests --- .../tests/backend/vectorizer/dune.inc | 532 ++++++++++++++++++ .../tests/backend/vectorizer/gen/gen_dune.ml | 7 + .../backend/vectorizer/test_arrays.expected | 9 + .../tests/backend/vectorizer/test_arrays.ml | 141 +++++ .../tests/backend/vectorizer/test_arrays.mli | 1 + .../test_arrays_vectorized.cmx.dump.expected | 1 + .../backend/vectorizer/test_float.expected | 7 + .../tests/backend/vectorizer/test_float.ml | 75 +++ .../tests/backend/vectorizer/test_float.mli | 1 + .../vectorizer/test_float32_unboxed.expected | 5 + .../vectorizer/test_float32_unboxed.ml | 225 ++++++++ .../vectorizer/test_float32_unboxed.mli | 1 + ...oat32_unboxed_vectorized.cmx.dump.expected | 1 + .../vectorizer/test_float_unboxed.expected | 2 + .../backend/vectorizer/test_float_unboxed.ml | 80 +++ .../backend/vectorizer/test_float_unboxed.mli | 1 + ...float_unboxed_vectorized.cmx.dump.expected | 2 + .../test_float_vectorized.cmx.dump.expected | 7 + .../vectorizer/test_int32_unboxed.expected | 7 + .../backend/vectorizer/test_int32_unboxed.ml | 229 ++++++++ .../backend/vectorizer/test_int32_unboxed.mli | 1 + ...int32_unboxed_vectorized.cmx.dump.expected | 3 + .../backend/vectorizer/test_int64.expected | 7 + .../tests/backend/vectorizer/test_int64.ml | 79 +++ .../tests/backend/vectorizer/test_int64.mli | 1 + .../vectorizer/test_int64_unboxed.expected | 3 + .../backend/vectorizer/test_int64_unboxed.ml | 61 ++ .../backend/vectorizer/test_int64_unboxed.mli | 1 + ...int64_unboxed_vectorized.cmx.dump.expected | 3 + .../test_int64_vectorized.cmx.dump.expected | 3 + 30 files changed, 1496 insertions(+) create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected 
create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected create mode 100644 flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected diff --git a/flambda-backend/tests/backend/vectorizer/dune.inc b/flambda-backend/tests/backend/vectorizer/dune.inc index 67a51f80bb5..cb368edfd8b 100644 --- a/flambda-backend/tests/backend/vectorizer/dune.inc +++ b/flambda-backend/tests/backend/vectorizer/dune.inc @@ -75,6 +75,538 @@ (action (diff test1_vectorized.expected test1_vectorized.output))) +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_runner.exe test_arrays.cmx.dump) + (deps test_arrays.mli test_arrays.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_arrays_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays.output + (run ./test_arrays_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays.expected test_arrays.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.ml test_arrays_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.mli test_arrays_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_vectorized_runner.exe test_arrays_vectorized.cmx.dump) + (deps test_arrays_vectorized.mli test_arrays_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_arrays_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_arrays_vectorized.cmx.dump.output) + (deps ./filter.sh test_arrays_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_arrays_vectorized.cmx.dump.expected test_arrays_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays_vectorized.output + (run ./test_arrays_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.expected test_arrays_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays_vectorized.expected test_arrays_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_runner.exe test_int64_unboxed.cmx.dump) + (deps test_int64_unboxed.mli test_int64_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file 
-dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int64_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed.output + (run ./test_int64_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed.expected test_int64_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.ml test_int64_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.mli test_int64_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_vectorized_runner.exe test_int64_unboxed_vectorized.cmx.dump) + (deps test_int64_unboxed_vectorized.mli test_int64_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int64_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_unboxed_vectorized.cmx.dump.expected test_int64_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed_vectorized.output + (run ./test_int64_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.expected test_int64_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed_vectorized.expected test_int64_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_runner.exe test_float_unboxed.cmx.dump) + (deps test_float_unboxed.mli test_float_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed.output + (run ./test_float_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_unboxed.expected test_float_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.ml test_float_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.mli test_float_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_vectorized_runner.exe test_float_unboxed_vectorized.cmx.dump) + (deps test_float_unboxed_vectorized.mli test_float_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd 
-vectorize -o test_float_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_unboxed_vectorized.cmx.dump.expected test_float_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed_vectorized.output + (run ./test_float_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.expected test_float_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_unboxed_vectorized.expected test_float_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_runner.exe test_int64.cmx.dump) + (deps test_int64.mli test_int64.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int64_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64.output + (run ./test_int64_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64.expected test_int64.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.ml test_int64_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.mli test_int64_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_vectorized_runner.exe test_int64_vectorized.cmx.dump) + (deps test_int64_vectorized.mli test_int64_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int64_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_vectorized.cmx.dump.expected test_int64_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_vectorized.output + (run ./test_int64_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.expected test_int64_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_vectorized.expected test_int64_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_runner.exe test_float.cmx.dump) + (deps test_float.mli test_float.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg 
-dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float.output + (run ./test_float_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float.expected test_float.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.ml test_float_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.mli test_float_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_vectorized_runner.exe test_float_vectorized.cmx.dump) + (deps test_float_vectorized.mli test_float_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_float_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_vectorized.cmx.dump.output) + (deps ./filter.sh test_float_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_vectorized.cmx.dump.expected test_float_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_vectorized.output + (run ./test_float_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.expected test_float_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_vectorized.expected test_float_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float32_unboxed_runner.exe test_float32_unboxed.cmx.dump) + (deps test_float32_unboxed.mli test_float32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_float32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float32_unboxed.output + (run ./test_float32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float32_unboxed.expected test_float32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.ml test_float32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.mli test_float32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float32_unboxed_vectorized_runner.exe test_float32_unboxed_vectorized.cmx.dump) + (deps test_float32_unboxed_vectorized.mli test_float32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_float32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target 
test_float32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float32_unboxed_vectorized.cmx.dump.expected test_float32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float32_unboxed_vectorized.output + (run ./test_float32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float32_unboxed.expected test_float32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float32_unboxed_vectorized.expected test_float32_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_runner.exe test_int32_unboxed.cmx.dump) + (deps test_int32_unboxed.mli test_int32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_int32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed.output + (run ./test_int32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed.expected test_int32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.ml test_int32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.mli test_int32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_vectorized_runner.exe test_int32_unboxed_vectorized.cmx.dump) + (deps test_int32_unboxed_vectorized.mli test_int32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_int32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int32_unboxed_vectorized.cmx.dump.expected test_int32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed_vectorized.output + (run ./test_int32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.expected test_int32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed_vectorized.expected test_int32_unboxed_vectorized.output))) + (rule (alias runtest) (enabled_if (= %{context_name} "main")) diff --git a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml 
b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml index 53062d52b9f..4a2e903dbe3 100644 --- a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml +++ b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml @@ -170,6 +170,13 @@ let print_test ?(filter_exit_code = 0) name = let () = print_test "test1"; + print_test "test_arrays"; + print_test "test_int64_unboxed"; + print_test "test_float_unboxed"; + print_test "test_int64"; + print_test "test_float"; + print_test "test_float32_unboxed"; + print_test "test_int32_unboxed"; (* can't vectorize *) print_test ~filter_exit_code:1 "test_register_compatible"; () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.expected b/flambda-backend/tests/backend/vectorizer/test_arrays.expected new file mode 100644 index 00000000000..e86cd1806ce --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.expected @@ -0,0 +1,9 @@ +add_arrays_unrolled_manually 17 18 19 20 21 22 23 24 25 26 +add_arrays_unrolled_safe 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec_unrolled_attribute 17 18 19 20 21 22 23 24 25 26 +add_arrays_for 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec 17 18 19 20 21 22 23 24 25 26 +initialize_array_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_arrays_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_array_unrolled_manually 17 17 17 17 17 17 17 17 17 17 +initialize_floatarray_unrolled_manually 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.ml b/flambda-backend/tests/backend/vectorizer/test_arrays.ml new file mode 100644 index 00000000000..fab21fe1453 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.ml @@ -0,0 +1,141 @@ +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_manually + a b c n = + for i = 0 to (n / 2) - 1 do + Array.unsafe_set c (i * 2) + (Array.unsafe_get a (i * 2) + Array.unsafe_get b (i * 2)); + Array.unsafe_set c + ((i * 2) + 1) + (Array.unsafe_get a ((i * 2) + 1) + Array.unsafe_get b ((i * 2) + 1)) + done; + if Int.rem n 2 = 1 + then + Array.unsafe_set c (n - 1) + (Array.unsafe_get a (n - 1) + Array.unsafe_get b (n - 1)) + +(* Currently won't be vectorized. Can vectorize it but it's not worth it + according to our cost model. It will be vectorized when we add vectors beyond + 128 or arrays of elements smaller than 64-bit. *) +let[@inline never] [@local never] [@specialize never] initialize_array_const_unrolled_manually + arr n = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i 0; + Array.unsafe_set arr (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. If different groups can reuse the new + register that holds the constants, this will be worth vectorizing even with + 128-bit vectors. *) +let[@inline never] [@local never] [@specialize never] initialize_arrays_const_unrolled_manually + a b c n = + let i = ref 0 in + while !i < n do + Array.unsafe_set a !i 0; + Array.unsafe_set a (!i + 1) 0; + Array.unsafe_set b !i 0; + Array.unsafe_set b (!i + 1) 0; + Array.unsafe_set c !i 0; + Array.unsafe_set c (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. Shuffling values into a vector is not yet + supported, only vector loads are. Also not worth it unless the shuffle is + outside the loop (loop invariant detection/motion would be needed for it). 
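Concretely, vectorizing this loop would mean broadcasting [v] into both 64-bit lanes of a 128-bit register once, before the loop, and then replacing the two scalar stores in the body with a single 16-byte store; the float variant below notes that this broadcast would be a [movddup].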
*) +let[@inline never] [@local never] [@specialize never] initialize_array_unrolled_manually + arr n (v : int) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* same as [initialize_array_unrolled_manually] except needs movddup. *) +let[@inline never] [@local never] [@specialize never] initialize_floatarray_unrolled_manually + arr n (v : float) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* cannot vectorize across basic blocks *) +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_safe a + b c n = + for i = 0 to n - 1 do + Array.set c (i * 2) (Array.get a (i * 2) + Array.get b (i * 2)); + Array.set c + ((i * 2) + 1) + (Array.get a ((i * 2) + 1) + Array.get b ((i * 2) + 1)) + done + +(* cannot vectorize across basic blocks. the unroll attribute is not sufficient to + eliminate the loop condition from the unrolled body (e.g., we would need to + track the fact that the bound is even). *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec_unrolled_attribute + a b c n = + let[@loop never] rec loop i a b c n = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + (loop [@unrolled 1]) (i + 1) a b c n) + in + loop 0 a b c (2 * n) + +(* cannot vectorize for-loops *) +let[@inline never] [@local never] [@specialize never] add_arrays_for a b c n = + for i = 0 to n - 1 do + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i) + done + +(* cannot vectorize loops expressed using recursion *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec a b c n = + let rec loop i = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + loop (i + 1)) + in + loop 0 + +let print_array ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%d " a.(i) + done + +let print_floatarray ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%f " a.(i) + done + +let () = + let n = Sys.opaque_identity 10 in + let a = Array.init n (fun i -> i) in + let b = Array.make n 17 in + let c = Array.make n 0 in + let d = Array.make n 0.0 in + add_arrays_unrolled_manually a b c (Sys.opaque_identity n); + Format.printf "add_arrays_unrolled_manually %a\n" print_array c; + add_arrays_unrolled_safe a b c (Sys.opaque_identity (n / 2)); + Format.printf "add_arrays_unrolled_safe %a\n" print_array c; + add_arrays_rec_unrolled_attribute a b c (n / 2); + Format.printf "add_arrays_rec_unrolled_attribute %a\n" print_array c; + add_arrays_for a b c n; + Format.printf "add_arrays_for %a\n" print_array c; + add_arrays_rec a b c n; + Format.printf "add_arrays_rec %a\n" print_array c; + initialize_array_const_unrolled_manually c n; + Format.printf "initialize_array_const_unrolled_manually %a\n" print_array c; + initialize_arrays_const_unrolled_manually a b c n; + Format.printf "initialize_arrays_const_unrolled_manually %a\n" print_array c; + initialize_array_unrolled_manually c n (Sys.opaque_identity 17); + Format.printf "initialize_array_unrolled_manually %a\n" print_array c; + initialize_floatarray_unrolled_manually d n (Sys.opaque_identity 7.7); + Format.printf "initialize_floatarray_unrolled_manually %a\n" print_floatarray + d; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.mli b/flambda-backend/tests/backend/vectorizer/test_arrays.mli new file mode 100644
index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..182c1cc7309 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 7 vector instructions, cost = -1 (Test_arrays_vectorized.add_arrays_unrolled_manually) diff --git a/flambda-backend/tests/backend/vectorizer/test_float.expected b/flambda-backend/tests/backend/vectorizer/test_float.expected new file mode 100644 index 00000000000..00ffe66d5e1 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_t4 { d0 = 88.000000 ; d1 = 110.000000; d2 = 88.000000 ; d3 = 110.000000 } +copy_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 80.000000 ; d3 = 14.000000 } +dup_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 8.000000 ; d3 = 96.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float.ml b/flambda-backend/tests/backend/vectorizer/test_float.ml new file mode 100644 index 00000000000..1e36c686ceb --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.ml @@ -0,0 +1,75 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +type t1 = + { mutable d0 : float; + mutable d1 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record + (a : t1) (b : t1) (c : t1) : t1 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record + (a : t1) (b : t1) : t1 = + b.d0 <- a.d0; + b.d1 <- a.d1; + b + +let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh + (a : t1) (b : t1) : t1 = + { d0 = Float.add a.d0 b.d0; d1 = Float.add a.d1 b.d1 } + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh + (a : t1) : t1 = + { d0 = a.d0; d1 = a.d1 } + +type t4 = + { mutable d0 : float; + mutable d1 : float; + mutable d2 : float; + mutable d3 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record_t4 + (a : t1) (b : t1) (c : t4) : t4 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c.d2 <- Float.add a.d0 b.d0; + c.d3 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4 + (a : t1) (b : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 } + +let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4 + (a : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 } + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f }" t1.d0 t1.d1 + +let print_t4 ppf (t4 : t4) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" t4.d0 t4.d1 + t4.d2 t4.d3 + +let () = + let a = { d0 = 8.; d1 = 96. } in + let b = { d0 = 80.; d1 = 14. } in + let c = { d0 = 10.; d1 = -10. } in + let t4 = { d0 = 10.; d1 = -10.; d2 = 199.; d3 = 18. } in + let res = { d0 = 0.; d1 = -0. 
} in + Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c); + Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res); + Format.printf "add_mutable_record_fresh %a\n" print_t1 + (add_mutable_record_fresh a b); + Format.printf "copy_mutable_record_fresh %a\n" print_t1 + (copy_mutable_record_fresh c); + Format.printf "add_mutable_record_t4 %a\n" print_t4 + (add_mutable_record_t4 a b t4); + Format.printf "copy_mutable_record_t4 %a\n" print_t4 + (copy_mutable_record_t4 a b); + Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a); + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float.mli b/flambda-backend/tests/backend/vectorizer/test_float.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected new file mode 100644 index 00000000000..92c4b798f9d --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected @@ -0,0 +1,5 @@ +add_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_bytes 10. 10. 10. 10. +copy_bytes_pos 10. 10. 10. 10. +copy_bytes_pos_v2 10. 10. 10. 10. diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml new file mode 100644 index 00000000000..ea552f169e2 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml @@ -0,0 +1,225 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float32 = struct + type t = float32 + + external add : (t[@local_opt]) -> (t[@local_opt]) -> (t[@local_opt]) + = "%addfloat32" + + external format : string -> t -> string = "caml_format_float32" + + let to_string f = Stdlib.valid_float_lexem (format "%.9g" f) + + module Bytes = struct + external get : bytes -> pos:int -> float32 = "%caml_bytes_getf32" + external unsafe_get : bytes -> pos:int -> float32 = "%caml_bytes_getf32u" + external set : bytes -> pos:int -> float32 -> unit = "%caml_bytes_setf32" + + external unsafe_set : bytes -> pos:int -> float32 -> unit + = "%caml_bytes_setf32u" + end +end + +module Float32_u = struct + type t = float32# + + external to_float32 : t -> (float32[@local_opt]) = "%box_float32" [@@warning "-187"] + + external of_float32 : (float32[@local_opt]) -> t = "%unbox_float32" [@@warning "-187"] + + let[@inline always] add x y = of_float32 (Float32.add (to_float32 x) (to_float32 y)) + + module Bytes = struct + let get bytes ~pos = of_float32 (Float32.Bytes.get bytes ~pos) + let unsafe_get bytes ~pos = of_float32 (Float32.Bytes.unsafe_get bytes ~pos) + let set bytes ~pos x = Float32.Bytes.set bytes ~pos (to_float32 x) + let unsafe_set bytes ~pos x = Float32.Bytes.unsafe_set bytes ~pos (to_float32 x) + end +end + +type t1 = { mutable d0 : float32# ; + mutable d1: float32#; mutable d2: float32#; mutable d3: float32# } + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. 
*) +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + b.d2 <- a.d2; + b.d3 <- a.d3; + () + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float32_u.add a.d0 b.d0; + c.d1 <- Float32_u.add a.d1 b.d1; + c.d2 <- Float32_u.add a.d2 b.d2; + c.d3 <- Float32_u.add a.d3 b.d3; + c + +(* [Float32_u.Bytes] contain packed float32_u, can vectorize. *) +let[@inline never] [@local never] [@specialize never] copy_bytes a b = + let pos = 0 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline never] [@local never] [@specialize never] init_bytes b x = + let pos = 0 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline always] copy_float32_unboxed_pos a b ~pos = + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +(* Currently can't vectorize because [pos] untagging is repeated and the current + heuristic for detecting relations between pointers is not strong enough to + handle this case. *) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos a b pos = + copy_float32_unboxed_pos a b ~pos; + copy_float32_unboxed_pos a b ~pos:(pos+1*4); + copy_float32_unboxed_pos a b ~pos:(pos+2*4); + copy_float32_unboxed_pos a b ~pos:(pos+3*4); + () + +(* 128: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) I/67 := pos:I/63 + * (id:12) I/67 := I/67 + 8 + * (id:13) prim:I/68 := I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) I/71 := pos:I/63 + * (id:19) I/71 := I/71 + 16 + * (id:20) prim:I/72 := I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := pos:I/63 + * (id:26) I/75 := I/75 + 24 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + +(* Currently, can't vectorize because the index is untagged before every memory access, + instead of operating on untagged indexes throughout. 
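The listing below shows this: every load/store is preceded by its own untagging step ([prim := ... >>s 1]) on a freshly recomputed index.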
*) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos_v2 a b pos = + let i0 = pos in + copy_float32_unboxed_pos a b ~pos:i0; + let i1 = i0 + 4 in + copy_float32_unboxed_pos a b ~pos:i1; + let i2 = i1 + 4 in + copy_float32_unboxed_pos a b ~pos:i2; + let i3 = i2 + 4 in + copy_float32_unboxed_pos a b ~pos:i3; + () + +(* 177: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) i1:I/67 := pos:I/63 + * (id:12) i1:I/67 := i1:I/67 + 8 + * (id:13) prim:I/68 := i1:I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) i2:I/71 := i1:I/67 + * (id:19) i2:I/71 := i2:I/71 + 8 + * (id:20) prim:I/72 := i2:I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := i2:I/71 + * (id:26) I/75 := I/75 + 8 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + + +let print_t1 ppf (t1 : t1) = + (* CR gyorsh: how to print Float32? *) + let to_string f = (Float32_u.to_float32 f |> Float32.to_string) in + Format.fprintf ppf "{ d0 = %s ; d1 = %s; d2 = %s ; d3 = %s }" + (to_string t1.d0) + (to_string t1.d1) + (to_string t1.d2) + (to_string t1.d3) + +let create_s length = + String.init length (fun i -> i * 7 mod 256 |> char_of_int) +;; + +let create_b length = create_s length |> Bytes.of_string + +let print_b ~len ppf b = + for i = 0 to len-1 do + Format.fprintf ppf "%s " + (Float32_u.to_float32 (Float32_u.Bytes.get b ~pos:(i*4)) |> Float32.to_string) + done + +let () = + let a = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.5s } in + let b = { d0 = #80.s; d1 = #14.s; d2 = #0.s; d3 = -#0.5s } in + let c = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.s } in + let res = { d0 = #0.s; d1 = -#10.s; d2 = #1.s; d3 = -#1.s } in + Format.printf "add_unboxed_pairs_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_unboxed_pairs_mutable_record %a\n" print_t1 res; + let b1 = create_b 16 in + let b2 = create_b 16 in + init_bytes b1 #10.s; + init_bytes b2 #0.s; + copy_bytes b1 b2; + Format.printf "copy_bytes %a\n" (print_b ~len:4) b2; + copy_bytes_pos b2 b1 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos %a\n" (print_b ~len:4) b2; + copy_bytes_pos_v2 b1 b2 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos_v2 %a\n" (print_b ~len:4) b2; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected 
b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..3178ac03fb8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_float32_unboxed_vectorized.copy_bytes) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected new file mode 100644 index 00000000000..bfea42ed769 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected @@ -0,0 +1,2 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 0.000000 ; d3 = -1.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 1.000000 ; d3 = -1.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml new file mode 100644 index 00000000000..a49aaf0b841 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml @@ -0,0 +1,80 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float_u = struct + type t = float# + + external to_float : t -> (float[@local_opt]) = "%box_float" [@@warning "-187"] + + external of_float : (float[@local_opt]) -> t = "%unbox_float" [@@warning "-187"] + + let[@inline always] add x y = of_float (Float.add (to_float x) (to_float y)) +end + +type t1 = { mutable d0: float#; + mutable d1: float#; + mutable d2: float#; + mutable d3: float# + } + + +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + () + +(* Currently, can't vectorize because of the specific floatmem operation (looks like + it is treated overly conservatively. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float_u.add a.d0 b.d0; + c.d1 <- Float_u.add a.d1 b.d1; + c.d2 <- Float_u.add a.d2 b.d2; + c.d3 <- Float_u.add a.d3 b.d3; + c + +(* +102: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) c:V/63 := R:I/2[%rdi] +(id:6) F/64 := float64 mut[a:V/61] +(id:7) F/65 := F/64 +(id:8) F/65 := F/65 +f float64[b:V/62] +(id:9) float64[c:V/63] := F/65 (assign) +(id:10) Psetufloatfield:I/66 := 1 +(id:11) F/67 := float64 mut[a:V/61 + 8] +(id:12) F/68 := F/67 +(id:13) F/68 := F/68 +f float64[b:V/62 + 8] +(id:14) float64[c:V/63 + 8] := F/68 (assign) +(id:15) Psetufloatfield:I/69 := 1 +(id:16) F/70 := float64 mut[a:V/61 + 16] +(id:17) F/71 := F/70 +(id:18) F/71 := F/71 +f float64[b:V/62 + 16] +(id:19) float64[c:V/63 + 16] := F/71 (assign) +(id:20) Psetufloatfield:I/72 := 1 +(id:21) F/73 := float64 mut[a:V/61 + 24] +(id:22) F/74 := F/73 +(id:23) F/74 := F/74 +f float64[b:V/62 + 24] +(id:24) float64[c:V/63 + 24] := F/74 (assign) +(id:25) Psetufloatfield:I/75 := 1 +(id:26) R:I/0[%rax] := c:V/63 +(id:27) Return R:I/0[%rax] + +*) + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" + (Float_u.to_float t1.d0) + (Float_u.to_float t1.d1) + (Float_u.to_float t1.d2) + (Float_u.to_float t1.d3) + +let () = + let a = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0.5 } in + let b = { d0 = #80.; d1 = #14.; d2 = #0.; d3 = -#0.5 } in + let c = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0. } in + let res = { d0 = #0.; d1 = -#10.; d2 = #1.; d3 = -#1. 
} in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_mutable_record %a\n" print_t1 res; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli new file mode 100644 index 00000000000..5b909d90a8c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..357dba19d99 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,2 @@ +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_unboxed_vectorized.copy_mutable_record) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_unboxed_vectorized.add_mutable_record) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected new file mode 100644 index 00000000000..dc486848738 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected @@ -0,0 +1,7 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record_fresh) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record_fresh) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_vectorized.add_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.copy_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.dup_mutable_record_t4) diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected new file mode 100644 index 00000000000..0207ed6b915 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88 ; d1 = 110; d2 = -40 ; d3 = -100 } +copy_array_four 30 30 30 30 +copy_array_index_four 30 30 30 30 +add_array_from_start 60 60 60 60 +copy_array_index_from_start 60 60 60 60 +copy_array_from_start 60 60 60 60 +copy_array_from_start_v2 60 60 60 60 diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml new file mode 100644 index 00000000000..b45eaa57769 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml @@ -0,0 +1,229 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module 
Int32_u = struct + type t = int32# + + external to_int32 : t -> (int32[@local_opt]) = "%box_int32" [@@warning "-187"] + + external of_int32 : (int32[@local_opt]) -> t = "%unbox_int32" [@@warning "-187"] + + let[@inline always] add x y = of_int32 (Int32.add (to_int32 x) (to_int32 y)) + + module Array = struct + external unsafe_create : ('a : bits32). int -> 'a array = + "caml_make_unboxed_int32_vect_bytecode" "caml_make_unboxed_int32_vect" + external unsafe_get: ('a : bits32). 'a array -> int -> 'a = "%array_unsafe_get" + external unsafe_set: ('a : bits32). 'a array -> int -> 'a -> unit = "%array_unsafe_set" + + module Index = struct + external unsafe_get + : ('a : bits32). + ('a array) -> t -> 'a + = "%array_unsafe_get_indexed_by_int32#" + + external unsafe_set + : ('a : bits32). + 'a array -> t -> 'a -> unit + = "%array_unsafe_set_indexed_by_int32#" + end + end + +end + +type t1 = { mutable d0 : int32# ; mutable d1: int32#; mutable d2: int32#; mutable d3: int32# } + +(* Currently, can't vectorize because not adjacent and have an unnecessary sign extension. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Int32_u.add a.d0 b.d0; + c.d1 <- Int32_u.add a.d1 b.d1; + c.d2 <- Int32_u.add a.d2 b.d2; + c.d3 <- Int32_u.add a.d3 b.d3; + c + +let[@inline always] copy_array_one (a : Int32_u.t array) + (b : Int32_u.t array) pos = + let x = Int32_u.Array.unsafe_get a pos in + Int32_u.Array.unsafe_set b pos x + +(* The accesses are adjacent but the use of [int] typed index results in a convoluted + index computation that is not yet handled by the current heuristics. *) +let[@inline never] [@local never][@specialize never] copy_array_four (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + copy_array_one a b pos; + copy_array_one a b (pos+1); + copy_array_one a b (pos+2); + copy_array_one a b (pos+3); + () + +(* + +114: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) pos:I/63 := R:I/2[%rdi] +(id:6) new_value:I/64 := signed int32 mut[a:V/61 + pos:I/63 * 2 + 6] +(id:7) signed int32[b:V/62 + pos:I/63 * 2 + 6] := new_value:I/64 (assign) +(id:8) Parraysetu:I/65 := 1 +(id:9) Paddint:I/66 := pos:I/63 +(id:10) Paddint:I/66 := Paddint:I/66 + 2 +(id:11) new_value:I/67 := signed int32 mut[a:V/61 + Paddint:I/66 * 2 + 6] +(id:12) signed int32[b:V/62 + Paddint:I/66 * 2 + 6] := new_value:I/67 (assign) +(id:13) Parraysetu:I/68 := 1 +(id:14) Paddint:I/69 := pos:I/63 +(id:15) Paddint:I/69 := Paddint:I/69 + 4 +(id:16) new_value:I/70 := signed int32 mut[a:V/61 + Paddint:I/69 * 2 + 6] +(id:17) signed int32[b:V/62 + Paddint:I/69 * 2 + 6] := new_value:I/70 (assign) +(id:18) Parraysetu:I/71 := 1 +(id:19) Paddint:I/72 := pos:I/63 +(id:20) Paddint:I/72 := Paddint:I/72 + 6 +(id:21) new_value:I/73 := signed int32 mut[a:V/61 + Paddint:I/72 * 2 + 6] +(id:22) signed int32[b:V/62 + Paddint:I/72 * 2 + 6] := new_value:I/73 (assign) +(id:23) Parraysetu:I/74 := 1 +(id:24) I/75 := 1 +(id:25) R:I/0[%rax] := I/75 +(id:26) Return R:I/0[%rax] + +*) + +let[@inline never] [@local never][@specialize never] copy_array_four_v2 (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + let i0 = pos in + copy_array_one a b i0; + let i1 = i0 + 1 in + copy_array_one a b i1; + let i2 = i1 + 1 in + copy_array_one a b i2; + let i3 = i2 + 1 in + copy_array_one a b i3; + () + +let[@inline always] copy_array_index_one (a : Int32_u.t array) + (b : Int32_u.t array) (pos : Int32_u.t) = + let x = Int32_u.Array.Index.unsafe_get a pos in + Int32_u.Array.Index.unsafe_set b pos x 
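+ +(* [copy_array_index_one] copies a single element through the [Int32_u.t]-indexed accessors above; the wrappers below unroll it four times so that the loads and stores touch adjacent 4-byte elements. Whether each unrolled wrapper vectorizes depends on how its index arithmetic is expressed, as the comments on the individual wrappers explain. *)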
+
+(* Can't vectorize it! The accesses are adjacent and we use [Int32_u.t] as the index,
+   but the compiler tags the index before using it! This index computation is not
+   yet handled by the vectorizer's heuristics. *)
+let[@inline never] [@local never][@specialize never] copy_array_index_four (a : Int32_u.t array)
+    (b : Int32_u.t array) ~pos =
+  copy_array_index_one a b pos;
+  copy_array_index_one a b (Int32_u.add pos #1l);
+  copy_array_index_one a b (Int32_u.add pos #2l);
+  copy_array_index_one a b (Int32_u.add pos #3l);
+  ()
+
+let[@inline never] [@local never][@specialize never] copy_array_index_from_start (a : Int32_u.t array)
+    (b : Int32_u.t array) =
+  let pos = #0l in
+  copy_array_index_one a b pos;
+  copy_array_index_one a b (Int32_u.add pos #1l);
+  copy_array_index_one a b (Int32_u.add pos #2l);
+  copy_array_index_one a b (Int32_u.add pos #3l);
+  ()
+
+let[@inline never] [@local never][@specialize never] copy_array_from_start (a : Int32_u.t array)
+    (b : Int32_u.t array) =
+  let[@inline always] copy pos =
+    let x = Int32_u.Array.unsafe_get a pos in
+    Int32_u.Array.unsafe_set b pos x
+  in
+  let pos = 0 in
+  copy pos;
+  copy (pos+1);
+  copy (pos+2);
+  copy (pos+3);
+  ()
+
+(* Can't vectorize because of an unnecessary sign extension. The heuristics in the
+   vectorizer can be extended to handle this case. *)
+let[@inline never] [@local never][@specialize never] add_array_from_start (a : Int32_u.t array) (b : Int32_u.t array) =
+  let[@inline always] add pos =
+    let x = Int32_u.Array.unsafe_get a pos in
+    let y = Int32_u.Array.unsafe_get b pos in
+    Int32_u.Array.unsafe_set b pos (Int32_u.add x y)
+  in
+  let pos = 0 in
+  add pos;
+  add (pos+1);
+  add (pos+2);
+  add (pos+3);
+  ()
+
+(*
+camlTest7__add_array_from_start_7_22_code(R:I/0[%rax] R:I/1[%rbx]) {test7.ml:112,74-379}
+  a:V/61 := R:I/0[%rax]
+  b:V/62 := R:I/1[%rbx]
+  I/63 := signed int32 mut[b:V/62 + 8]{test7.ml:119,2-9;test7.ml:115,12-42}
+  I/64 := signed int32 mut[a:V/61 + 8]{test7.ml:119,2-9;test7.ml:114,12-42}
+  I/65 := I/64
+  I/65 := I/65 + I/63{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/66 := sextend32 I/65{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 8] := new_value:I/66 (assign){test7.ml:119,2-9;test7.ml:116,4-52}
+  Parraysetu:I/67 := 1
+  I/68 := signed int32 mut[b:V/62 + 12]{test7.ml:120,2-13;test7.ml:115,12-42}
+  I/69 := signed int32 mut[a:V/61 + 12]{test7.ml:120,2-13;test7.ml:114,12-42}
+  I/70 := I/69
+  I/70 := I/70 + I/68{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/71 := sextend32 I/70{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 12] := new_value:I/71 (assign){test7.ml:120,2-13;test7.ml:116,4-52}
+  Parraysetu:I/72 := 1
+  I/73 := signed int32 mut[b:V/62 + 16]{test7.ml:121,2-13;test7.ml:115,12-42}
+  I/74 := signed int32 mut[a:V/61 + 16]{test7.ml:121,2-13;test7.ml:114,12-42}
+  I/75 := I/74
+  I/75 := I/75 + I/73{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/76 := sextend32 I/75{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 16] := new_value:I/76 (assign){test7.ml:121,2-13;test7.ml:116,4-52}
+  Parraysetu:I/77 := 1
+  I/78 := signed int32 mut[b:V/62 + 20]{test7.ml:122,2-13;test7.ml:115,12-42}
+  I/79 := signed int32 mut[a:V/61 + 20]{test7.ml:122,2-13;test7.ml:114,12-42}
+  I/80 := I/79
+  I/80 := I/80 + I/78{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/81 := sextend32 I/80{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 20] := new_value:I/81 (assign){test7.ml:122,2-13;test7.ml:116,4-52}
+  Parraysetu:I/82 := 1
+  I/83 := 1
+  R:I/0[%rax] := I/83
+  return R:I/0[%rax]
+*)
+let print_t1 ppf (t1 : t1) =
+  Format.fprintf ppf "{ d0 = %ld ; d1 = %ld; d2 = %ld ; d3 = %ld }"
+    (Int32_u.to_int32 t1.d0)
+    (Int32_u.to_int32 t1.d1)
+    (Int32_u.to_int32 t1.d2)
+    (Int32_u.to_int32 t1.d3)
+
+let print_array ~len ppf ( a : Int32_u.t array)=
+  for i = 0 to len - 1 do
+    let x = Int32_u.Array.unsafe_get a i in
+    Format.fprintf ppf "%ld " (x |> Int32_u.to_int32)
+  done
+
+let create_array ~len ~init =
+  let arr = Int32_u.Array.unsafe_create len in
+  for i = 0 to len-1 do
+    Int32_u.Array.unsafe_set arr i init
+  done;
+  arr
+
+let () =
+  let a = { d0 = #8l; d1 = #96l; d2 = -#10l; d3 = #0l } in
+  let b = { d0 = #80l; d1 = #14l; d2 = -#30l; d3 = -#100l } in
+  let c = { d0 = #8l; d1 = #96l; d2 = #0l; d3 = #0l } in
+  Format.printf "add_mutable_record %a\n" print_t1
+    (add_mutable_record a b c);
+  let ar1 = create_array ~len:4 ~init:#30l in
+  let ar2 = create_array ~len:4 ~init:#0l in
+  copy_array_four ar1 ar2 ~pos:0;
+  Format.printf "copy_array_four %a\n" (print_array ~len:4) ar2;
+  copy_array_index_four ar2 ar1 ~pos:#0l;
+  Format.printf "copy_array_index_four %a\n" (print_array ~len:4) ar1;
+  add_array_from_start ar1 ar2;
+  Format.printf "add_array_from_start %a\n" (print_array ~len:4) ar2;
+  copy_array_index_from_start ar2 ar1;
+  Format.printf "copy_array_index_from_start %a\n" (print_array ~len:4) ar1;
+  copy_array_from_start ar1 ar2;
+  Format.printf "copy_array_from_start %a\n" (print_array ~len:4) ar2;
+  copy_array_four_v2 ar1 ar2 ~pos:0;
+  Format.printf "copy_array_from_start_v2 %a\n" (print_array ~len:4) ar2;
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli
new file mode 100644
index 00000000000..5b909d90a8c
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 00000000000..fef3d590f81
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_four_v2)
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_index_from_start)
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_from_start)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.expected b/flambda-backend/tests/backend/vectorizer/test_int64.expected
new file mode 100644
index 00000000000..21d3934339d
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.expected
@@ -0,0 +1,7 @@
+add_mutable_record { d0 = 88 ; d1 = 110 }
+copy_mutable_record { d0 = 88 ; d1 = 110 }
+add_mutable_record_fresh { d0 = 88 ; d1 = 110 }
+copy_mutable_record_fresh { d0 = 88 ; d1 = 110 }
+add_mutable_record_t4 { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 }
+copy_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 80 ; d3 = 14 }
+dup_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 8 ; d3 = 96 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.ml b/flambda-backend/tests/backend/vectorizer/test_int64.ml
new file mode 100644
index 00000000000..95603dd7773
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.ml
@@ -0,0 +1,79 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+type t1 =
+  { mutable d0 : int64;
+    mutable d1 : int64
+  }
+
+(* Can't vectorize because int64 values are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record
+    (a : t1) (b : t1) (c : t1) : t1 =
+  c.d0 <- Int64.add a.d0 b.d0;
+  c.d1 <- Int64.add a.d1 b.d1;
+  c
+
+(* Can't vectorize because the memory write requires [caml_modify]. *)
+let[@inline never] [@local never] [@specialize never] copy_mutable_record
+    (a : t1) (b : t1) : t1 =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  b
+
+(* Can't vectorize because int64 values are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh
+    (a : t1) (b : t1) : t1 =
+  { d0 = Int64.add a.d0 b.d0; d1 = Int64.add a.d1 b.d1 }
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh
+    (a : t1) : t1 =
+  { d0 = a.d0; d1 = a.d1 }
+
+type t4 =
+  { mutable d0 : int64;
+    mutable d1 : int64;
+    mutable d2 : int64;
+    mutable d3 : int64
+  }
+
+(* Can't vectorize because int64 values are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record_t4
+    (a : t1) (b : t1) (c : t4) : t4 =
+  c.d0 <- Int64.add a.d0 b.d0;
+  c.d1 <- Int64.add a.d1 b.d1;
+  c.d2 <- Int64.add a.d0 b.d0;
+  c.d3 <- Int64.add a.d1 b.d1;
+  c
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4
+    (a : t1) (b : t1) : t4 =
+  { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 }
+
+let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4
+    (a : t1) : t4 =
+  { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 }
+
+let print_t1 ppf (t1 : t1) =
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" t1.d0 t1.d1
+
+let print_t4 ppf (t4 : t4) =
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" t4.d0 t4.d1
+    t4.d2 t4.d3
+
+let () =
+  let a = { d0 = 8L; d1 = 96L } in
+  let b = { d0 = 80L; d1 = 14L } in
+  let c = { d0 = 10L; d1 = -10L } in
+  let t4 = { d0 = 10L; d1 = -10L; d2 = 199L; d3 = 18L } in
+  let res = { d0 = 0L; d1 = -0L } in
+  Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c);
+  Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res);
+  Format.printf "add_mutable_record_fresh %a\n" print_t1
+    (add_mutable_record_fresh a b);
+  Format.printf "copy_mutable_record_fresh %a\n" print_t1
+    (copy_mutable_record_fresh c);
+  Format.printf "add_mutable_record_t4 %a\n" print_t4
+    (add_mutable_record_t4 a b t4);
+  Format.printf "copy_mutable_record_t4 %a\n" print_t4
+    (copy_mutable_record_t4 a b);
+  Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a);
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.mli b/flambda-backend/tests/backend/vectorizer/test_int64.mli
new file mode 100644
index 00000000000..5b909d90a8c
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected
new file mode 100644
index 00000000000..68b6515c901
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected
@@ -0,0 +1,3 @@
+add_mutable_record { d0 = 88 ; d1 = 110 }
+copy_mutable_record { d0 = 88 ; d1 = 110 }
+add_fours_mutable_record { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml
new file mode 100644
index 00000000000..d9371e65e8f
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml
@@ -0,0 +1,61 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+module Int64_u = struct
+  type t = int64#
+
+  external to_int64 : t -> (int64[@local_opt]) = "%box_int64" [@@warning "-187"]
+
+  external of_int64 : (int64[@local_opt]) -> t = "%unbox_int64" [@@warning "-187"]
+
+  let[@inline always] add x y = of_int64 (Int64.add (to_int64 x) (to_int64 y))
+end
+
+type t1 = { mutable d0 : int64# ; mutable d1: int64# }
+
+let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 =
+  c.d0 <- Int64_u.add a.d0 b.d0;
+  c.d1 <- Int64_u.add a.d1 b.d1;
+  c
+
+let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  ()
+
+type t2 = {
+  mutable d0 : int64# ;
+  mutable d1: int64# ;
+  mutable d2: int64# ;
+  mutable d3: int64# }
+
+let[@inline never] [@local never][@specialize never] add_fours_mutable_record (a : t1) (b: t1) (c : t2) : unit =
+  c.d0 <- Int64_u.add a.d0 b.d0;
+  c.d1 <- Int64_u.add a.d1 b.d1;
+  c.d2 <- Int64_u.add a.d0 b.d0;
+  c.d3 <- Int64_u.add a.d1 b.d1;
+  ()
+
+let print_t1 ppf (t1 : t1) =
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" (Int64_u.to_int64 t1.d0)
+    (Int64_u.to_int64 t1.d1)
+
+let print_t4 ppf (t2 : t2) =
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }"
+    (Int64_u.to_int64 t2.d0)
+    (Int64_u.to_int64 t2.d1)
+    (Int64_u.to_int64 t2.d2)
+    (Int64_u.to_int64 t2.d3)
+
+let () =
+  let a = { d0 = #8L; d1 = #96L } in
+  let b = { d0 = #80L; d1 = #14L } in
+  let c = { d0 = #8L; d1 = #96L } in
+  let d = { d0 = #0L; d1 = #0L; d2 = #0L; d3 = #0L } in
+  let res = { d0 = #0L; d1 = -#10L } in
+  Format.printf "add_mutable_record %a\n" print_t1
+    (add_mutable_record a b c);
+  copy_mutable_record c res;
+  Format.printf "copy_mutable_record %a\n" print_t1 res;
+  add_fours_mutable_record a b d;
+  Format.printf "add_fours_mutable_record %a\n" print_t4 d;
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli
new file mode 100644
index 00000000000..5b909d90a8c
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 00000000000..61eea8dffce
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 5 groups, 10 scalar instructions, 5 vector instructions, cost = -5 (Test_int64_unboxed_vectorized.add_mutable_record)
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_unboxed_vectorized.copy_mutable_record)
+**** Vectorize selected computation: 10 groups, 20 scalar instructions, 10 vector instructions, cost = -10 (Test_int64_unboxed_vectorized.add_fours_mutable_record)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected
new file mode 100644
index 00000000000..6db1b67d70d
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_vectorized.copy_mutable_record_fresh)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.copy_mutable_record_t4)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.dup_mutable_record_t4)

From c914bf2638d0c203eb58368e9a18315ae419e3ce Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Thu, 9 Jan 2025 14:16:06 +0000
Subject: [PATCH 2/2] Disable ocamlformat on unboxed tests

---
 flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore

diff --git a/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore
new file mode 100644
index 00000000000..7ddec40dced
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore
@@ -0,0 +1,4 @@
+test_int64_unboxed.ml
+test_float_unboxed.ml
+test_int32_unboxed.ml
+test_float32_unboxed.ml