diff --git a/backend/vectorize_utils.ml b/backend/vectorize_utils.ml
index cb1b715127..7b0db46932 100644
--- a/backend/vectorize_utils.ml
+++ b/backend/vectorize_utils.ml
@@ -137,11 +137,13 @@ let vectorize_machtypes (pack : Reg.t list) : Cmm.machtype_component =
         Printreg.reglist pack;
     match hd.typ, List.length pack with
     | Addr, _ -> Misc.fatal_errorf "Unexpected machtype for %a" Printreg.reg hd
-    | (Int | Float), 2 | Float32, 4 ->
-      (* allows subregs, width should be correct by construction of [Group]. *)
+    | Float, 2 | Float32, 4 -> Vec128
+    | Int, _ ->
+      (* [Int] may be used for int32, so the pack size is not checked here;
+         the width should be correct by construction of [Group]. *)
       Vec128
     | Val, 2 -> Valx2
-    | (Val | Int | Float | Float32), n ->
+    | (Val | Float | Float32), n ->
       Misc.fatal_errorf "Unexpected pack size %d for %a" n Printreg.reglist pack
     | Vec128, _ | Valx2, _ ->
       Misc.fatal_errorf "Unexpected machtype for %a" Printreg.reg hd)
diff --git a/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore
new file mode 100644
index 0000000000..7ddec40dce
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore
@@ -0,0 +1,4 @@
+test_int64_unboxed.ml
+test_float_unboxed.ml
+test_int32_unboxed.ml
+test_float32_unboxed.ml
diff --git a/flambda-backend/tests/backend/vectorizer/dune.inc b/flambda-backend/tests/backend/vectorizer/dune.inc
index 67a51f80bb..1673ef0acb 100644
--- a/flambda-backend/tests/backend/vectorizer/dune.inc
+++ b/flambda-backend/tests/backend/vectorizer/dune.inc
@@ -4,7 +4,7 @@
  (enabled_if (= %{context_name} "main"))
  (targets test1_runner.exe test1.cmx.dump)
  (deps test1.mli test1.ml)
- (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test1_runner.exe)))
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test1_runner.exe)))
 
 (rule
  (alias   runtest)
@@ -37,7 +37,7 @@
  (enabled_if (= %{context_name} "main"))
  (targets test1_vectorized_runner.exe test1_vectorized.cmx.dump)
  (deps test1_vectorized.mli test1_vectorized.ml)
- (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test1_vectorized_runner.exe)))
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test1_vectorized_runner.exe)))
 
 (rule
  (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
@@ -75,12 +75,620 @@
  (action
    (diff test1_vectorized.expected test1_vectorized.output)))
 
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_arrays_runner.exe test_arrays.cmx.dump)
+ (deps test_arrays.mli test_arrays.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_arrays_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_arrays.output
+   (run ./test_arrays_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_arrays.expected test_arrays.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_arrays.ml test_arrays_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_arrays.mli test_arrays_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_arrays_vectorized_runner.exe test_arrays_vectorized.cmx.dump)
+ (deps test_arrays_vectorized.mli test_arrays_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_arrays_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_arrays_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_arrays_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_arrays_vectorized.cmx.dump.expected test_arrays_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_arrays_vectorized.output
+   (run ./test_arrays_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_arrays.expected test_arrays_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_arrays_vectorized.expected test_arrays_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int64_unboxed_runner.exe test_int64_unboxed.cmx.dump)
+ (deps test_int64_unboxed.mli test_int64_unboxed.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int64_unboxed_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int64_unboxed.output
+   (run ./test_int64_unboxed_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int64_unboxed.expected test_int64_unboxed.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64_unboxed.ml test_int64_unboxed_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64_unboxed.mli test_int64_unboxed_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int64_unboxed_vectorized_runner.exe test_int64_unboxed_vectorized.cmx.dump)
+ (deps test_int64_unboxed_vectorized.mli test_int64_unboxed_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int64_unboxed_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_int64_unboxed_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_int64_unboxed_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_int64_unboxed_vectorized.cmx.dump.expected test_int64_unboxed_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int64_unboxed_vectorized.output
+   (run ./test_int64_unboxed_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64_unboxed.expected test_int64_unboxed_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int64_unboxed_vectorized.expected test_int64_unboxed_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_float_unboxed_runner.exe test_float_unboxed.cmx.dump)
+ (deps test_float_unboxed.mli test_float_unboxed.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float_unboxed_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_float_unboxed.output
+   (run ./test_float_unboxed_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_float_unboxed.expected test_float_unboxed.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float_unboxed.ml test_float_unboxed_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float_unboxed.mli test_float_unboxed_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_float_unboxed_vectorized_runner.exe test_float_unboxed_vectorized.cmx.dump)
+ (deps test_float_unboxed_vectorized.mli test_float_unboxed_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float_unboxed_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_float_unboxed_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_float_unboxed_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_float_unboxed_vectorized.cmx.dump.expected test_float_unboxed_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_float_unboxed_vectorized.output
+   (run ./test_float_unboxed_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float_unboxed.expected test_float_unboxed_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_float_unboxed_vectorized.expected test_float_unboxed_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int64_runner.exe test_int64.cmx.dump)
+ (deps test_int64.mli test_int64.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int64_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int64.output
+   (run ./test_int64_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int64.expected test_int64.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64.ml test_int64_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64.mli test_int64_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int64_vectorized_runner.exe test_int64_vectorized.cmx.dump)
+ (deps test_int64_vectorized.mli test_int64_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int64_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_int64_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_int64_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_int64_vectorized.cmx.dump.expected test_int64_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int64_vectorized.output
+   (run ./test_int64_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int64.expected test_int64_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int64_vectorized.expected test_int64_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_float_runner.exe test_float.cmx.dump)
+ (deps test_float.mli test_float.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_float.output
+   (run ./test_float_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_float.expected test_float.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float.ml test_float_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float.mli test_float_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_float_vectorized_runner.exe test_float_vectorized.cmx.dump)
+ (deps test_float_vectorized.mli test_float_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_float_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_float_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_float_vectorized.cmx.dump.expected test_float_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_float_vectorized.output
+   (run ./test_float_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_float.expected test_float_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_float_vectorized.expected test_float_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (targets test_float32_unboxed_runner.exe test_float32_unboxed.cmx.dump)
+ (deps test_float32_unboxed.mli test_float32_unboxed.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float32_unboxed_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+  (with-outputs-to
+   test_float32_unboxed.output
+   (run ./test_float32_unboxed_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_float32_unboxed.expected test_float32_unboxed.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+  (copy test_float32_unboxed.ml test_float32_unboxed_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+  (copy test_float32_unboxed.mli test_float32_unboxed_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (targets test_float32_unboxed_vectorized_runner.exe test_float32_unboxed_vectorized.cmx.dump)
+ (deps test_float32_unboxed_vectorized.mli test_float32_unboxed_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float32_unboxed_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_float32_unboxed_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_float32_unboxed_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_float32_unboxed_vectorized.cmx.dump.expected test_float32_unboxed_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+  (with-outputs-to
+   test_float32_unboxed_vectorized.output
+   (run ./test_float32_unboxed_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+  (copy test_float32_unboxed.expected test_float32_unboxed_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_float32_unboxed_vectorized.expected test_float32_unboxed_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int32_unboxed_runner.exe test_int32_unboxed.cmx.dump)
+ (deps test_int32_unboxed.mli test_int32_unboxed.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int32_unboxed_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int32_unboxed.output
+   (run ./test_int32_unboxed_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int32_unboxed.expected test_int32_unboxed.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int32_unboxed.ml test_int32_unboxed_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int32_unboxed.mli test_int32_unboxed_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_int32_unboxed_vectorized_runner.exe test_int32_unboxed_vectorized.cmx.dump)
+ (deps test_int32_unboxed_vectorized.mli test_int32_unboxed_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int32_unboxed_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_int32_unboxed_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_int32_unboxed_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_int32_unboxed_vectorized.cmx.dump.expected test_int32_unboxed_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_int32_unboxed_vectorized.output
+   (run ./test_int32_unboxed_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_int32_unboxed.expected test_int32_unboxed_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_int32_unboxed_vectorized.expected test_int32_unboxed_vectorized.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_spill_valx2_runner.exe test_spill_valx2.cmx.dump)
+ (deps test_spill_valx2.mli test_spill_valx2.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_spill_valx2_runner.exe)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_spill_valx2.output
+   (run ./test_spill_valx2_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_spill_valx2.expected test_spill_valx2.output)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_spill_valx2.ml test_spill_valx2_vectorized.ml)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_spill_valx2.mli test_spill_valx2_vectorized.mli)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (targets test_spill_valx2_vectorized_runner.exe test_spill_valx2_vectorized.cmx.dump)
+ (deps test_spill_valx2_vectorized.mli test_spill_valx2_vectorized.ml)
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_spill_valx2_vectorized_runner.exe)))
+
+(rule
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (target test_spill_valx2_vectorized.cmx.dump.output)
+ (deps ./filter.sh test_spill_valx2_vectorized.cmx.dump)
+ (action
+  (with-outputs-to
+   %{target}
+   (with-accepted-exit-codes 0
+    (run %{deps})))))
+
+(rule
+ (alias runtest)
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
+ (action
+   (diff test_spill_valx2_vectorized.cmx.dump.expected test_spill_valx2_vectorized.cmx.dump.output)))
+
+(rule
+ (alias   runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (with-outputs-to
+   test_spill_valx2_vectorized.output
+   (run ./test_spill_valx2_vectorized_runner.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+  (copy test_spill_valx2.expected test_spill_valx2_vectorized.expected)))
+
+(rule
+ (alias runtest)
+ (enabled_if (= %{context_name} "main"))
+ (action
+   (diff test_spill_valx2_vectorized.expected test_spill_valx2_vectorized.output)))
+
 (rule
  (alias   runtest)
  (enabled_if (= %{context_name} "main"))
  (targets test_register_compatible_runner.exe test_register_compatible.cmx.dump)
  (deps test_register_compatible.mli test_register_compatible.ml)
- (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_register_compatible_runner.exe)))
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_register_compatible_runner.exe)))
 
 (rule
  (alias   runtest)
@@ -113,7 +721,7 @@
  (enabled_if (= %{context_name} "main"))
  (targets test_register_compatible_vectorized_runner.exe test_register_compatible_vectorized.cmx.dump)
  (deps test_register_compatible_vectorized.mli test_register_compatible_vectorized.ml)
- (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_register_compatible_vectorized_runner.exe)))
+ (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_register_compatible_vectorized_runner.exe)))
 
 (rule
  (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) )
diff --git a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml
index 53062d52b9..f6f4e7dc47 100644
--- a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml
+++ b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml
@@ -5,7 +5,7 @@ let enabled_if_main_amd64 =
 
 let flags =
   "-S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc \
-   cfg -extension simd"
+   cfg -extension simd -vectorize-max-block-size 1000"
 
 let runner name = name ^ "_runner.exe"
 
@@ -119,18 +119,15 @@ let filter_dump ~enabled_if ~exit_code name =
     (run %{deps})))))
 |}
 
-let copy_source_to_vectorize name =
-  copy_file ~enabled_if:enabled_if_main (name |> impl)
-    (name |> vectorized |> impl);
-  copy_file ~enabled_if:enabled_if_main (name |> intf)
-    (name |> vectorized |> intf)
+let copy_source_to_vectorize ~enabled_if name =
+  let copy ext = copy_file ~enabled_if (ext name) (ext (vectorized name)) in
+  List.iter copy [impl; intf]
 
-let compile_no_vectorizer name =
-  compile ~enabled_if:enabled_if_main ~extra_flags:"-no-vectorize" name
+let compile_no_vectorizer ~enabled_if name =
+  compile ~enabled_if ~extra_flags:"-no-vectorize" name
 
-let compile_with_vectorizer name =
-  compile ~enabled_if:enabled_if_main ~extra_flags:"-vectorize"
-    (vectorized name)
+let compile_with_vectorizer ~enabled_if name =
+  compile ~enabled_if ~extra_flags:"-vectorize" (vectorized name)
 
 let filter_vectorizer_dump ~enabled_if ~exit_code name =
   filter_dump ~enabled_if ~exit_code (name |> vectorized)
@@ -138,38 +135,44 @@ let filter_vectorizer_dump ~enabled_if ~exit_code name =
 let diff_vectorizer_dump ~enabled_if name =
   diff_output ~enabled_if (name |> vectorized |> cmx_dump)
 
-let run_no_vectorizer name = run ~enabled_if:enabled_if_main name
+let run_no_vectorizer ~enabled_if name = run ~enabled_if name
 
-let run_vectorized name = run ~enabled_if:enabled_if_main (name |> vectorized)
+let run_vectorized ~enabled_if name = run ~enabled_if (vectorized name)
 
-let diff_output_no_vectorizer name =
-  diff_output ~enabled_if:enabled_if_main name
+let diff_output_no_vectorizer ~enabled_if name = diff_output ~enabled_if name
 
-let diff_output_vectorized name =
-  diff_output ~enabled_if:enabled_if_main (name |> vectorized)
+let diff_output_vectorized ~enabled_if name =
+  diff_output ~enabled_if (vectorized name)
 
-let copy_expected_output name =
-  copy_file ~enabled_if:enabled_if_main (name |> expected)
-    (name |> vectorized |> expected)
+let copy_expected_output ~enabled_if name =
+  copy_file ~enabled_if (expected name) (expected (vectorized name))
 
-let print_test ?(filter_exit_code = 0) name =
+let print_test ?(enabled_if = enabled_if_main) ?(filter_exit_code = 0) name =
   (* check expected test output is up to date *)
-  compile_no_vectorizer name;
-  run_no_vectorizer name;
-  diff_output_no_vectorizer name;
+  compile_no_vectorizer ~enabled_if name;
+  run_no_vectorizer ~enabled_if name;
+  diff_output_no_vectorizer ~enabled_if name;
   (* vectorizer *)
-  copy_source_to_vectorize name;
-  compile_with_vectorizer name;
+  copy_source_to_vectorize ~enabled_if name;
+  compile_with_vectorizer ~enabled_if name;
   filter_vectorizer_dump name ~exit_code:filter_exit_code
     ~enabled_if:enabled_if_main_amd64;
   diff_vectorizer_dump name ~enabled_if:enabled_if_main_amd64;
-  run_vectorized name;
-  copy_expected_output name;
-  diff_output_vectorized name;
+  run_vectorized ~enabled_if name;
+  copy_expected_output ~enabled_if name;
+  diff_output_vectorized ~enabled_if name;
   ()
 
 let () =
   print_test "test1";
+  print_test "test_arrays";
+  print_test "test_int64_unboxed";
+  print_test "test_float_unboxed";
+  print_test "test_int64";
+  print_test "test_float";
+  print_test ~enabled_if:enabled_if_main_amd64 "test_float32_unboxed";
+  print_test "test_int32_unboxed";
+  print_test "test_spill_valx2";
   (* can't vectorize *)
   print_test ~filter_exit_code:1 "test_register_compatible";
   ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.expected b/flambda-backend/tests/backend/vectorizer/test_arrays.expected
new file mode 100644
index 0000000000..e86cd1806c
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_arrays.expected
@@ -0,0 +1,9 @@
+add_arrays_unrolled_manually 17 18 19 20 21 22 23 24 25 26 
+add_arrays_unrolled_safe 17 18 19 20 21 22 23 24 25 26 
+add_arrays_rec_unrolled_attribute 17 18 19 20 21 22 23 24 25 26 
+add_arrays_for 17 18 19 20 21 22 23 24 25 26 
+add_arrays_rec 17 18 19 20 21 22 23 24 25 26 
+initialize_array_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 
+initialize_arrays_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 
+initialize_array_unrolled_manually 17 17 17 17 17 17 17 17 17 17 
+initialize_floatarray_unrolled_manually 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 
diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.ml b/flambda-backend/tests/backend/vectorizer/test_arrays.ml
new file mode 100644
index 0000000000..106eaa5e5d
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_arrays.ml
@@ -0,0 +1,141 @@
+let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_manually
+    a b c n =
+  for i = 0 to (n / 2) - 1 do
+    Array.unsafe_set c (i * 2)
+      (Array.unsafe_get a (i * 2) + Array.unsafe_get b (i * 2));
+    Array.unsafe_set c
+      ((i * 2) + 1)
+      (Array.unsafe_get a ((i * 2) + 1) + Array.unsafe_get b ((i * 2) + 1))
+  done;
+  if Int.rem n 2 = 1
+  then
+    Array.unsafe_set c (n - 1)
+      (Array.unsafe_get a (n - 1) + Array.unsafe_get b (n - 1))
+
+(* Currently won't be vectorized. Can vectorize it but it's not worth it
+   according to our cost model. It will be vectorized when we add vectors beyond
+   128 or arrays of elements smaller than 64-bit. *)
+let[@inline never] [@local never] [@specialize never] initialize_array_const_unrolled_manually
+    arr n =
+  let i = ref 0 in
+  while !i < n do
+    Array.unsafe_set arr !i 0;
+    Array.unsafe_set arr (!i + 1) 0;
+    i := !i + 2
+  done
+
+(* Currently, won't be vectorized. If different groups can reuse the new
+   register that holds the constants, this will be worth vectorizing even with
+   128-bit vectors. *)
+let[@inline never] [@local never] [@specialize never] initialize_arrays_const_unrolled_manually
+    a b c n =
+  let i = ref 0 in
+  while !i < n do
+    Array.unsafe_set a !i 0;
+    Array.unsafe_set a (!i + 1) 0;
+    Array.unsafe_set b !i 0;
+    Array.unsafe_set b (!i + 1) 0;
+    Array.unsafe_set c !i 0;
+    Array.unsafe_set c (!i + 1) 0;
+    i := !i + 2
+  done
+
+(* Currently, won't be vectorized. Shuffling values into a vector is not yet
+   supported, only vector loads are. Also not worth it unless the shuffle is
+   outside the loop (loop invariant detection/motion would be needed for it). *)
+let[@inline never] [@local never] [@specialize never] initialize_array_unrolled_manually
+    arr n (v : int) =
+  let i = ref 0 in
+  while !i < n do
+    Array.unsafe_set arr !i v;
+    Array.unsafe_set arr (!i + 1) v;
+    i := !i + 2
+  done
+
+(* same as [initialize_array_unrolled_manually] except needs movddup. *)
+let[@inline never] [@local never] [@specialize never] initialize_floatarray_unrolled_manually
+    arr n (v : float) =
+  let i = ref 0 in
+  while !i < n do
+    Array.unsafe_set arr !i v;
+    Array.unsafe_set arr (!i + 1) v;
+    i := !i + 2
+  done
+
+(* cannot vectorize across basic blocks *)
+let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_safe a
+    b c n =
+  for i = 0 to n - 1 do
+    Array.set c (i * 2) (Array.get a (i * 2) + Array.get b (i * 2));
+    Array.set c
+      ((i * 2) + 1)
+      (Array.get a ((i * 2) + 1) + Array.get b ((i * 2) + 1))
+  done
+
+(* cannot vectorize across basic blocks. unroll attribute is not sufficient to
+   eliminate the loop condition from the unrolled body (e.g., we would need to
+   track the fact that the bound is even). *)
+let[@inline never] [@local never] [@specialize never] add_arrays_rec_unrolled_attribute
+    a b c n =
+  let[@loop never] rec loop i a b c n =
+    if i < n
+    then (
+      Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i);
+      (loop [@unrolled 1]) (i + 1) a b c n)
+  in
+  loop 0 a b c (2 * n)
+
+(* cannot vectorize across the iterations of a for-loop *)
+let[@inline never] [@local never] [@specialize never] add_arrays_for a b c n =
+  for i = 0 to n - 1 do
+    Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i)
+  done
+
+(* cannot vectorize loops expressed using recursion *)
+let[@inline never] [@local never] [@specialize never] add_arrays_rec a b c n =
+  let rec loop i =
+    if i < n
+    then (
+      Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i);
+      loop (i + 1))
+  in
+  loop 0
+
+let print_array ppf a =
+  let count = Array.length a in
+  for i = 0 to count - 1 do
+    Format.fprintf ppf "%d " a.(i)
+  done
+
+let print_floatarray ppf a =
+  let count = Array.length a in
+  for i = 0 to count - 1 do
+    Format.fprintf ppf "%f " a.(i)
+  done
+
+let () =
+  let n = Sys.opaque_identity 10 in
+  let a = Array.init n (fun i -> i) in
+  let b = Array.make n 17 in
+  let c = Array.make n 0 in
+  let d = Array.make n 0.0 in
+  add_arrays_unrolled_manually a b c (Sys.opaque_identity n);
+  Format.printf "add_arrays_unrolled_manually %a\n" print_array c;
+  add_arrays_unrolled_safe a b c (Sys.opaque_identity (n / 2));
+  Format.printf "add_arrays_unrolled_safe %a\n" print_array c;
+  add_arrays_rec_unrolled_attribute a b c (n / 2);
+  Format.printf "add_arrays_rec_unrolled_attribute %a\n" print_array c;
+  add_arrays_for a b c n;
+  Format.printf "add_arrays_for %a\n" print_array c;
+  add_arrays_rec a b c n;
+  Format.printf "add_arrays_rec %a\n" print_array c;
+  initialize_array_const_unrolled_manually c n;
+  Format.printf "initialize_array_const_unrolled_manually %a\n" print_array c;
+  initialize_arrays_const_unrolled_manually a b c n;
+  Format.printf "initialize_arrays_const_unrolled_manually %a\n" print_array c;
+  initialize_array_unrolled_manually c n (Sys.opaque_identity 17);
+  Format.printf "initialize_array_unrolled_manually %a\n" print_array c;
+  initialize_floatarray_unrolled_manually d n (Sys.opaque_identity 7.7);
+  Format.printf "initialize_floatarray_unrolled_manually %a\n" print_floatarray
+    d;
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.mli b/flambda-backend/tests/backend/vectorizer/test_arrays.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_arrays.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..182c1cc730
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected
@@ -0,0 +1 @@
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 7 vector instructions, cost = -1 (Test_arrays_vectorized.add_arrays_unrolled_manually)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float.expected b/flambda-backend/tests/backend/vectorizer/test_float.expected
new file mode 100644
index 0000000000..00ffe66d5e
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float.expected
@@ -0,0 +1,7 @@
+add_mutable_record { d0 = 88.000000 ; d1 = 110.000000 }
+copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000 }
+add_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 }
+copy_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 }
+add_mutable_record_t4 { d0 = 88.000000 ; d1 = 110.000000; d2 = 88.000000 ; d3 = 110.000000 }
+copy_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 80.000000 ; d3 = 14.000000 }
+dup_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 8.000000 ; d3 = 96.000000 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_float.ml b/flambda-backend/tests/backend/vectorizer/test_float.ml
new file mode 100644
index 0000000000..1e36c686ce
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float.ml
@@ -0,0 +1,75 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+type t1 =
+  { mutable d0 : float;
+    mutable d1 : float
+  }
+
+let[@inline never] [@local never] [@specialize never] add_mutable_record
+    (a : t1) (b : t1) (c : t1) : t1 =
+  c.d0 <- Float.add a.d0 b.d0;
+  c.d1 <- Float.add a.d1 b.d1;
+  c
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record
+    (a : t1) (b : t1) : t1 =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  b
+
+let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh
+    (a : t1) (b : t1) : t1 =
+  { d0 = Float.add a.d0 b.d0; d1 = Float.add a.d1 b.d1 }
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh
+    (a : t1) : t1 =
+  { d0 = a.d0; d1 = a.d1 }
+
+type t4 =
+  { mutable d0 : float;
+    mutable d1 : float;
+    mutable d2 : float;
+    mutable d3 : float
+  }
+
+let[@inline never] [@local never] [@specialize never] add_mutable_record_t4
+    (a : t1) (b : t1) (c : t4) : t4 =
+  c.d0 <- Float.add a.d0 b.d0;
+  c.d1 <- Float.add a.d1 b.d1;
+  c.d2 <- Float.add a.d0 b.d0;
+  c.d3 <- Float.add a.d1 b.d1;
+  c
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4
+    (a : t1) (b : t1) : t4 =
+  { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 }
+
+let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4
+    (a : t1) : t4 =
+  { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 }
+
+let print_t1 ppf (t1 : t1) =
+  Format.fprintf ppf "{ d0 = %f ; d1 = %f }" t1.d0 t1.d1
+
+let print_t4 ppf (t4 : t4) =
+  Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" t4.d0 t4.d1
+    t4.d2 t4.d3
+
+let () =
+  let a = { d0 = 8.; d1 = 96. } in
+  let b = { d0 = 80.; d1 = 14. } in
+  let c = { d0 = 10.; d1 = -10. } in
+  let t4 = { d0 = 10.; d1 = -10.; d2 = 199.; d3 = 18. } in
+  let res = { d0 = 0.; d1 = -0. } in
+  Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c);
+  Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res);
+  Format.printf "add_mutable_record_fresh %a\n" print_t1
+    (add_mutable_record_fresh a b);
+  Format.printf "copy_mutable_record_fresh %a\n" print_t1
+    (copy_mutable_record_fresh c);
+  Format.printf "add_mutable_record_t4 %a\n" print_t4
+    (add_mutable_record_t4 a b t4);
+  Format.printf "copy_mutable_record_t4 %a\n" print_t4
+    (copy_mutable_record_t4 a b);
+  Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a);
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_float.mli b/flambda-backend/tests/backend/vectorizer/test_float.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected
new file mode 100644
index 0000000000..92c4b798f9
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected
@@ -0,0 +1,5 @@
+add_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. }
+copy_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. }
+copy_bytes 10. 10. 10. 10. 
+copy_bytes_pos 10. 10. 10. 10. 
+copy_bytes_pos_v2 10. 10. 10. 10. 
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml
new file mode 100644
index 0000000000..ea552f169e
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml
@@ -0,0 +1,225 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+module Float32 = struct
+  type t = float32
+
+  external add : (t[@local_opt]) -> (t[@local_opt]) -> (t[@local_opt])
+    = "%addfloat32"
+
+  external format : string -> t -> string = "caml_format_float32"
+
+  let to_string f = Stdlib.valid_float_lexem (format "%.9g" f)
+
+  module Bytes = struct
+    external get : bytes -> pos:int -> float32 = "%caml_bytes_getf32"
+    external unsafe_get : bytes -> pos:int -> float32 = "%caml_bytes_getf32u"
+    external set : bytes -> pos:int -> float32 -> unit = "%caml_bytes_setf32"
+
+    external unsafe_set : bytes -> pos:int -> float32 -> unit
+      = "%caml_bytes_setf32u"
+  end
+end
+
+module Float32_u = struct
+  type t = float32#
+
+  external to_float32 : t -> (float32[@local_opt]) = "%box_float32" [@@warning "-187"]
+
+  external of_float32 : (float32[@local_opt]) -> t = "%unbox_float32" [@@warning "-187"]
+
+  let[@inline always] add x y = of_float32 (Float32.add (to_float32 x) (to_float32 y))
+
+  module Bytes = struct
+    let get bytes ~pos = of_float32 (Float32.Bytes.get bytes ~pos)
+    let unsafe_get bytes ~pos = of_float32 (Float32.Bytes.unsafe_get bytes ~pos)
+    let set bytes ~pos x = Float32.Bytes.set bytes ~pos (to_float32 x)
+    let unsafe_set bytes ~pos x = Float32.Bytes.unsafe_set bytes ~pos (to_float32 x)
+  end
+end
+
+type t1 = { mutable d0 : float32# ;
+            mutable d1: float32#; mutable d2: float32#; mutable d3: float32#  }
+
+(* Not vectorized because float32 fields are not adjacent in a record, they are padded
+to 64-bits. *)
+let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  b.d2 <- a.d2;
+  b.d3 <- a.d3;
+  ()
+
+(* Not vectorized because float32 fields are not adjacent in a record, they are padded
+to 64-bits. *)
+let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 =
+  c.d0 <- Float32_u.add a.d0 b.d0;
+  c.d1 <- Float32_u.add a.d1 b.d1;
+  c.d2 <- Float32_u.add a.d2 b.d2;
+  c.d3 <- Float32_u.add a.d3 b.d3;
+  c
+
+(* [Float32_u.Bytes] accesses packed float32# values in [bytes], so this can be vectorized. *)
+let[@inline never] [@local never] [@specialize never] copy_bytes a b =
+  let pos = 0 in
+  let x = Float32_u.Bytes.unsafe_get a ~pos in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  let x = Float32_u.Bytes.unsafe_get a ~pos in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  let x = Float32_u.Bytes.unsafe_get a ~pos in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  let x = Float32_u.Bytes.unsafe_get a ~pos in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  ()
+
+let[@inline never] [@local never] [@specialize never] init_bytes b x =
+  let pos = 0 in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  let pos = pos + 4 in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  ()
+
+let[@inline always] copy_float32_unboxed_pos a b ~pos =
+  let x = Float32_u.Bytes.unsafe_get a ~pos in
+  Float32_u.Bytes.unsafe_set b ~pos x;
+  ()
+
+(* Currently can't vectorize because [pos] untagging is repeated and the current
+   heuristic for detecting relations between pointers is not strong enough to
+   handle this case. *)
+let[@inline never] [@local never] [@specialize never] copy_bytes_pos a b pos =
+  copy_float32_unboxed_pos a b ~pos;
+  copy_float32_unboxed_pos a b ~pos:(pos+1*4);
+  copy_float32_unboxed_pos a b ~pos:(pos+2*4);
+  copy_float32_unboxed_pos a b ~pos:(pos+3*4);
+  ()
+
+(* 128:
+ * (id:3) a:V/61 := R:I/0[%rax]
+ * (id:4) b:V/62 := R:I/1[%rbx]
+ * (id:5) pos:I/63 := R:I/2[%rdi]
+ * (id:6) prim:I/64 := pos:I/63
+ * (id:7) prim:I/64 := prim:I/64 >>s 1
+ * (id:8) S/65 := float32  mut[a:V/61 + prim:I/64]
+ * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign)
+ * (id:10) Pbytes_set_f32:I/66 := 1
+ * (id:11) I/67 := pos:I/63
+ * (id:12) I/67 := I/67 + 8
+ * (id:13) prim:I/68 := I/67
+ * (id:14) prim:I/68 := prim:I/68 >>s 1
+ * (id:15) S/69 := float32  mut[a:V/61 + prim:I/68]
+ * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign)
+ * (id:17) Pbytes_set_f32:I/70 := 1
+ * (id:18) I/71 := pos:I/63
+ * (id:19) I/71 := I/71 + 16
+ * (id:20) prim:I/72 := I/71
+ * (id:21) prim:I/72 := prim:I/72 >>s 1
+ * (id:22) S/73 := float32  mut[a:V/61 + prim:I/72]
+ * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign)
+ * (id:24) Pbytes_set_f32:I/74 := 1
+ * (id:25) I/75 := pos:I/63
+ * (id:26) I/75 := I/75 + 24
+ * (id:27) prim:I/76 := I/75
+ * (id:28) prim:I/76 := prim:I/76 >>s 1
+ * (id:29) S/77 := float32  mut[a:V/61 + prim:I/76]
+ * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign)
+ * (id:31) Pbytes_set_f32:I/78 := 1
+ * (id:32) I/79 := 1
+ * (id:33) R:I/0[%rax] := I/79
+ * (id:34) Return R:I/0[%rax] *)
+
+(* Currently, can't vectorize because the index is untagged before every memory access,
+   instead of operating on untagged indexes throughout. *)
+let[@inline never] [@local never] [@specialize never] copy_bytes_pos_v2 a b pos =
+  let i0 = pos in
+  copy_float32_unboxed_pos a b ~pos:i0;
+  let i1 = i0 + 4  in
+  copy_float32_unboxed_pos a b ~pos:i1;
+  let i2 = i1 + 4 in
+  copy_float32_unboxed_pos a b ~pos:i2;
+  let i3 = i2 + 4 in
+  copy_float32_unboxed_pos a b ~pos:i3;
+  ()
+
+(* 177:
+ * (id:3) a:V/61 := R:I/0[%rax]
+ * (id:4) b:V/62 := R:I/1[%rbx]
+ * (id:5) pos:I/63 := R:I/2[%rdi]
+ * (id:6) prim:I/64 := pos:I/63
+ * (id:7) prim:I/64 := prim:I/64 >>s 1
+ * (id:8) S/65 := float32  mut[a:V/61 + prim:I/64]
+ * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign)
+ * (id:10) Pbytes_set_f32:I/66 := 1
+ * (id:11) i1:I/67 := pos:I/63
+ * (id:12) i1:I/67 := i1:I/67 + 8
+ * (id:13) prim:I/68 := i1:I/67
+ * (id:14) prim:I/68 := prim:I/68 >>s 1
+ * (id:15) S/69 := float32  mut[a:V/61 + prim:I/68]
+ * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign)
+ * (id:17) Pbytes_set_f32:I/70 := 1
+ * (id:18) i2:I/71 := i1:I/67
+ * (id:19) i2:I/71 := i2:I/71 + 8
+ * (id:20) prim:I/72 := i2:I/71
+ * (id:21) prim:I/72 := prim:I/72 >>s 1
+ * (id:22) S/73 := float32  mut[a:V/61 + prim:I/72]
+ * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign)
+ * (id:24) Pbytes_set_f32:I/74 := 1
+ * (id:25) I/75 := i2:I/71
+ * (id:26) I/75 := I/75 + 8
+ * (id:27) prim:I/76 := I/75
+ * (id:28) prim:I/76 := prim:I/76 >>s 1
+ * (id:29) S/77 := float32  mut[a:V/61 + prim:I/76]
+ * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign)
+ * (id:31) Pbytes_set_f32:I/78 := 1
+ * (id:32) I/79 := 1
+ * (id:33) R:I/0[%rax] := I/79
+ * (id:34) Return R:I/0[%rax] *)
+
+
+let print_t1 ppf (t1 : t1) =
+  (* CR gyorsh: how to print Float32? *)
+  let to_string f = (Float32_u.to_float32 f |> Float32.to_string) in
+  Format.fprintf ppf "{ d0 = %s ; d1 = %s; d2 = %s ; d3 = %s }"
+    (to_string t1.d0)
+    (to_string t1.d1)
+    (to_string t1.d2)
+    (to_string t1.d3)
+
+let create_s length =
+  String.init length (fun i -> i * 7 mod 256 |> char_of_int)
+;;
+
+let create_b length = create_s length |> Bytes.of_string
+
+let print_b ~len ppf b =
+  for i = 0 to len-1 do
+    Format.fprintf ppf "%s "
+      (Float32_u.to_float32 (Float32_u.Bytes.get b ~pos:(i*4)) |> Float32.to_string)
+  done
+
+let () = (* Test driver; the expected stdout is in test_float32_unboxed.expected. *)
+  let a = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.5s } in
+  let b = { d0 = #80.s; d1 = #14.s; d2 = #0.s; d3 = -#0.5s } in
+  let c = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.s } in
+  let res = { d0 = #0.s; d1 = -#10.s; d2 = #1.s; d3 = -#1.s } in
+  Format.printf "add_unboxed_pairs_mutable_record %a\n" print_t1
+    (add_mutable_record a b c); (* [add_mutable_record] writes into and returns [c] *)
+  copy_mutable_record c res; (* hence [res] receives the sums computed above *)
+  Format.printf "copy_unboxed_pairs_mutable_record %a\n" print_t1 res;
+  let b1 = create_b 16 in
+  let b2 = create_b 16 in
+  init_bytes b1 #10.s; (* source buffer: four float32 values of 10. *)
+  init_bytes b2 #0.s; (* destination buffer: zeroed so that a copy is observable *)
+  copy_bytes b1 b2;
+  Format.printf "copy_bytes %a\n" (print_b ~len:4) b2;
+  init_bytes b2 #0.s; copy_bytes_pos b1 b2 (Sys.opaque_identity 0); (* src [b1], dst [b2]; [b2] re-zeroed first so this copy is what gets printed *)
+  Format.printf "copy_bytes_pos %a\n" (print_b ~len:4) b2;
+  init_bytes b2 #0.s; copy_bytes_pos_v2 b1 b2 (Sys.opaque_identity 0); (* same setup for the v2 variant *)
+  Format.printf "copy_bytes_pos_v2 %a\n" (print_b ~len:4) b2;
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..3178ac03fb
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1 @@
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_float32_unboxed_vectorized.copy_bytes)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected
new file mode 100644
index 0000000000..bfea42ed76
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected
@@ -0,0 +1,2 @@
+add_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 0.000000 ; d3 = -1.000000 }
+copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 1.000000 ; d3 = -1.000000 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml
new file mode 100644
index 0000000000..a49aaf0b84
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml
@@ -0,0 +1,80 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+module Float_u = struct
+  type t = float#
+
+  external to_float : t -> (float[@local_opt]) = "%box_float" [@@warning "-187"]
+
+  external of_float : (float[@local_opt]) -> t = "%unbox_float" [@@warning "-187"]
+
+  let[@inline always] add x y = of_float (Float.add (to_float x) (to_float y))
+end
+
+type t1 = { mutable d0: float#;
+            mutable d1: float#;
+            mutable d2: float#;
+            mutable d3: float#
+          }
+
+
+let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  ()
+
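+(* Currently, can't vectorize because of the specific floatmem operation (looks like
+   it is treated overly conservatively). NOTE(review): test_float_unboxed_vectorized.cmx.dump.expected lists [add_mutable_record] as vectorized, so this comment may be stale -- confirm. *)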
+let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 =
+  c.d0 <- Float_u.add a.d0 b.d0;
+  c.d1 <- Float_u.add a.d1 b.d1;
+  c.d2 <- Float_u.add a.d2 b.d2;
+  c.d3 <- Float_u.add a.d3 b.d3;
+  c
+
+(*
+102:
+(id:3) a:V/61 := R:I/0[%rax]
+(id:4) b:V/62 := R:I/1[%rbx]
+(id:5) c:V/63 := R:I/2[%rdi]
+(id:6) F/64 := float64  mut[a:V/61]
+(id:7) F/65 := F/64
+(id:8) F/65 := F/65 +f float64[b:V/62]
+(id:9) float64[c:V/63] := F/65 (assign)
+(id:10) Psetufloatfield:I/66 := 1
+(id:11) F/67 := float64  mut[a:V/61 + 8]
+(id:12) F/68 := F/67
+(id:13) F/68 := F/68 +f float64[b:V/62 + 8]
+(id:14) float64[c:V/63 + 8] := F/68 (assign)
+(id:15) Psetufloatfield:I/69 := 1
+(id:16) F/70 := float64  mut[a:V/61 + 16]
+(id:17) F/71 := F/70
+(id:18) F/71 := F/71 +f float64[b:V/62 + 16]
+(id:19) float64[c:V/63 + 16] := F/71 (assign)
+(id:20) Psetufloatfield:I/72 := 1
+(id:21) F/73 := float64  mut[a:V/61 + 24]
+(id:22) F/74 := F/73
+(id:23) F/74 := F/74 +f float64[b:V/62 + 24]
+(id:24) float64[c:V/63 + 24] := F/74 (assign)
+(id:25) Psetufloatfield:I/75 := 1
+(id:26) R:I/0[%rax] := c:V/63
+(id:27) Return R:I/0[%rax]
+
+*)
+
+let print_t1 ppf (t1 : t1) =
+  Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }"
+    (Float_u.to_float t1.d0)
+    (Float_u.to_float t1.d1)
+    (Float_u.to_float t1.d2)
+    (Float_u.to_float t1.d3)
+
+let () =
+  let a = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0.5 } in
+  let b = { d0 = #80.; d1 = #14.; d2 = #0.; d3 = -#0.5 } in
+  let c = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0. } in
+  let res = { d0 = #0.; d1 = -#10.; d2 = #1.; d3 = -#1. } in
+  Format.printf "add_mutable_record %a\n" print_t1
+    (add_mutable_record a b c);
+  copy_mutable_record c res;
+  Format.printf "copy_mutable_record %a\n" print_t1 res;
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..357dba19d9
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1,2 @@
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_unboxed_vectorized.copy_mutable_record)
+**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_unboxed_vectorized.add_mutable_record)
diff --git a/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..dc48684873
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected
@@ -0,0 +1,7 @@
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record)
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record_fresh)
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record_fresh)
+**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_vectorized.add_mutable_record_t4)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.copy_mutable_record_t4)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.dup_mutable_record_t4)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected
new file mode 100644
index 0000000000..0207ed6b91
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected
@@ -0,0 +1,7 @@
+add_mutable_record { d0 = 88 ; d1 = 110; d2 = -40 ; d3 = -100 }
+copy_array_four 30 30 30 30 
+copy_array_index_four 30 30 30 30 
+add_array_from_start 60 60 60 60 
+copy_array_index_from_start 60 60 60 60 
+copy_array_from_start 60 60 60 60 
+copy_array_from_start_v2 60 60 60 60 
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml
new file mode 100644
index 0000000000..b45eaa5776
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml
@@ -0,0 +1,229 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+module Int32_u = struct
+  type t = int32#
+
+  external to_int32 : t -> (int32[@local_opt]) = "%box_int32" [@@warning "-187"]
+
+  external of_int32 : (int32[@local_opt]) -> t = "%unbox_int32" [@@warning "-187"]
+
+  let[@inline always] add x y = of_int32 (Int32.add (to_int32 x) (to_int32 y))
+
+  module Array = struct
+    external unsafe_create : ('a : bits32). int -> 'a array =
+      "caml_make_unboxed_int32_vect_bytecode" "caml_make_unboxed_int32_vect"
+    external unsafe_get: ('a : bits32). 'a array -> int -> 'a = "%array_unsafe_get"
+    external unsafe_set: ('a : bits32). 'a array -> int -> 'a -> unit = "%array_unsafe_set"
+
+    module Index = struct
+      external unsafe_get
+        : ('a : bits32).
+            ('a array) -> t -> 'a
+        = "%array_unsafe_get_indexed_by_int32#"
+
+      external unsafe_set
+        : ('a : bits32).
+            'a array -> t -> 'a -> unit
+        = "%array_unsafe_set_indexed_by_int32#"
+    end
+  end
+
+end
+
+type t1 = { mutable d0 : int32# ; mutable d1: int32#; mutable d2: int32#; mutable d3: int32#  }
+
+(* Currently, can't vectorize because not adjacent and have an unnecessary sign extension. *)
+let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 =
+  c.d0 <- Int32_u.add a.d0 b.d0;
+  c.d1 <- Int32_u.add a.d1 b.d1;
+  c.d2 <- Int32_u.add a.d2 b.d2;
+  c.d3 <- Int32_u.add a.d3 b.d3;
+  c
+
+let[@inline always] copy_array_one (a : Int32_u.t array)
+                      (b : Int32_u.t array) pos =
+  let x = Int32_u.Array.unsafe_get a pos in
+  Int32_u.Array.unsafe_set b pos x
+
+(* The accesses are adjacent but the use of [int] typed index results in a convoluted
+   index computation that is not yet handled by the current heuristics. *)
+let[@inline never] [@local never][@specialize never] copy_array_four (a : Int32_u.t array)
+                                                       (b : Int32_u.t array) ~pos =
+  copy_array_one a b pos;
+  copy_array_one a b (pos+1);
+  copy_array_one a b (pos+2);
+  copy_array_one a b (pos+3);
+  ()
+
+(*
+
+114:
+(id:3) a:V/61 := R:I/0[%rax]
+(id:4) b:V/62 := R:I/1[%rbx]
+(id:5) pos:I/63 := R:I/2[%rdi]
+(id:6) new_value:I/64 := signed int32  mut[a:V/61 + pos:I/63 * 2 + 6]
+(id:7) signed int32[b:V/62 + pos:I/63 * 2 + 6] := new_value:I/64 (assign)
+(id:8) Parraysetu:I/65 := 1
+(id:9) Paddint:I/66 := pos:I/63
+(id:10) Paddint:I/66 := Paddint:I/66 + 2
+(id:11) new_value:I/67 := signed int32  mut[a:V/61 + Paddint:I/66 * 2 + 6]
+(id:12) signed int32[b:V/62 + Paddint:I/66 * 2 + 6] := new_value:I/67 (assign)
+(id:13) Parraysetu:I/68 := 1
+(id:14) Paddint:I/69 := pos:I/63
+(id:15) Paddint:I/69 := Paddint:I/69 + 4
+(id:16) new_value:I/70 := signed int32  mut[a:V/61 + Paddint:I/69 * 2 + 6]
+(id:17) signed int32[b:V/62 + Paddint:I/69 * 2 + 6] := new_value:I/70 (assign)
+(id:18) Parraysetu:I/71 := 1
+(id:19) Paddint:I/72 := pos:I/63
+(id:20) Paddint:I/72 := Paddint:I/72 + 6
+(id:21) new_value:I/73 := signed int32  mut[a:V/61 + Paddint:I/72 * 2 + 6]
+(id:22) signed int32[b:V/62 + Paddint:I/72 * 2 + 6] := new_value:I/73 (assign)
+(id:23) Parraysetu:I/74 := 1
+(id:24) I/75 := 1
+(id:25) R:I/0[%rax] := I/75
+(id:26) Return R:I/0[%rax]
+
+*)
+
+let[@inline never] [@local never][@specialize never] copy_array_four_v2 (a : Int32_u.t array)
+                                                       (b : Int32_u.t array) ~pos =
+  let i0 = pos in
+  copy_array_one a b i0;
+  let i1 = i0 + 1 in
+  copy_array_one a b i1;
+  let i2 = i1 + 1 in
+  copy_array_one a b i2;
+  let i3 = i2 + 1 in
+  copy_array_one a b i3;
+  ()
+
+let[@inline always] copy_array_index_one (a : Int32_u.t array)
+                      (b : Int32_u.t array) (pos : Int32_u.t) =
+  let x = Int32_u.Array.Index.unsafe_get a pos in
+  Int32_u.Array.Index.unsafe_set b pos x
+
+(* Can't vectorize it! The accesses are adjacent and we use [Int32_u.t] as index,
+   but the compiler tags the index before using it! This index computation is not
+   yet handled by the vectorizer's heuristics. *)
+let[@inline never] [@local never][@specialize never] copy_array_index_four (a : Int32_u.t array)
+                                                       (b : Int32_u.t array) ~pos =
+  copy_array_index_one a b pos;
+  copy_array_index_one a b (Int32_u.add pos #1l);
+  copy_array_index_one a b (Int32_u.add pos #2l);
+  copy_array_index_one a b (Int32_u.add pos #3l);
+  ()
+
+let[@inline never] [@local never][@specialize never] copy_array_index_from_start (a : Int32_u.t array)
+                                                       (b : Int32_u.t array) =
+  let pos = #0l in
+  copy_array_index_one a b pos;
+  copy_array_index_one a b (Int32_u.add pos #1l);
+  copy_array_index_one a b (Int32_u.add pos #2l);
+  copy_array_index_one a b (Int32_u.add pos #3l);
+   ()
+
+                            let[@inline never] [@local never][@specialize never] copy_array_from_start (a : Int32_u.t array)
+                                                                                   (b : Int32_u.t array) =
+  let[@inline always] copy pos =
+    let x = Int32_u.Array.unsafe_get a pos in
+    Int32_u.Array.unsafe_set b pos x
+  in
+  let pos = 0 in
+  copy pos;
+  copy (pos+1);
+  copy (pos+2);
+  copy (pos+3);
+  ()
+
+(* Not vectorized: each [Int32_u.add] result goes through an unnecessary sign extension
+   ([sextend32] in the dump below) before the store. The vectorizer's heuristics can be extended to handle this case. *)
+let[@inline never] [@local never][@specialize never] add_array_from_start (a : Int32_u.t array) (b : Int32_u.t array) =
+  let[@inline always] add pos =
+    let x = Int32_u.Array.unsafe_get a pos in
+    let y = Int32_u.Array.unsafe_get b pos in
+    Int32_u.Array.unsafe_set b pos (Int32_u.add x y)
+  in
+  let pos = 0 in
+  add pos;
+  add (pos+1);
+  add (pos+2);
+  add (pos+3);
+  ()
+
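+(* Dump of [add_array_from_start] showing the [sextend32] before each store. The locations mention test7.ml, presumably an earlier name of this test (TODO confirm).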
+camlTest7__add_array_from_start_7_22_code(R:I/0[%rax] R:I/1[%rbx]) {test7.ml:112,74-379}
+  a:V/61 := R:I/0[%rax]
+  b:V/62 := R:I/1[%rbx]
+  I/63 := signed int32  mut[b:V/62 + 8]{test7.ml:119,2-9;test7.ml:115,12-42}
+  I/64 := signed int32  mut[a:V/61 + 8]{test7.ml:119,2-9;test7.ml:114,12-42}
+  I/65 := I/64
+  I/65 := I/65 + I/63{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/66 := sextend32 I/65{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 8] := new_value:I/66 (assign){test7.ml:119,2-9;test7.ml:116,4-52}
+  Parraysetu:I/67 := 1
+  I/68 := signed int32  mut[b:V/62 + 12]{test7.ml:120,2-13;test7.ml:115,12-42}
+  I/69 := signed int32  mut[a:V/61 + 12]{test7.ml:120,2-13;test7.ml:114,12-42}
+  I/70 := I/69
+  I/70 := I/70 + I/68{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/71 := sextend32 I/70{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 12] := new_value:I/71 (assign){test7.ml:120,2-13;test7.ml:116,4-52}
+  Parraysetu:I/72 := 1
+  I/73 := signed int32  mut[b:V/62 + 16]{test7.ml:121,2-13;test7.ml:115,12-42}
+  I/74 := signed int32  mut[a:V/61 + 16]{test7.ml:121,2-13;test7.ml:114,12-42}
+  I/75 := I/74
+  I/75 := I/75 + I/73{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/76 := sextend32 I/75{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 16] := new_value:I/76 (assign){test7.ml:121,2-13;test7.ml:116,4-52}
+  Parraysetu:I/77 := 1
+  I/78 := signed int32  mut[b:V/62 + 20]{test7.ml:122,2-13;test7.ml:115,12-42}
+  I/79 := signed int32  mut[a:V/61 + 20]{test7.ml:122,2-13;test7.ml:114,12-42}
+  I/80 := I/79
+  I/80 := I/80 + I/78{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  new_value:I/81 := sextend32 I/80{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78}
+  signed int32[b:V/62 + 20] := new_value:I/81 (assign){test7.ml:122,2-13;test7.ml:116,4-52}
+  Parraysetu:I/82 := 1
+  I/83 := 1
+  R:I/0[%rax] := I/83
+  return R:I/0[%rax]
+*)
+(* Formatter for [t1]. Each field is converted with [Int32_u.to_int32] before being
+   handed to [%ld]. *)
+let print_t1 ppf (r : t1) =
+  Format.fprintf ppf "{ d0 = %ld ; d1 = %ld; d2 = %ld ; d3 = %ld }"
+    (r.d0 |> Int32_u.to_int32) (r.d1 |> Int32_u.to_int32)
+    (r.d2 |> Int32_u.to_int32) (r.d3 |> Int32_u.to_int32)
+
+(* Prints the first [len] elements of [a] on [ppf], each followed by a space. *)
+let print_array ~len ppf (a : Int32_u.t array) =
+  for idx = 0 to pred len do
+    Format.fprintf ppf "%ld " (Int32_u.to_int32 (Int32_u.Array.unsafe_get a idx))
+  done
+
+(* Builds an array of [len] elements with [Int32_u.Array.unsafe_create] and stores
+   [init] in every slot before returning it. *)
+let create_array ~len ~init =
+  let result = Int32_u.Array.unsafe_create len in
+  for slot = 0 to pred len do Int32_u.Array.unsafe_set result slot init done;
+  result
+
+(* Driver: runs each test function once on small inputs and prints the result after every call. *)
+let () =
+  let lhs = { d0 = #8l; d1 = #96l; d2 = -#10l; d3 = #0l } in
+  let rhs = { d0 = #80l; d1 = #14l; d2 = -#30l; d3 = -#100l } in
+  let dst = { d0 = #8l; d1 = #96l; d2 = #0l; d3 = #0l } in
+  Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record lhs rhs dst);
+  let xs = create_array ~len:4 ~init:#30l in
+  let ys = create_array ~len:4 ~init:#0l in
+  let show fmt arr = Format.printf fmt (print_array ~len:4) arr in
+  copy_array_four xs ys ~pos:0;
+  show "copy_array_four %a\n" ys;
+  copy_array_index_four ys xs ~pos:#0l;
+  show "copy_array_index_four %a\n" xs;
+  add_array_from_start xs ys;
+  show "add_array_from_start %a\n" ys;
+  copy_array_index_from_start ys xs;
+  show "copy_array_index_from_start %a\n" xs;
+  copy_array_from_start xs ys;
+  show "copy_array_from_start %a\n" ys;
+  copy_array_four_v2 xs ys ~pos:0;
+  show "copy_array_from_start_v2 %a\n" ys (* NOTE(review): label differs from the function name; kept as is *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..fef3d590f8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_four_v2)
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_index_from_start)
+**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_from_start)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.expected b/flambda-backend/tests/backend/vectorizer/test_int64.expected
new file mode 100644
index 0000000000..21d3934339
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.expected
@@ -0,0 +1,7 @@
+add_mutable_record { d0 = 88 ; d1 = 110 }
+copy_mutable_record { d0 = 88 ; d1 = 110 }
+add_mutable_record_fresh { d0 = 88 ; d1 = 110 }
+copy_mutable_record_fresh { d0 = 88 ; d1 = 110 }
+add_mutable_record_t4 { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 }
+copy_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 80 ; d3 = 14 }
+dup_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 8 ; d3 = 96 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.ml b/flambda-backend/tests/backend/vectorizer/test_int64.ml
new file mode 100644
index 0000000000..95603dd777
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.ml
@@ -0,0 +1,79 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+type t1 = (* record with two mutable [int64] fields *)
+  { mutable d0 : int64;
+    mutable d1 : int64
+  }
+
+(* Fieldwise [a + b] stored into [c]. Can't vectorize: int64 are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record
+    (a : t1) (b : t1) (c : t1) : t1 =
+  c.d0 <- Int64.add a.d0 b.d0;
+  c.d1 <- Int64.add a.d1 b.d1;
+  c
+
+(* Copies [a] into [b]. Can't vectorize: the writes require [caml_modify]. *)
+let[@inline never] [@local never] [@specialize never] copy_mutable_record
+    (a : t1) (b : t1) : t1 =
+  b.d0 <- a.d0;
+  b.d1 <- a.d1;
+  b
+
+(* Fieldwise sum in a fresh record. Can't vectorize: int64 are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh
+    (a : t1) (b : t1) : t1 =
+  { d0 = Int64.add a.d0 b.d0; d1 = Int64.add a.d1 b.d1 }
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh
+    (a : t1) : t1 = (* fresh copy of [a]; listed in the vectorized dump *)
+  { d0 = a.d0; d1 = a.d1 }
+
+type t4 = (* four mutable [int64] fields; reuses the label names of [t1] *)
+  { mutable d0 : int64;
+    mutable d1 : int64;
+    mutable d2 : int64;
+    mutable d3 : int64
+  }
+
+(* [a + b] stored into both halves of [c]. Can't vectorize: int64 are boxed. *)
+let[@inline never] [@local never] [@specialize never] add_mutable_record_t4
+    (a : t1) (b : t1) (c : t4) : t4 =
+  c.d0 <- Int64.add a.d0 b.d0;
+  c.d1 <- Int64.add a.d1 b.d1;
+  c.d2 <- Int64.add a.d0 b.d0;
+  c.d3 <- Int64.add a.d1 b.d1;
+  c
+
+let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4
+    (a : t1) (b : t1) : t4 = (* [a] then [b] in a fresh [t4]; vectorized *)
+  { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 }
+
+let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4
+    (a : t1) : t4 = (* [a] twice in a fresh [t4]; listed as vectorized *)
+  { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 }
+
+let print_t1 ppf ({ d0; d1 } : t1) = (* prints both fields with [%Ld] *)
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" d0 d1
+
+(* Formatter for [t4]: prints the four fields in declaration order. *)
+let print_t4 ppf ({ d0; d1; d2; d3 } : t4) =
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" d0 d1 d2 d3
+
+(* Driver: builds the inputs, runs every test function once and prints each
+   result so the output can be compared against the expected file. *)
+let () =
+  let lhs = { d0 = 8L; d1 = 96L } in
+  let rhs = { d0 = 80L; d1 = 14L } in
+  let acc = { d0 = 10L; d1 = -10L } in
+  let wide = { d0 = 10L; d1 = -10L; d2 = 199L; d3 = 18L } in
+  let sink = { d0 = 0L; d1 = -0L } in
+  let show1 fmt v = Format.printf fmt print_t1 v in
+  let show4 fmt v = Format.printf fmt print_t4 v in
+  show1 "add_mutable_record %a\n" (add_mutable_record lhs rhs acc);
+  show1 "copy_mutable_record %a\n" (copy_mutable_record acc sink);
+  show1 "add_mutable_record_fresh %a\n" (add_mutable_record_fresh lhs rhs);
+  show1 "copy_mutable_record_fresh %a\n" (copy_mutable_record_fresh acc);
+  show4 "add_mutable_record_t4 %a\n" (add_mutable_record_t4 lhs rhs wide);
+  show4 "copy_mutable_record_t4 %a\n" (copy_mutable_record_t4 lhs rhs);
+  show4 "dup_mutable_record_t4 %a\n" (dup_mutable_record_t4 lhs);
+  ()
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.mli b/flambda-backend/tests/backend/vectorizer/test_int64.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected
new file mode 100644
index 0000000000..68b6515c90
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected
@@ -0,0 +1,3 @@
+add_mutable_record { d0 = 88 ; d1 = 110 }
+copy_mutable_record { d0 = 88 ; d1 = 110 }
+add_fours_mutable_record { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 }
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml
new file mode 100644
index 0000000000..d9371e65e8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml
@@ -0,0 +1,61 @@
+[@@@ocaml.warnerror "+a-40-41-42"]
+
+module Int64_u = struct (* minimal helpers for the unboxed type [int64#] *)
+  type t = int64#
+
+  external to_int64 : t -> (int64[@local_opt]) = "%box_int64" [@@warning "-187"]
+
+  external of_int64 : (int64[@local_opt]) -> t = "%unbox_int64" [@@warning "-187"]
+
+  let[@inline always] add x y = of_int64 (Int64.add (to_int64 x) (to_int64 y)) (* box both, [Int64.add], unbox *)
+end
+
+type t1 = { mutable d0 : int64# ; mutable d1: int64# } (* two mutable unboxed fields *)
+
+let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 =
+  c.d0 <- Int64_u.add a.d0 b.d0; (* fieldwise sum of [a] and [b], written into [c] *)
+  c.d1 <- Int64_u.add a.d1 b.d1; (* the expected dump lists this function as vectorized *)
+  c (* the updated [c] is returned *)
+
+let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit =
+  b.d0 <- a.d0; (* copy both unboxed fields of [a] into [b] *)
+  b.d1 <- a.d1; (* the expected dump lists this function as vectorized *)
+  ()
+
+type t2 = { (* four mutable unboxed [int64#] fields; label names overlap with [t1] *)
+  mutable d0 : int64# ;
+  mutable d1: int64# ;
+  mutable d2: int64# ;
+  mutable d3: int64# }
+
+let[@inline never] [@local never][@specialize never] add_fours_mutable_record (a : t1) (b: t1) (c : t2) : unit =
+  c.d0 <- Int64_u.add a.d0 b.d0; (* fieldwise sum of [a] and [b] goes to d0/d1 ... *)
+  c.d1 <- Int64_u.add a.d1 b.d1;
+  c.d2 <- Int64_u.add a.d0 b.d0; (* ... and the same two sums are stored again in d2/d3 *)
+  c.d3 <- Int64_u.add a.d1 b.d1; (* the expected dump lists this function as vectorized *)
+  ()
+
+let print_t1 ppf (r : t1) = (* boxes both fields so that [%Ld] can print them *)
+  let open Int64_u in
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" (to_int64 r.d0) (to_int64 r.d1)
+
+(* Formatter for [t2] (four fields); each unboxed field is boxed for [%Ld]. *)
+let print_t4 ppf (r : t2) =
+  let open Int64_u in
+  Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }"
+    (to_int64 r.d0) (to_int64 r.d1)
+    (to_int64 r.d2) (to_int64 r.d3)
+
+(* Driver: runs the three test functions and prints the records they update. *)
+let () =
+  let lhs = { d0 = #8L; d1 = #96L } in
+  let rhs = { d0 = #80L; d1 = #14L } in
+  let acc = { d0 = #8L; d1 = #96L } in
+  let wide = { d0 = #0L; d1 = #0L; d2 = #0L; d3 = #0L } in
+  let sink = { d0 = #0L; d1 = -#10L } in
+  let sum = add_mutable_record lhs rhs acc in
+  Format.printf "add_mutable_record %a\n" print_t1 sum;
+  copy_mutable_record acc sink;
+  Format.printf "copy_mutable_record %a\n" print_t1 sink;
+  add_fours_mutable_record lhs rhs wide;
+  Format.printf "add_fours_mutable_record %a\n" print_t4 wide
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..61eea8dffc
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 5 groups, 10 scalar instructions, 5 vector instructions, cost = -5 (Test_int64_unboxed_vectorized.add_mutable_record)
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_unboxed_vectorized.copy_mutable_record)
+**** Vectorize selected computation: 10 groups, 20 scalar instructions, 10 vector instructions, cost = -10 (Test_int64_unboxed_vectorized.add_fours_mutable_record)
diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..6db1b67d70
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected
@@ -0,0 +1,3 @@
+**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_vectorized.copy_mutable_record_fresh)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.copy_mutable_record_t4)
+**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.dup_mutable_record_t4)
diff --git a/flambda-backend/tests/backend/vectorizer/examples.expected b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.expected
similarity index 100%
rename from flambda-backend/tests/backend/vectorizer/examples.expected
rename to flambda-backend/tests/backend/vectorizer/test_spill_valx2.expected
diff --git a/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml
new file mode 100644
index 0000000000..2120c1fe2a
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml
@@ -0,0 +1,709 @@
+(* Test that spilled registers of type [Valx2] are correctly registered with the
+   GC.
+
+   Need at least 16 registers of 128 bit to trigger the spill on amd64.
+
+   Allocate enough to defeat comballoc that moves allocations to the beginning
+   of the block and out of the live range of the register that this test is
+   aiming to spill. Raise [vectorize-max-block-size] to force the resulting very
+   long block to be vectorized. *)
+type s = (* 36 mutable [int64] fields, [f0] .. [f35] *)
+  { mutable f0 : int64;
+    mutable f1 : int64;
+    mutable f2 : int64;
+    mutable f3 : int64;
+    mutable f4 : int64;
+    mutable f5 : int64;
+    mutable f6 : int64;
+    mutable f7 : int64;
+    mutable f8 : int64;
+    mutable f9 : int64;
+    mutable f10 : int64;
+    mutable f11 : int64;
+    mutable f12 : int64;
+    mutable f13 : int64;
+    mutable f14 : int64;
+    mutable f15 : int64;
+    mutable f16 : int64;
+    mutable f17 : int64;
+    mutable f18 : int64;
+    mutable f19 : int64;
+    mutable f20 : int64;
+    mutable f21 : int64;
+    mutable f22 : int64;
+    mutable f23 : int64;
+    mutable f24 : int64;
+    mutable f25 : int64;
+    mutable f26 : int64;
+    mutable f27 : int64;
+    mutable f28 : int64;
+    mutable f29 : int64;
+    mutable f30 : int64;
+    mutable f31 : int64;
+    mutable f32 : int64;
+    mutable f33 : int64;
+    mutable f34 : int64;
+    mutable f35 : int64
+  }
+
+let ( + ) = Int64.add (* rebinds [+] to int64 addition; not used in this file *)
+
+let[@inline never] [@local never] foo a = (* returns ten fresh copies of [a] *)
+  let f0 = a.f0 in (* every field of [a] is read once, up front *)
+  let f1 = a.f1 in
+  let f2 = a.f2 in
+  let f3 = a.f3 in
+  let f4 = a.f4 in
+  let f5 = a.f5 in
+  let f6 = a.f6 in
+  let f7 = a.f7 in
+  let f8 = a.f8 in
+  let f9 = a.f9 in
+  let f10 = a.f10 in
+  let f11 = a.f11 in
+  let f12 = a.f12 in
+  let f13 = a.f13 in
+  let f14 = a.f14 in
+  let f15 = a.f15 in
+  let f16 = a.f16 in
+  let f17 = a.f17 in
+  let f18 = a.f18 in
+  let f19 = a.f19 in
+  let f20 = a.f20 in
+  let f21 = a.f21 in
+  let f22 = a.f22 in
+  let f23 = a.f23 in
+  let f24 = a.f24 in
+  let f25 = a.f25 in
+  let f26 = a.f26 in
+  let f27 = a.f27 in
+  let f28 = a.f28 in
+  let f29 = a.f29 in
+  let f30 = a.f30 in
+  let f31 = a.f31 in
+  let f32 = a.f32 in
+  let f33 = a.f33 in
+  let f34 = a.f34 in
+  let f35 = a.f35 in
+  let d0 = (* first of ten identical records built from the loaded fields *)
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d1 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d2 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d3 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d4 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d5 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d6 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d7 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d8 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  let d9 =
+    { f0;
+      f1;
+      f2;
+      f3;
+      f4;
+      f5;
+      f6;
+      f7;
+      f8;
+      f9;
+      f10;
+      f11;
+      f12;
+      f13;
+      f14;
+      f15;
+      f16;
+      f17;
+      f18;
+      f19;
+      f20;
+      f21;
+      f22;
+      f23;
+      f24;
+      f25;
+      f26;
+      f27;
+      f28;
+      f29;
+      f30;
+      f31;
+      f32;
+      f33;
+      f34;
+      f35
+    }
+  in
+  d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 (* all ten records are returned *)
+
+(* Driver: calls [foo] one million times and, after every call, asserts that
+   all ten returned records agree with the first one on every field. *)
+let () =
+  let a =
+    { f0 = 0L;
+      f1 = 1L;
+      f2 = 2L;
+      f3 = 3L;
+      f4 = 4L;
+      f5 = 5L;
+      f6 = 6L;
+      f7 = 7L;
+      f8 = 8L;
+      f9 = 9L;
+      f10 = 10L;
+      f11 = 11L;
+      f12 = 12L;
+      f13 = 13L;
+      f14 = 14L;
+      f15 = 15L;
+      f16 = 16L;
+      f17 = 17L;
+      f18 = 18L;
+      f19 = 19L;
+      f20 = 20L;
+      f21 = 21L;
+      f22 = 22L;
+      f23 = 23L;
+      f24 = 24L;
+      f25 = 25L;
+      f26 = 26L;
+      f27 = 27L;
+      f28 = 28L;
+      f29 = 29L;
+      f30 = 30L;
+      f31 = 31L;
+      f32 = 32L;
+      f33 = 0L;
+      f34 = 0L;
+      f35 = 0L
+    }
+  in
+  (* Gc.set { (Gc.get()) with Gc.verbose = 0xd }; *)
+  (* [same_fields x y] asserts that [x] and [y] agree on all 36 fields. *)
+  let same_fields (x : s) (y : s) =
+    assert (x.f0 = y.f0);
+    assert (x.f1 = y.f1);
+    assert (x.f2 = y.f2);
+    assert (x.f3 = y.f3);
+    assert (x.f4 = y.f4);
+    assert (x.f5 = y.f5);
+    assert (x.f6 = y.f6);
+    assert (x.f7 = y.f7);
+    assert (x.f8 = y.f8);
+    assert (x.f9 = y.f9);
+    assert (x.f10 = y.f10);
+    assert (x.f11 = y.f11);
+    assert (x.f12 = y.f12);
+    assert (x.f13 = y.f13);
+    assert (x.f14 = y.f14);
+    assert (x.f15 = y.f15);
+    assert (x.f16 = y.f16);
+    assert (x.f17 = y.f17);
+    assert (x.f18 = y.f18);
+    assert (x.f19 = y.f19);
+    assert (x.f20 = y.f20);
+    assert (x.f21 = y.f21);
+    assert (x.f22 = y.f22);
+    assert (x.f23 = y.f23);
+    assert (x.f24 = y.f24);
+    assert (x.f25 = y.f25);
+    assert (x.f26 = y.f26);
+    assert (x.f27 = y.f27);
+    assert (x.f28 = y.f28);
+    assert (x.f29 = y.f29);
+    assert (x.f30 = y.f30);
+    assert (x.f31 = y.f31);
+    assert (x.f32 = y.f32);
+    assert (x.f33 = y.f33);
+    assert (x.f34 = y.f34);
+    assert (x.f35 = y.f35)
+  in
+  let rec loop n =
+    if n = 0
+    then ()
+    else
+      (* try to trigger GC inside foo *)
+      let d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = foo a in
+      (* Compare every other copy against [d0], including [d4] .. [d7]. *)
+      List.iter (same_fields d0) [d1; d2; d3; d4; d5; d6; d7; d8; d9];
+      loop (n - 1)
+  in
+  loop 1_000_000
diff --git a/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli
new file mode 100644
index 0000000000..5b909d90a8
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli
@@ -0,0 +1 @@
+(* blank, make sure all the functions are called from top-level *)
diff --git a/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected
new file mode 100644
index 0000000000..9f75a15662
--- /dev/null
+++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected
@@ -0,0 +1 @@
+**** Vectorize selected computation: 198 groups, 396 scalar instructions, 198 vector instructions, cost = -198 (Test_spill_valx2_vectorized.foo)