diff --git a/backend/vectorize_utils.ml b/backend/vectorize_utils.ml index cb1b715127..7b0db46932 100644 --- a/backend/vectorize_utils.ml +++ b/backend/vectorize_utils.ml @@ -137,11 +137,13 @@ let vectorize_machtypes (pack : Reg.t list) : Cmm.machtype_component = Printreg.reglist pack; match hd.typ, List.length pack with | Addr, _ -> Misc.fatal_errorf "Unexpected machtype for %a" Printreg.reg hd - | (Int | Float), 2 | Float32, 4 -> - (* allows subregs, width should be correct by construction of [Group]. *) + | Float, 2 | Float32, 4 -> Vec128 + | Int, _ -> + (* [Int] may be used for int32, width should be correct by construction of + [Group]. *) Vec128 | Val, 2 -> Valx2 - | (Val | Int | Float | Float32), n -> + | (Val | Float | Float32), n -> Misc.fatal_errorf "Unexpected pack size %d for %a" n Printreg.reglist pack | Vec128, _ | Valx2, _ -> Misc.fatal_errorf "Unexpected machtype for %a" Printreg.reg hd) diff --git a/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore new file mode 100644 index 0000000000..7ddec40dce --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/.ocamlformat-ignore @@ -0,0 +1,4 @@ +test_int64_unboxed.ml +test_float_unboxed.ml +test_int32_unboxed.ml +test_float32_unboxed.ml diff --git a/flambda-backend/tests/backend/vectorizer/dune.inc b/flambda-backend/tests/backend/vectorizer/dune.inc index 67a51f80bb..1673ef0acb 100644 --- a/flambda-backend/tests/backend/vectorizer/dune.inc +++ b/flambda-backend/tests/backend/vectorizer/dune.inc @@ -4,7 +4,7 @@ (enabled_if (= %{context_name} "main")) (targets test1_runner.exe test1.cmx.dump) (deps test1.mli test1.ml) - (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test1_runner.exe))) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd 
-vectorize-max-block-size 1000 -no-vectorize -o test1_runner.exe))) (rule (alias runtest) @@ -37,7 +37,7 @@ (enabled_if (= %{context_name} "main")) (targets test1_vectorized_runner.exe test1_vectorized.cmx.dump) (deps test1_vectorized.mli test1_vectorized.ml) - (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test1_vectorized_runner.exe))) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test1_vectorized_runner.exe))) (rule (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) @@ -75,12 +75,620 @@ (action (diff test1_vectorized.expected test1_vectorized.output))) +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_runner.exe test_arrays.cmx.dump) + (deps test_arrays.mli test_arrays.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_arrays_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays.output + (run ./test_arrays_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays.expected test_arrays.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.ml test_arrays_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.mli test_arrays_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_arrays_vectorized_runner.exe test_arrays_vectorized.cmx.dump) + (deps test_arrays_vectorized.mli test_arrays_vectorized.ml) + (action (run %{bin:ocamlopt.opt} 
%{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_arrays_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_arrays_vectorized.cmx.dump.output) + (deps ./filter.sh test_arrays_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_arrays_vectorized.cmx.dump.expected test_arrays_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_arrays_vectorized.output + (run ./test_arrays_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_arrays.expected test_arrays_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_arrays_vectorized.expected test_arrays_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_runner.exe test_int64_unboxed.cmx.dump) + (deps test_int64_unboxed.mli test_int64_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int64_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed.output + (run ./test_int64_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed.expected test_int64_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.ml 
test_int64_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.mli test_int64_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_unboxed_vectorized_runner.exe test_int64_unboxed_vectorized.cmx.dump) + (deps test_int64_unboxed_vectorized.mli test_int64_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int64_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_unboxed_vectorized.cmx.dump.expected test_int64_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_unboxed_vectorized.output + (run ./test_int64_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64_unboxed.expected test_int64_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_unboxed_vectorized.expected test_int64_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_runner.exe test_float_unboxed.cmx.dump) + (deps test_float_unboxed.mli test_float_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg 
-extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed.output + (run ./test_float_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_unboxed.expected test_float_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.ml test_float_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.mli test_float_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_unboxed_vectorized_runner.exe test_float_unboxed_vectorized.cmx.dump) + (deps test_float_unboxed_vectorized.mli test_float_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_unboxed_vectorized.cmx.dump.expected test_float_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_unboxed_vectorized.output + (run ./test_float_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float_unboxed.expected 
test_float_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_unboxed_vectorized.expected test_float_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_runner.exe test_int64.cmx.dump) + (deps test_int64.mli test_int64.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int64_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64.output + (run ./test_int64_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64.expected test_int64.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.ml test_int64_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.mli test_int64_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int64_vectorized_runner.exe test_int64_vectorized.cmx.dump) + (deps test_int64_vectorized.mli test_int64_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int64_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int64_vectorized.cmx.dump.output) + (deps ./filter.sh test_int64_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int64_vectorized.cmx.dump.expected 
test_int64_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int64_vectorized.output + (run ./test_int64_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int64.expected test_int64_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int64_vectorized.expected test_int64_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_runner.exe test_float.cmx.dump) + (deps test_float.mli test_float.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float.output + (run ./test_float_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float.expected test_float.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.ml test_float_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.mli test_float_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_float_vectorized_runner.exe test_float_vectorized.cmx.dump) + (deps test_float_vectorized.mli test_float_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float_vectorized.cmx.dump.output) + (deps 
./filter.sh test_float_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float_vectorized.cmx.dump.expected test_float_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_float_vectorized.output + (run ./test_float_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_float.expected test_float_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_float_vectorized.expected test_float_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (targets test_float32_unboxed_runner.exe test_float32_unboxed.cmx.dump) + (deps test_float32_unboxed.mli test_float32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_float32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (with-outputs-to + test_float32_unboxed.output + (run ./test_float32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float32_unboxed.expected test_float32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (copy test_float32_unboxed.ml test_float32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (copy test_float32_unboxed.mli 
test_float32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (targets test_float32_unboxed_vectorized_runner.exe test_float32_unboxed_vectorized.cmx.dump) + (deps test_float32_unboxed_vectorized.mli test_float32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_float32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_float32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_float32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float32_unboxed_vectorized.cmx.dump.expected test_float32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (with-outputs-to + test_float32_unboxed_vectorized.output + (run ./test_float32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (copy test_float32_unboxed.expected test_float32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_float32_unboxed_vectorized.expected test_float32_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_runner.exe test_int32_unboxed.cmx.dump) + (deps test_int32_unboxed.mli test_int32_unboxed.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear 
-dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_int32_unboxed_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed.output + (run ./test_int32_unboxed_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed.expected test_int32_unboxed.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.ml test_int32_unboxed_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_int32_unboxed.mli test_int32_unboxed_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_int32_unboxed_vectorized_runner.exe test_int32_unboxed_vectorized.cmx.dump) + (deps test_int32_unboxed_vectorized.mli test_int32_unboxed_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_int32_unboxed_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_int32_unboxed_vectorized.cmx.dump.output) + (deps ./filter.sh test_int32_unboxed_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_int32_unboxed_vectorized.cmx.dump.expected test_int32_unboxed_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_int32_unboxed_vectorized.output + (run ./test_int32_unboxed_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy 
test_int32_unboxed.expected test_int32_unboxed_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_int32_unboxed_vectorized.expected test_int32_unboxed_vectorized.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_spill_valx2_runner.exe test_spill_valx2.cmx.dump) + (deps test_spill_valx2.mli test_spill_valx2.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_spill_valx2_runner.exe))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_spill_valx2.output + (run ./test_spill_valx2_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_spill_valx2.expected test_spill_valx2.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_spill_valx2.ml test_spill_valx2_vectorized.ml))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_spill_valx2.mli test_spill_valx2_vectorized.mli))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (targets test_spill_valx2_vectorized_runner.exe test_spill_valx2_vectorized.cmx.dump) + (deps test_spill_valx2_vectorized.mli test_spill_valx2_vectorized.ml) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o test_spill_valx2_vectorized_runner.exe))) + +(rule + (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (target test_spill_valx2_vectorized.cmx.dump.output) + (deps ./filter.sh test_spill_valx2_vectorized.cmx.dump) + (action + (with-outputs-to + %{target} + (with-accepted-exit-codes 0 + (run %{deps}))))) + +(rule + (alias runtest) 
+ (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) + (action + (diff test_spill_valx2_vectorized.cmx.dump.expected test_spill_valx2_vectorized.cmx.dump.output))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (with-outputs-to + test_spill_valx2_vectorized.output + (run ./test_spill_valx2_vectorized_runner.exe)))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (copy test_spill_valx2.expected test_spill_valx2_vectorized.expected))) + +(rule + (alias runtest) + (enabled_if (= %{context_name} "main")) + (action + (diff test_spill_valx2_vectorized.expected test_spill_valx2_vectorized.output))) + (rule (alias runtest) (enabled_if (= %{context_name} "main")) (targets test_register_compatible_runner.exe test_register_compatible.cmx.dump) (deps test_register_compatible.mli test_register_compatible.ml) - (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -no-vectorize -o test_register_compatible_runner.exe))) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -no-vectorize -o test_register_compatible_runner.exe))) (rule (alias runtest) @@ -113,7 +721,7 @@ (enabled_if (= %{context_name} "main")) (targets test_register_compatible_vectorized_runner.exe test_register_compatible_vectorized.cmx.dump) (deps test_register_compatible_vectorized.mli test_register_compatible_vectorized.ml) - (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize -o test_register_compatible_vectorized_runner.exe))) + (action (run %{bin:ocamlopt.opt} %{deps} -S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc cfg -extension simd -vectorize-max-block-size 1000 -vectorize -o 
test_register_compatible_vectorized_runner.exe))) (rule (enabled_if (and (= %{context_name} "main") (= %{architecture} "amd64")) ) diff --git a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml index 53062d52b9..f6f4e7dc47 100644 --- a/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml +++ b/flambda-backend/tests/backend/vectorizer/gen/gen_dune.ml @@ -5,7 +5,7 @@ let enabled_if_main_amd64 = let flags = "-S -O3 -g -dump-into-file -dcfg -dvectorize -dsel -dlinear -dlive -regalloc \ - cfg -extension simd" + cfg -extension simd -vectorize-max-block-size 1000" let runner name = name ^ "_runner.exe" @@ -119,18 +119,15 @@ let filter_dump ~enabled_if ~exit_code name = (run %{deps}))))) |} -let copy_source_to_vectorize name = - copy_file ~enabled_if:enabled_if_main (name |> impl) - (name |> vectorized |> impl); - copy_file ~enabled_if:enabled_if_main (name |> intf) - (name |> vectorized |> intf) +let copy_source_to_vectorize ~enabled_if name = + copy_file ~enabled_if (name |> impl) (name |> vectorized |> impl); + copy_file ~enabled_if (name |> intf) (name |> vectorized |> intf) -let compile_no_vectorizer name = - compile ~enabled_if:enabled_if_main ~extra_flags:"-no-vectorize" name +let compile_no_vectorizer ~enabled_if name = + compile ~enabled_if ~extra_flags:"-no-vectorize" name -let compile_with_vectorizer name = - compile ~enabled_if:enabled_if_main ~extra_flags:"-vectorize" - (vectorized name) +let compile_with_vectorizer ~enabled_if name = + compile ~enabled_if ~extra_flags:"-vectorize" (vectorized name) let filter_vectorizer_dump ~enabled_if ~exit_code name = filter_dump ~enabled_if ~exit_code (name |> vectorized) @@ -138,38 +135,44 @@ let filter_vectorizer_dump ~enabled_if ~exit_code name = let diff_vectorizer_dump ~enabled_if name = diff_output ~enabled_if (name |> vectorized |> cmx_dump) -let run_no_vectorizer name = run ~enabled_if:enabled_if_main name +let run_no_vectorizer ~enabled_if name 
= run ~enabled_if name -let run_vectorized name = run ~enabled_if:enabled_if_main (name |> vectorized) +let run_vectorized ~enabled_if name = run ~enabled_if (name |> vectorized) -let diff_output_no_vectorizer name = - diff_output ~enabled_if:enabled_if_main name +let diff_output_no_vectorizer ~enabled_if name = diff_output ~enabled_if name -let diff_output_vectorized name = - diff_output ~enabled_if:enabled_if_main (name |> vectorized) +let diff_output_vectorized ~enabled_if name = + diff_output ~enabled_if (name |> vectorized) -let copy_expected_output name = - copy_file ~enabled_if:enabled_if_main (name |> expected) - (name |> vectorized |> expected) +let copy_expected_output ~enabled_if name = + copy_file ~enabled_if (name |> expected) (name |> vectorized |> expected) -let print_test ?(filter_exit_code = 0) name = +let print_test ?(enabled_if = enabled_if_main) ?(filter_exit_code = 0) name = (* check expected test output is up to date *) - compile_no_vectorizer name; - run_no_vectorizer name; - diff_output_no_vectorizer name; + compile_no_vectorizer ~enabled_if name; + run_no_vectorizer ~enabled_if name; + diff_output_no_vectorizer ~enabled_if name; (* vectorizer *) - copy_source_to_vectorize name; - compile_with_vectorizer name; + copy_source_to_vectorize ~enabled_if name; + compile_with_vectorizer ~enabled_if name; filter_vectorizer_dump name ~exit_code:filter_exit_code ~enabled_if:enabled_if_main_amd64; diff_vectorizer_dump name ~enabled_if:enabled_if_main_amd64; - run_vectorized name; - copy_expected_output name; - diff_output_vectorized name; + run_vectorized ~enabled_if name; + copy_expected_output ~enabled_if name; + diff_output_vectorized ~enabled_if name; () let () = print_test "test1"; + print_test "test_arrays"; + print_test "test_int64_unboxed"; + print_test "test_float_unboxed"; + print_test "test_int64"; + print_test "test_float"; + print_test ~enabled_if:enabled_if_main_amd64 "test_float32_unboxed"; + print_test "test_int32_unboxed"; + print_test 
"test_spill_valx2"; (* can't vectorize *) print_test ~filter_exit_code:1 "test_register_compatible"; () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.expected b/flambda-backend/tests/backend/vectorizer/test_arrays.expected new file mode 100644 index 0000000000..e86cd1806c --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.expected @@ -0,0 +1,9 @@ +add_arrays_unrolled_manually 17 18 19 20 21 22 23 24 25 26 +add_arrays_unrolled_safe 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec_unrolled_attribute 17 18 19 20 21 22 23 24 25 26 +add_arrays_for 17 18 19 20 21 22 23 24 25 26 +add_arrays_rec 17 18 19 20 21 22 23 24 25 26 +initialize_array_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_arrays_const_unrolled_manually 0 0 0 0 0 0 0 0 0 0 +initialize_array_unrolled_manually 17 17 17 17 17 17 17 17 17 17 +initialize_floatarray_unrolled_manually 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 7.700000 diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.ml b/flambda-backend/tests/backend/vectorizer/test_arrays.ml new file mode 100644 index 0000000000..106eaa5e5d --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.ml @@ -0,0 +1,141 @@ +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_manually + a b c n = + for i = 0 to (n / 2) - 1 do + Array.unsafe_set c (i * 2) + (Array.unsafe_get a (i * 2) + Array.unsafe_get b (i * 2)); + Array.unsafe_set c + ((i * 2) + 1) + (Array.unsafe_get a ((i * 2) + 1) + Array.unsafe_get b ((i * 2) + 1)) + done; + if Int.rem n 2 = 1 + then + Array.unsafe_set c (n - 1) + (Array.unsafe_get a (n - 1) + Array.unsafe_get b (n - 1)) + +(* Currently won't be vectorized. Can vectorize it but it's not worth it + according to our cost model. It will be vectorized when we add vectors beyond + 128 or arrays of elements smaller than 64-bit. 
*) +let[@inline never] [@local never] [@specialize never] initialize_array_const_unrolled_manually + arr n = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i 0; + Array.unsafe_set arr (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. If different groups can reuse the new + register that holds the constants, this will be worth vectorizing even with + 128-bit vectors. *) +let[@inline never] [@local never] [@specialize never] initialize_arrays_const_unrolled_manually + a b c n = + let i = ref 0 in + while !i < n do + Array.unsafe_set a !i 0; + Array.unsafe_set a (!i + 1) 0; + Array.unsafe_set b !i 0; + Array.unsafe_set b (!i + 1) 0; + Array.unsafe_set c !i 0; + Array.unsafe_set c (!i + 1) 0; + i := !i + 2 + done + +(* Currently, won't be vectorized. Shuffling values into a vector is not yet + supported, only vector loads are. Also not worth it unless the shuffle is + outside the loop (loop invariant detection/motion would be needed for it). *) +let[@inline never] [@local never] [@specialize never] initialize_array_unrolled_manually + arr n (v : int) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* same as [initialize_array_unrolled_manually] except needs movddup. *) +let[@inline never] [@local never] [@specialize never] initialize_floatarray_unrolled_manually + arr n (v : float) = + let i = ref 0 in + while !i < n do + Array.unsafe_set arr !i v; + Array.unsafe_set arr (!i + 1) v; + i := !i + 2 + done + +(* cannot vectorize across basic blocks *) +let[@inline never] [@local never] [@specialize never] add_arrays_unrolled_safe a + b c n = + for i = 0 to n - 1 do + Array.set c (i * 2) (Array.get a (i * 2) + Array.get b (i * 2)); + Array.set c + ((i * 2) + 1) + (Array.get a ((i * 2) + 1) + Array.get b ((i * 2) + 1)) + done + +(* cannot vectorize across basic blocks. 
unroll attribute is not sufficient to + eliminate the loop condition from the unrolled body (e.g., we would need to + track the fact that the bound is even). *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec_unrolled_attribute + a b c n = + let[@loop never] rec loop i a b c n = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + (loop [@unrolled 1]) (i + 1) a b c n) + in + loop 0 a b c (2 * n) + +(* cannot vectorize for-loops *) +let[@inline never] [@local never] [@specialize never] add_arrays_for a b c n = + for i = 0 to n - 1 do + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i) + done + +(* cannot vectorize loops expressed using recursion *) +let[@inline never] [@local never] [@specialize never] add_arrays_rec a b c n = + let rec loop i = + if i < n + then ( + Array.unsafe_set c i (Array.unsafe_get a i + Array.unsafe_get b i); + loop (i + 1)) + in + loop 0 + +let print_array ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%d " a.(i) + done + +let print_floatarray ppf a = + let count = Array.length a in + for i = 0 to count - 1 do + Format.fprintf ppf "%f " a.(i) + done + +let () = + let n = Sys.opaque_identity 10 in + let a = Array.init n (fun i -> i) in + let b = Array.make n 17 in + let c = Array.make n 0 in + let d = Array.make n 0.0 in + add_arrays_unrolled_manually a b c (Sys.opaque_identity n); + Format.printf "add_arrays_unrolled_manually %a\n" print_array c; + add_arrays_unrolled_safe a b c (Sys.opaque_identity (n / 2)); + Format.printf "add_arrays_unrolled_safe %a\n" print_array c; + add_arrays_rec_unrolled_attribute a b c (n / 2); + Format.printf "add_arrays_rec_unrolled_attribute %a\n" print_array c; + add_arrays_for a b c n; + Format.printf "add_arrays_for %a\n" print_array c; + add_arrays_rec a b c n; + Format.printf "add_arrays_rec %a\n" print_array c; + initialize_array_const_unrolled_manually c n; + Format.printf
"initialize_array_const_unrolled_manually %a\n" print_array c; + initialize_arrays_const_unrolled_manually a b c n; + Format.printf "initialize_arrays_const_unrolled_manually %a\n" print_array c; + initialize_array_unrolled_manually c n (Sys.opaque_identity 17); + Format.printf "initialize_array_unrolled_manually %a\n" print_array c; + initialize_floatarray_unrolled_manually d n (Sys.opaque_identity 7.7); + Format.printf "initialize_floatarray_unrolled_manually %a\n" print_floatarray + d; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays.mli b/flambda-backend/tests/backend/vectorizer/test_arrays.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..182c1cc730 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_arrays_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 7 vector instructions, cost = -1 (Test_arrays_vectorized.add_arrays_unrolled_manually) diff --git a/flambda-backend/tests/backend/vectorizer/test_float.expected b/flambda-backend/tests/backend/vectorizer/test_float.expected new file mode 100644 index 0000000000..00ffe66d5e --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +copy_mutable_record_fresh { d0 = 88.000000 ; d1 = 110.000000 } +add_mutable_record_t4 { d0 = 88.000000 ; d1 = 110.000000; d2 = 88.000000 ; d3 = 110.000000 } +copy_mutable_record_t4 { d0 = 8.000000 
; d1 = 96.000000; d2 = 80.000000 ; d3 = 14.000000 } +dup_mutable_record_t4 { d0 = 8.000000 ; d1 = 96.000000; d2 = 8.000000 ; d3 = 96.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float.ml b/flambda-backend/tests/backend/vectorizer/test_float.ml new file mode 100644 index 0000000000..1e36c686ce --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.ml @@ -0,0 +1,75 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +type t1 = + { mutable d0 : float; + mutable d1 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record + (a : t1) (b : t1) (c : t1) : t1 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record + (a : t1) (b : t1) : t1 = + b.d0 <- a.d0; + b.d1 <- a.d1; + b + +let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh + (a : t1) (b : t1) : t1 = + { d0 = Float.add a.d0 b.d0; d1 = Float.add a.d1 b.d1 } + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh + (a : t1) : t1 = + { d0 = a.d0; d1 = a.d1 } + +type t4 = + { mutable d0 : float; + mutable d1 : float; + mutable d2 : float; + mutable d3 : float + } + +let[@inline never] [@local never] [@specialize never] add_mutable_record_t4 + (a : t1) (b : t1) (c : t4) : t4 = + c.d0 <- Float.add a.d0 b.d0; + c.d1 <- Float.add a.d1 b.d1; + c.d2 <- Float.add a.d0 b.d0; + c.d3 <- Float.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4 + (a : t1) (b : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 } + +let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4 + (a : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 } + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f }" t1.d0 t1.d1 + +let print_t4 ppf (t4 : t4) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" t4.d0 t4.d1 + t4.d2 t4.d3 + +let () = + let 
a = { d0 = 8.; d1 = 96. } in + let b = { d0 = 80.; d1 = 14. } in + let c = { d0 = 10.; d1 = -10. } in + let t4 = { d0 = 10.; d1 = -10.; d2 = 199.; d3 = 18. } in + let res = { d0 = 0.; d1 = -0. } in + Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c); + Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res); + Format.printf "add_mutable_record_fresh %a\n" print_t1 + (add_mutable_record_fresh a b); + Format.printf "copy_mutable_record_fresh %a\n" print_t1 + (copy_mutable_record_fresh c); + Format.printf "add_mutable_record_t4 %a\n" print_t4 + (add_mutable_record_t4 a b t4); + Format.printf "copy_mutable_record_t4 %a\n" print_t4 + (copy_mutable_record_t4 a b); + Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a); + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float.mli b/flambda-backend/tests/backend/vectorizer/test_float.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected new file mode 100644 index 0000000000..92c4b798f9 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.expected @@ -0,0 +1,5 @@ +add_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_unboxed_pairs_mutable_record { d0 = 88. ; d1 = 110.; d2 = 0. ; d3 = -1. } +copy_bytes 10. 10. 10. 10. +copy_bytes_pos 10. 10. 10. 10. +copy_bytes_pos_v2 10. 10. 10. 10. 
diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml new file mode 100644 index 0000000000..ea552f169e --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.ml @@ -0,0 +1,225 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float32 = struct + type t = float32 + + external add : (t[@local_opt]) -> (t[@local_opt]) -> (t[@local_opt]) + = "%addfloat32" + + external format : string -> t -> string = "caml_format_float32" + + let to_string f = Stdlib.valid_float_lexem (format "%.9g" f) + + module Bytes = struct + external get : bytes -> pos:int -> float32 = "%caml_bytes_getf32" + external unsafe_get : bytes -> pos:int -> float32 = "%caml_bytes_getf32u" + external set : bytes -> pos:int -> float32 -> unit = "%caml_bytes_setf32" + + external unsafe_set : bytes -> pos:int -> float32 -> unit + = "%caml_bytes_setf32u" + end +end + +module Float32_u = struct + type t = float32# + + external to_float32 : t -> (float32[@local_opt]) = "%box_float32" [@@warning "-187"] + + external of_float32 : (float32[@local_opt]) -> t = "%unbox_float32" [@@warning "-187"] + + let[@inline always] add x y = of_float32 (Float32.add (to_float32 x) (to_float32 y)) + + module Bytes = struct + let get bytes ~pos = of_float32 (Float32.Bytes.get bytes ~pos) + let unsafe_get bytes ~pos = of_float32 (Float32.Bytes.unsafe_get bytes ~pos) + let set bytes ~pos x = Float32.Bytes.set bytes ~pos (to_float32 x) + let unsafe_set bytes ~pos x = Float32.Bytes.unsafe_set bytes ~pos (to_float32 x) + end +end + +type t1 = { mutable d0 : float32# ; + mutable d1: float32#; mutable d2: float32#; mutable d3: float32# } + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. 
*) +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + b.d2 <- a.d2; + b.d3 <- a.d3; + () + +(* Not vectorized because float32 fields are not adjacent in a record, they are padded +to 64-bits. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float32_u.add a.d0 b.d0; + c.d1 <- Float32_u.add a.d1 b.d1; + c.d2 <- Float32_u.add a.d2 b.d2; + c.d3 <- Float32_u.add a.d3 b.d3; + c + +(* [Float32_u.Bytes] contain packed float32_u, can vectorize. *) +let[@inline never] [@local never] [@specialize never] copy_bytes a b = + let pos = 0 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline never] [@local never] [@specialize never] init_bytes b x = + let pos = 0 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + let pos = pos + 4 in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +let[@inline always] copy_float32_unboxed_pos a b ~pos = + let x = Float32_u.Bytes.unsafe_get a ~pos in + Float32_u.Bytes.unsafe_set b ~pos x; + () + +(* Currently can't vectorize because [pos] untagging is repeated and the current + heuristic for detecting relations between pointers is not strong enough to + handle this case. 
*) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos a b pos = + copy_float32_unboxed_pos a b ~pos; + copy_float32_unboxed_pos a b ~pos:(pos+1*4); + copy_float32_unboxed_pos a b ~pos:(pos+2*4); + copy_float32_unboxed_pos a b ~pos:(pos+3*4); + () + +(* 128: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) I/67 := pos:I/63 + * (id:12) I/67 := I/67 + 8 + * (id:13) prim:I/68 := I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) I/71 := pos:I/63 + * (id:19) I/71 := I/71 + 16 + * (id:20) prim:I/72 := I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := pos:I/63 + * (id:26) I/75 := I/75 + 24 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + +(* Currently, can't vectorize because the index is untagged before every memory access, + instead of operating on untagged indexes throughout. 
*) +let[@inline never] [@local never] [@specialize never] copy_bytes_pos_v2 a b pos = + let i0 = pos in + copy_float32_unboxed_pos a b ~pos:i0; + let i1 = i0 + 4 in + copy_float32_unboxed_pos a b ~pos:i1; + let i2 = i1 + 4 in + copy_float32_unboxed_pos a b ~pos:i2; + let i3 = i2 + 4 in + copy_float32_unboxed_pos a b ~pos:i3; + () + +(* 177: + * (id:3) a:V/61 := R:I/0[%rax] + * (id:4) b:V/62 := R:I/1[%rbx] + * (id:5) pos:I/63 := R:I/2[%rdi] + * (id:6) prim:I/64 := pos:I/63 + * (id:7) prim:I/64 := prim:I/64 >>s 1 + * (id:8) S/65 := float32 mut[a:V/61 + prim:I/64] + * (id:9) float32[b:V/62 + prim:I/64] := S/65 (assign) + * (id:10) Pbytes_set_f32:I/66 := 1 + * (id:11) i1:I/67 := pos:I/63 + * (id:12) i1:I/67 := i1:I/67 + 8 + * (id:13) prim:I/68 := i1:I/67 + * (id:14) prim:I/68 := prim:I/68 >>s 1 + * (id:15) S/69 := float32 mut[a:V/61 + prim:I/68] + * (id:16) float32[b:V/62 + prim:I/68] := S/69 (assign) + * (id:17) Pbytes_set_f32:I/70 := 1 + * (id:18) i2:I/71 := i1:I/67 + * (id:19) i2:I/71 := i2:I/71 + 8 + * (id:20) prim:I/72 := i2:I/71 + * (id:21) prim:I/72 := prim:I/72 >>s 1 + * (id:22) S/73 := float32 mut[a:V/61 + prim:I/72] + * (id:23) float32[b:V/62 + prim:I/72] := S/73 (assign) + * (id:24) Pbytes_set_f32:I/74 := 1 + * (id:25) I/75 := i2:I/71 + * (id:26) I/75 := I/75 + 8 + * (id:27) prim:I/76 := I/75 + * (id:28) prim:I/76 := prim:I/76 >>s 1 + * (id:29) S/77 := float32 mut[a:V/61 + prim:I/76] + * (id:30) float32[b:V/62 + prim:I/76] := S/77 (assign) + * (id:31) Pbytes_set_f32:I/78 := 1 + * (id:32) I/79 := 1 + * (id:33) R:I/0[%rax] := I/79 + * (id:34) Return R:I/0[%rax] *) + + +let print_t1 ppf (t1 : t1) = + (* CR gyorsh: how to print Float32? 
*) + let to_string f = (Float32_u.to_float32 f |> Float32.to_string) in + Format.fprintf ppf "{ d0 = %s ; d1 = %s; d2 = %s ; d3 = %s }" + (to_string t1.d0) + (to_string t1.d1) + (to_string t1.d2) + (to_string t1.d3) + +let create_s length = + String.init length (fun i -> i * 7 mod 256 |> char_of_int) +;; + +let create_b length = create_s length |> Bytes.of_string + +let print_b ~len ppf b = + for i = 0 to len-1 do + Format.fprintf ppf "%s " + (Float32_u.to_float32 (Float32_u.Bytes.get b ~pos:(i*4)) |> Float32.to_string) + done + +let () = + let a = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.5s } in + let b = { d0 = #80.s; d1 = #14.s; d2 = #0.s; d3 = -#0.5s } in + let c = { d0 = #8.s; d1 = #96.s; d2 = #0.s; d3 = -#0.s } in + let res = { d0 = #0.s; d1 = -#10.s; d2 = #1.s; d3 = -#1.s } in + Format.printf "add_unboxed_pairs_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_unboxed_pairs_mutable_record %a\n" print_t1 res; + let b1 = create_b 16 in + let b2 = create_b 16 in + init_bytes b1 #10.s; + init_bytes b2 #0.s; + copy_bytes b1 b2; + Format.printf "copy_bytes %a\n" (print_b ~len:4) b2; + copy_bytes_pos b2 b1 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos %a\n" (print_b ~len:4) b2; + copy_bytes_pos_v2 b1 b2 (Sys.opaque_identity 0); + Format.printf "copy_bytes_pos_v2 %a\n" (print_b ~len:4) b2; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 
0000000000..3178ac03fb --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float32_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_float32_unboxed_vectorized.copy_bytes) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected new file mode 100644 index 0000000000..bfea42ed76 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.expected @@ -0,0 +1,2 @@ +add_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 0.000000 ; d3 = -1.000000 } +copy_mutable_record { d0 = 88.000000 ; d1 = 110.000000; d2 = 1.000000 ; d3 = -1.000000 } diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml new file mode 100644 index 0000000000..a49aaf0b84 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.ml @@ -0,0 +1,80 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Float_u = struct + type t = float# + + external to_float : t -> (float[@local_opt]) = "%box_float" [@@warning "-187"] + + external of_float : (float[@local_opt]) -> t = "%unbox_float" [@@warning "-187"] + + let[@inline always] add x y = of_float (Float.add (to_float x) (to_float y)) +end + +type t1 = { mutable d0: float#; + mutable d1: float#; + mutable d2: float#; + mutable d3: float# + } + + +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + () + +(* Currently, can't vectorize because of the specific floatmem operation (looks like + it is treated overly conservatively). NOTE(review): the expected vectorizer dump for this test lists [add_mutable_record] as vectorized; confirm whether this comment is stale.
*) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Float_u.add a.d0 b.d0; + c.d1 <- Float_u.add a.d1 b.d1; + c.d2 <- Float_u.add a.d2 b.d2; + c.d3 <- Float_u.add a.d3 b.d3; + c + +(* +102: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) c:V/63 := R:I/2[%rdi] +(id:6) F/64 := float64 mut[a:V/61] +(id:7) F/65 := F/64 +(id:8) F/65 := F/65 +f float64[b:V/62] +(id:9) float64[c:V/63] := F/65 (assign) +(id:10) Psetufloatfield:I/66 := 1 +(id:11) F/67 := float64 mut[a:V/61 + 8] +(id:12) F/68 := F/67 +(id:13) F/68 := F/68 +f float64[b:V/62 + 8] +(id:14) float64[c:V/63 + 8] := F/68 (assign) +(id:15) Psetufloatfield:I/69 := 1 +(id:16) F/70 := float64 mut[a:V/61 + 16] +(id:17) F/71 := F/70 +(id:18) F/71 := F/71 +f float64[b:V/62 + 16] +(id:19) float64[c:V/63 + 16] := F/71 (assign) +(id:20) Psetufloatfield:I/72 := 1 +(id:21) F/73 := float64 mut[a:V/61 + 24] +(id:22) F/74 := F/73 +(id:23) F/74 := F/74 +f float64[b:V/62 + 24] +(id:24) float64[c:V/63 + 24] := F/74 (assign) +(id:25) Psetufloatfield:I/75 := 1 +(id:26) R:I/0[%rax] := c:V/63 +(id:27) Return R:I/0[%rax] + +*) + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %f ; d1 = %f; d2 = %f ; d3 = %f }" + (Float_u.to_float t1.d0) + (Float_u.to_float t1.d1) + (Float_u.to_float t1.d2) + (Float_u.to_float t1.d3) + +let () = + let a = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0.5 } in + let b = { d0 = #80.; d1 = #14.; d2 = #0.; d3 = -#0.5 } in + let c = { d0 = #8.; d1 = #96.; d2 = #0.; d3 = -#0. } in + let res = { d0 = #0.; d1 = -#10.; d2 = #1.; d3 = -#1. 
} in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_mutable_record %a\n" print_t1 res; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..357dba19d9 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,2 @@ +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_unboxed_vectorized.copy_mutable_record) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_unboxed_vectorized.add_mutable_record) diff --git a/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..dc48684873 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_float_vectorized.cmx.dump.expected @@ -0,0 +1,7 @@ +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 (Test_float_vectorized.add_mutable_record) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 5 vector instructions, cost = -3 
(Test_float_vectorized.add_mutable_record_fresh) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_float_vectorized.copy_mutable_record_fresh) +**** Vectorize selected computation: 8 groups, 16 scalar instructions, 10 vector instructions, cost = -6 (Test_float_vectorized.add_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.copy_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_float_vectorized.dup_mutable_record_t4) diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected new file mode 100644 index 0000000000..0207ed6b91 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88 ; d1 = 110; d2 = -40 ; d3 = -100 } +copy_array_four 30 30 30 30 +copy_array_index_four 30 30 30 30 +add_array_from_start 60 60 60 60 +copy_array_index_from_start 60 60 60 60 +copy_array_from_start 60 60 60 60 +copy_array_from_start_v2 60 60 60 60 diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml new file mode 100644 index 0000000000..b45eaa5776 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.ml @@ -0,0 +1,229 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Int32_u = struct + type t = int32# + + external to_int32 : t -> (int32[@local_opt]) = "%box_int32" [@@warning "-187"] + + external of_int32 : (int32[@local_opt]) -> t = "%unbox_int32" [@@warning "-187"] + + let[@inline always] add x y = of_int32 (Int32.add (to_int32 x) (to_int32 y)) + + module Array = struct + external unsafe_create : ('a : bits32). 
int -> 'a array = + "caml_make_unboxed_int32_vect_bytecode" "caml_make_unboxed_int32_vect" + external unsafe_get: ('a : bits32). 'a array -> int -> 'a = "%array_unsafe_get" + external unsafe_set: ('a : bits32). 'a array -> int -> 'a -> unit = "%array_unsafe_set" + + module Index = struct + external unsafe_get + : ('a : bits32). + ('a array) -> t -> 'a + = "%array_unsafe_get_indexed_by_int32#" + + external unsafe_set + : ('a : bits32). + 'a array -> t -> 'a -> unit + = "%array_unsafe_set_indexed_by_int32#" + end + end + +end + +type t1 = { mutable d0 : int32# ; mutable d1: int32#; mutable d2: int32#; mutable d3: int32# } + +(* Currently, can't vectorize because not adjacent and have an unnecessary sign extension. *) +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Int32_u.add a.d0 b.d0; + c.d1 <- Int32_u.add a.d1 b.d1; + c.d2 <- Int32_u.add a.d2 b.d2; + c.d3 <- Int32_u.add a.d3 b.d3; + c + +let[@inline always] copy_array_one (a : Int32_u.t array) + (b : Int32_u.t array) pos = + let x = Int32_u.Array.unsafe_get a pos in + Int32_u.Array.unsafe_set b pos x + +(* The accesses are adjacent but the use of [int] typed index results in a convoluted + index computation that is not yet handled by the current heuristics. 
*) +let[@inline never] [@local never][@specialize never] copy_array_four (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + copy_array_one a b pos; + copy_array_one a b (pos+1); + copy_array_one a b (pos+2); + copy_array_one a b (pos+3); + () + +(* + +114: +(id:3) a:V/61 := R:I/0[%rax] +(id:4) b:V/62 := R:I/1[%rbx] +(id:5) pos:I/63 := R:I/2[%rdi] +(id:6) new_value:I/64 := signed int32 mut[a:V/61 + pos:I/63 * 2 + 6] +(id:7) signed int32[b:V/62 + pos:I/63 * 2 + 6] := new_value:I/64 (assign) +(id:8) Parraysetu:I/65 := 1 +(id:9) Paddint:I/66 := pos:I/63 +(id:10) Paddint:I/66 := Paddint:I/66 + 2 +(id:11) new_value:I/67 := signed int32 mut[a:V/61 + Paddint:I/66 * 2 + 6] +(id:12) signed int32[b:V/62 + Paddint:I/66 * 2 + 6] := new_value:I/67 (assign) +(id:13) Parraysetu:I/68 := 1 +(id:14) Paddint:I/69 := pos:I/63 +(id:15) Paddint:I/69 := Paddint:I/69 + 4 +(id:16) new_value:I/70 := signed int32 mut[a:V/61 + Paddint:I/69 * 2 + 6] +(id:17) signed int32[b:V/62 + Paddint:I/69 * 2 + 6] := new_value:I/70 (assign) +(id:18) Parraysetu:I/71 := 1 +(id:19) Paddint:I/72 := pos:I/63 +(id:20) Paddint:I/72 := Paddint:I/72 + 6 +(id:21) new_value:I/73 := signed int32 mut[a:V/61 + Paddint:I/72 * 2 + 6] +(id:22) signed int32[b:V/62 + Paddint:I/72 * 2 + 6] := new_value:I/73 (assign) +(id:23) Parraysetu:I/74 := 1 +(id:24) I/75 := 1 +(id:25) R:I/0[%rax] := I/75 +(id:26) Return R:I/0[%rax] + +*) + +let[@inline never] [@local never][@specialize never] copy_array_four_v2 (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + let i0 = pos in + copy_array_one a b i0; + let i1 = i0 + 1 in + copy_array_one a b i1; + let i2 = i1 + 1 in + copy_array_one a b i2; + let i3 = i2 + 1 in + copy_array_one a b i3; + () + +let[@inline always] copy_array_index_one (a : Int32_u.t array) + (b : Int32_u.t array) (pos : Int32_u.t) = + let x = Int32_u.Array.Index.unsafe_get a pos in + Int32_u.Array.Index.unsafe_set b pos x + +(* Can't vectorize it! 
The accesses are adjacent and we use [Int32_u.t] as index, + but the compiler tags the index before using it! This index computation is not + yet handled by the vectorizer's heuristics. *) +let[@inline never] [@local never][@specialize never] copy_array_index_four (a : Int32_u.t array) + (b : Int32_u.t array) ~pos = + copy_array_index_one a b pos; + copy_array_index_one a b (Int32_u.add pos #1l); + copy_array_index_one a b (Int32_u.add pos #2l); + copy_array_index_one a b (Int32_u.add pos #3l); + () + +let[@inline never] [@local never][@specialize never] copy_array_index_from_start (a : Int32_u.t array) + (b : Int32_u.t array) = + let pos = #0l in + copy_array_index_one a b pos; + copy_array_index_one a b (Int32_u.add pos #1l); + copy_array_index_one a b (Int32_u.add pos #2l); + copy_array_index_one a b (Int32_u.add pos #3l); + () + + let[@inline never] [@local never][@specialize never] copy_array_from_start (a : Int32_u.t array) + (b : Int32_u.t array) = + let[@inline always] copy pos = + let x = Int32_u.Array.unsafe_get a pos in + Int32_u.Array.unsafe_set b pos x + in + let pos = 0 in + copy pos; + copy (pos+1); + copy (pos+2); + copy (pos+3); + () + +(* Can't vectorize because of an unnecessary sign extension. The heuristics in the + vectorizer can be extended to handle this case. 
*) +let[@inline never] [@local never][@specialize never] add_array_from_start (a : Int32_u.t array) (b : Int32_u.t array) = + let[@inline always] add pos = + let x = Int32_u.Array.unsafe_get a pos in + let y = Int32_u.Array.unsafe_get b pos in + Int32_u.Array.unsafe_set b pos (Int32_u.add x y) + in + let pos = 0 in + add pos; + add (pos+1); + add (pos+2); + add (pos+3); + () + +(* +camlTest7__add_array_from_start_7_22_code(R:I/0[%rax] R:I/1[%rbx]) {test7.ml:112,74-379} + a:V/61 := R:I/0[%rax] + b:V/62 := R:I/1[%rbx] + I/63 := signed int32 mut[b:V/62 + 8]{test7.ml:119,2-9;test7.ml:115,12-42} + I/64 := signed int32 mut[a:V/61 + 8]{test7.ml:119,2-9;test7.ml:114,12-42} + I/65 := I/64 + I/65 := I/65 + I/63{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/66 := sextend32 I/65{test7.ml:119,2-9;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 8] := new_value:I/66 (assign){test7.ml:119,2-9;test7.ml:116,4-52} + Parraysetu:I/67 := 1 + I/68 := signed int32 mut[b:V/62 + 12]{test7.ml:120,2-13;test7.ml:115,12-42} + I/69 := signed int32 mut[a:V/61 + 12]{test7.ml:120,2-13;test7.ml:114,12-42} + I/70 := I/69 + I/70 := I/70 + I/68{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/71 := sextend32 I/70{test7.ml:120,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 12] := new_value:I/71 (assign){test7.ml:120,2-13;test7.ml:116,4-52} + Parraysetu:I/72 := 1 + I/73 := signed int32 mut[b:V/62 + 16]{test7.ml:121,2-13;test7.ml:115,12-42} + I/74 := signed int32 mut[a:V/61 + 16]{test7.ml:121,2-13;test7.ml:114,12-42} + I/75 := I/74 + I/75 := I/75 + I/73{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/76 := sextend32 I/75{test7.ml:121,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 16] := new_value:I/76 (assign){test7.ml:121,2-13;test7.ml:116,4-52} + Parraysetu:I/77 := 1 + I/78 := signed int32 mut[b:V/62 + 20]{test7.ml:122,2-13;test7.ml:115,12-42} + I/79 := signed int32 mut[a:V/61 + 
20]{test7.ml:122,2-13;test7.ml:114,12-42} + I/80 := I/79 + I/80 := I/80 + I/78{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + new_value:I/81 := sextend32 I/80{test7.ml:122,2-13;test7.ml:116,35-52;test7.ml:10,41-78} + signed int32[b:V/62 + 20] := new_value:I/81 (assign){test7.ml:122,2-13;test7.ml:116,4-52} + Parraysetu:I/82 := 1 + I/83 := 1 + R:I/0[%rax] := I/83 + return R:I/0[%rax] +*) +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %ld ; d1 = %ld; d2 = %ld ; d3 = %ld }" + (Int32_u.to_int32 t1.d0) + (Int32_u.to_int32 t1.d1) + (Int32_u.to_int32 t1.d2) + (Int32_u.to_int32 t1.d3) + +let print_array ~len ppf ( a : Int32_u.t array)= + for i = 0 to len - 1 do + let x = Int32_u.Array.unsafe_get a i in + Format.fprintf ppf "%ld " (x |> Int32_u.to_int32) + done + +let create_array ~len ~init = + let arr = Int32_u.Array.unsafe_create len in + for i = 0 to len-1 do + Int32_u.Array.unsafe_set arr i init + done; + arr + +let () = + let a = { d0 = #8l; d1 = #96l; d2 = -#10l; d3 = #0l } in + let b = { d0 = #80l; d1 = #14l; d2 = -#30l; d3 = -#100l } in + let c = { d0 = #8l; d1 = #96l; d2 = #0l; d3 = #0l } in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + let ar1 = create_array ~len:4 ~init:#30l in + let ar2 = create_array ~len:4 ~init:#0l in + copy_array_four ar1 ar2 ~pos:0; + Format.printf "copy_array_four %a\n" (print_array ~len:4) ar2; + copy_array_index_four ar2 ar1 ~pos:#0l; + Format.printf "copy_array_index_four %a\n" (print_array ~len:4) ar1; + add_array_from_start ar1 ar2; + Format.printf "add_array_from_start %a\n" (print_array ~len:4) ar2; + copy_array_index_from_start ar2 ar1; + Format.printf "copy_array_index_from_start %a\n" (print_array ~len:4) ar1; + copy_array_from_start ar1 ar2; + Format.printf "copy_array_from_start %a\n" (print_array ~len:4) ar2; + copy_array_four_v2 ar1 ar2 ~pos:0; + Format.printf "copy_array_from_start_v2 %a\n" (print_array ~len:4) ar2; + () diff --git 
a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..fef3d590f8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int32_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_four_v2) +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_index_from_start) +**** Vectorize selected computation: 2 groups, 8 scalar instructions, 2 vector instructions, cost = -6 (Test_int32_unboxed_vectorized.copy_array_from_start) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.expected b/flambda-backend/tests/backend/vectorizer/test_int64.expected new file mode 100644 index 0000000000..21d3934339 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.expected @@ -0,0 +1,7 @@ +add_mutable_record { d0 = 88 ; d1 = 110 } +copy_mutable_record { d0 = 88 ; d1 = 110 } +add_mutable_record_fresh { d0 = 88 ; d1 = 110 } +copy_mutable_record_fresh { d0 = 88 ; d1 = 110 } +add_mutable_record_t4 { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 } +copy_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 80 ; d3 = 14 } +dup_mutable_record_t4 { d0 = 8 ; d1 = 96; d2 = 8 ; d3 = 96 } diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.ml b/flambda-backend/tests/backend/vectorizer/test_int64.ml new file mode 
100644 index 0000000000..95603dd777 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.ml @@ -0,0 +1,79 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +type t1 = + { mutable d0 : int64; + mutable d1 : int64 + } + +(* Can't vectorize because int64 are boxed. *) +let[@inline never] [@local never] [@specialize never] add_mutable_record + (a : t1) (b : t1) (c : t1) : t1 = + c.d0 <- Int64.add a.d0 b.d0; + c.d1 <- Int64.add a.d1 b.d1; + c + +(* Can't vectorize because memory write requires [caml_modify]. *) +let[@inline never] [@local never] [@specialize never] copy_mutable_record + (a : t1) (b : t1) : t1 = + b.d0 <- a.d0; + b.d1 <- a.d1; + b + +(* Can't vectorize because int64 are boxed *) +let[@inline never] [@local never] [@specialize never] add_mutable_record_fresh + (a : t1) (b : t1) : t1 = + { d0 = Int64.add a.d0 b.d0; d1 = Int64.add a.d1 b.d1 } + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_fresh + (a : t1) : t1 = + { d0 = a.d0; d1 = a.d1 } + +type t4 = + { mutable d0 : int64; + mutable d1 : int64; + mutable d2 : int64; + mutable d3 : int64 + } + +(* Can't vectorize because int64 are boxed. 
*) +let[@inline never] [@local never] [@specialize never] add_mutable_record_t4 + (a : t1) (b : t1) (c : t4) : t4 = + c.d0 <- Int64.add a.d0 b.d0; + c.d1 <- Int64.add a.d1 b.d1; + c.d2 <- Int64.add a.d0 b.d0; + c.d3 <- Int64.add a.d1 b.d1; + c + +let[@inline never] [@local never] [@specialize never] copy_mutable_record_t4 + (a : t1) (b : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = b.d0; d3 = b.d1 } + +let[@inline never] [@local never] [@specialize never] dup_mutable_record_t4 + (a : t1) : t4 = + { d0 = a.d0; d1 = a.d1; d2 = a.d0; d3 = a.d1 } + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" t1.d0 t1.d1 + +let print_t4 ppf (t4 : t4) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" t4.d0 t4.d1 + t4.d2 t4.d3 + +let () = + let a = { d0 = 8L; d1 = 96L } in + let b = { d0 = 80L; d1 = 14L } in + let c = { d0 = 10L; d1 = -10L } in + let t4 = { d0 = 10L; d1 = -10L; d2 = 199L; d3 = 18L } in + let res = { d0 = 0L; d1 = -0L } in + Format.printf "add_mutable_record %a\n" print_t1 (add_mutable_record a b c); + Format.printf "copy_mutable_record %a\n" print_t1 (copy_mutable_record c res); + Format.printf "add_mutable_record_fresh %a\n" print_t1 + (add_mutable_record_fresh a b); + Format.printf "copy_mutable_record_fresh %a\n" print_t1 + (copy_mutable_record_fresh c); + Format.printf "add_mutable_record_t4 %a\n" print_t4 + (add_mutable_record_t4 a b t4); + Format.printf "copy_mutable_record_t4 %a\n" print_t4 + (copy_mutable_record_t4 a b); + Format.printf "dup_mutable_record_t4 %a\n" print_t4 (dup_mutable_record_t4 a); + () diff --git a/flambda-backend/tests/backend/vectorizer/test_int64.mli b/flambda-backend/tests/backend/vectorizer/test_int64.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected 
b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected new file mode 100644 index 0000000000..68b6515c90 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.expected @@ -0,0 +1,3 @@ +add_mutable_record { d0 = 88 ; d1 = 110 } +copy_mutable_record { d0 = 88 ; d1 = 110 } +add_fours_mutable_record { d0 = 88 ; d1 = 110; d2 = 88 ; d3 = 110 } diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml new file mode 100644 index 0000000000..d9371e65e8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.ml @@ -0,0 +1,61 @@ +[@@@ocaml.warnerror "+a-40-41-42"] + +module Int64_u = struct + type t = int64# + + external to_int64 : t -> (int64[@local_opt]) = "%box_int64" [@@warning "-187"] + + external of_int64 : (int64[@local_opt]) -> t = "%unbox_int64" [@@warning "-187"] + + let[@inline always] add x y = of_int64 (Int64.add (to_int64 x) (to_int64 y)) +end + +type t1 = { mutable d0 : int64# ; mutable d1: int64# } + +let[@inline never] [@local never][@specialize never] add_mutable_record (a : t1) (b: t1) (c : t1) : t1 = + c.d0 <- Int64_u.add a.d0 b.d0; + c.d1 <- Int64_u.add a.d1 b.d1; + c + +let[@inline never] [@local never][@specialize never] copy_mutable_record (a : t1) (b: t1) : unit = + b.d0 <- a.d0; + b.d1 <- a.d1; + () + +type t2 = { + mutable d0 : int64# ; + mutable d1: int64# ; + mutable d2: int64# ; + mutable d3: int64# } + +let[@inline never] [@local never][@specialize never] add_fours_mutable_record (a : t1) (b: t1) (c : t2) : unit = + c.d0 <- Int64_u.add a.d0 b.d0; + c.d1 <- Int64_u.add a.d1 b.d1; + c.d2 <- Int64_u.add a.d0 b.d0; + c.d3 <- Int64_u.add a.d1 b.d1; + () + +let print_t1 ppf (t1 : t1) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld }" (Int64_u.to_int64 t1.d0) + (Int64_u.to_int64 t1.d1) + +let print_t4 ppf (t2 : t2) = + Format.fprintf ppf "{ d0 = %Ld ; d1 = %Ld; d2 = %Ld ; d3 = %Ld }" + (Int64_u.to_int64 
t2.d0) + (Int64_u.to_int64 t2.d1) + (Int64_u.to_int64 t2.d2) + (Int64_u.to_int64 t2.d3) + +let () = + let a = { d0 = #8L; d1 = #96L } in + let b = { d0 = #80L; d1 = #14L } in + let c = { d0 = #8L; d1 = #96L } in + let d = { d0 = #0L; d1 = #0L; d2 = #0L; d3 = #0L } in + let res = { d0 = #0L; d1 = -#10L } in + Format.printf "add_mutable_record %a\n" print_t1 + (add_mutable_record a b c); + copy_mutable_record c res; + Format.printf "copy_mutable_record %a\n" print_t1 res; + add_fours_mutable_record a b d; + Format.printf "add_fours_mutable_record %a\n" print_t4 d; + () diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..61eea8dffc --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_unboxed_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 5 groups, 10 scalar instructions, 5 vector instructions, cost = -5 (Test_int64_unboxed_vectorized.add_mutable_record) +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_unboxed_vectorized.copy_mutable_record) +**** Vectorize selected computation: 10 groups, 20 scalar instructions, 10 vector instructions, cost = -10 (Test_int64_unboxed_vectorized.add_fours_mutable_record) diff --git a/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected new file mode 100644 index 
0000000000..6db1b67d70 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_int64_vectorized.cmx.dump.expected @@ -0,0 +1,3 @@ +**** Vectorize selected computation: 2 groups, 4 scalar instructions, 2 vector instructions, cost = -2 (Test_int64_vectorized.copy_mutable_record_fresh) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.copy_mutable_record_t4) +**** Vectorize selected computation: 4 groups, 8 scalar instructions, 4 vector instructions, cost = -4 (Test_int64_vectorized.dup_mutable_record_t4) diff --git a/flambda-backend/tests/backend/vectorizer/examples.expected b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.expected similarity index 100% rename from flambda-backend/tests/backend/vectorizer/examples.expected rename to flambda-backend/tests/backend/vectorizer/test_spill_valx2.expected diff --git a/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml new file mode 100644 index 0000000000..2120c1fe2a --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.ml @@ -0,0 +1,709 @@ +(* Test that spilled registers of type [Valx2] are correctly registered with the + GC. + + Need at least 16 registers of 128 bit to trigger the spill on amd64. + + Allocate enough to defeat comballoc that moves allocations to the beginning + of the block and out of the live range of the register that this test is + aiming to spill. Raise [vectorize-max-block-size] to force the resulting very + long block to be vectorized. 
*) +type s = + { mutable f0 : int64; + mutable f1 : int64; + mutable f2 : int64; + mutable f3 : int64; + mutable f4 : int64; + mutable f5 : int64; + mutable f6 : int64; + mutable f7 : int64; + mutable f8 : int64; + mutable f9 : int64; + mutable f10 : int64; + mutable f11 : int64; + mutable f12 : int64; + mutable f13 : int64; + mutable f14 : int64; + mutable f15 : int64; + mutable f16 : int64; + mutable f17 : int64; + mutable f18 : int64; + mutable f19 : int64; + mutable f20 : int64; + mutable f21 : int64; + mutable f22 : int64; + mutable f23 : int64; + mutable f24 : int64; + mutable f25 : int64; + mutable f26 : int64; + mutable f27 : int64; + mutable f28 : int64; + mutable f29 : int64; + mutable f30 : int64; + mutable f31 : int64; + mutable f32 : int64; + mutable f33 : int64; + mutable f34 : int64; + mutable f35 : int64 + } + +let ( + ) = Int64.add + +let[@inline never] [@local never] foo a = + let f0 = a.f0 in + let f1 = a.f1 in + let f2 = a.f2 in + let f3 = a.f3 in + let f4 = a.f4 in + let f5 = a.f5 in + let f6 = a.f6 in + let f7 = a.f7 in + let f8 = a.f8 in + let f9 = a.f9 in + let f10 = a.f10 in + let f11 = a.f11 in + let f12 = a.f12 in + let f13 = a.f13 in + let f14 = a.f14 in + let f15 = a.f15 in + let f16 = a.f16 in + let f17 = a.f17 in + let f18 = a.f18 in + let f19 = a.f19 in + let f20 = a.f20 in + let f21 = a.f21 in + let f22 = a.f22 in + let f23 = a.f23 in + let f24 = a.f24 in + let f25 = a.f25 in + let f26 = a.f26 in + let f27 = a.f27 in + let f28 = a.f28 in + let f29 = a.f29 in + let f30 = a.f30 in + let f31 = a.f31 in + let f32 = a.f32 in + let f33 = a.f33 in + let f34 = a.f34 in + let f35 = a.f35 in + let d0 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d1 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; 
+ f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d2 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d3 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d4 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d5 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d6 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d7 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + let d8 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + 
f35 + } + in + let d9 = + { f0; + f1; + f2; + f3; + f4; + f5; + f6; + f7; + f8; + f9; + f10; + f11; + f12; + f13; + f14; + f15; + f16; + f17; + f18; + f19; + f20; + f21; + f22; + f23; + f24; + f25; + f26; + f27; + f28; + f29; + f30; + f31; + f32; + f33; + f34; + f35 + } + in + d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 + +let () = + let a = + { f0 = 0L; + f1 = 1L; + f2 = 2L; + f3 = 3L; + f4 = 4L; + f5 = 5L; + f6 = 6L; + f7 = 7L; + f8 = 8L; + f9 = 9L; + f10 = 10L; + f11 = 11L; + f12 = 12L; + f13 = 13L; + f14 = 14L; + f15 = 15L; + f16 = 16L; + f17 = 17L; + f18 = 18L; + f19 = 19L; + f20 = 20L; + f21 = 21L; + f22 = 22L; + f23 = 23L; + f24 = 24L; + f25 = 25L; + f26 = 26L; + f27 = 27L; + f28 = 28L; + f29 = 29L; + f30 = 30L; + f31 = 31L; + f32 = 32L; + f33 = 0L; + f34 = 0L; + f35 = 0L + } + in + (* Gc.set { (Gc.get()) with Gc.verbose = 0xd }; *) + let rec loop n = + if n = 0 + then () + else + (* try to trigger GC inside foo *) + let d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = foo a in + assert (d0.f0 = d1.f0); + assert (d0.f1 = d1.f1); + assert (d0.f2 = d1.f2); + assert (d0.f3 = d1.f3); + assert (d0.f4 = d1.f4); + assert (d0.f5 = d1.f5); + assert (d0.f6 = d1.f6); + assert (d0.f7 = d1.f7); + assert (d0.f8 = d1.f8); + assert (d0.f9 = d1.f9); + assert (d0.f10 = d1.f10); + assert (d0.f11 = d1.f11); + assert (d0.f12 = d1.f12); + assert (d0.f13 = d1.f13); + assert (d0.f14 = d1.f14); + assert (d0.f15 = d1.f15); + assert (d0.f16 = d1.f16); + assert (d0.f17 = d1.f17); + assert (d0.f18 = d1.f18); + assert (d0.f19 = d1.f19); + assert (d0.f20 = d1.f20); + assert (d0.f21 = d1.f21); + assert (d0.f22 = d1.f22); + assert (d0.f23 = d1.f23); + assert (d0.f24 = d1.f24); + assert (d0.f25 = d1.f25); + assert (d0.f26 = d1.f26); + assert (d0.f27 = d1.f27); + assert (d0.f28 = d1.f28); + assert (d0.f29 = d1.f29); + assert (d0.f30 = d1.f30); + assert (d0.f31 = d1.f31); + assert (d0.f32 = d1.f32); + assert (d0.f33 = d1.f33); + assert (d0.f34 = d1.f34); + assert (d0.f35 = d1.f35); + assert (d0.f0 = 
d2.f0); + assert (d0.f1 = d2.f1); + assert (d0.f2 = d2.f2); + assert (d0.f3 = d2.f3); + assert (d0.f4 = d2.f4); + assert (d0.f5 = d2.f5); + assert (d0.f6 = d2.f6); + assert (d0.f7 = d2.f7); + assert (d0.f8 = d2.f8); + assert (d0.f9 = d2.f9); + assert (d0.f10 = d2.f10); + assert (d0.f11 = d2.f11); + assert (d0.f12 = d2.f12); + assert (d0.f13 = d2.f13); + assert (d0.f14 = d2.f14); + assert (d0.f15 = d2.f15); + assert (d0.f16 = d2.f16); + assert (d0.f17 = d2.f17); + assert (d0.f18 = d2.f18); + assert (d0.f19 = d2.f19); + assert (d0.f20 = d2.f20); + assert (d0.f21 = d2.f21); + assert (d0.f22 = d2.f22); + assert (d0.f23 = d2.f23); + assert (d0.f24 = d2.f24); + assert (d0.f25 = d2.f25); + assert (d0.f26 = d2.f26); + assert (d0.f27 = d2.f27); + assert (d0.f28 = d2.f28); + assert (d0.f29 = d2.f29); + assert (d0.f30 = d2.f30); + assert (d0.f31 = d2.f31); + assert (d0.f32 = d2.f32); + assert (d0.f33 = d2.f33); + assert (d0.f34 = d2.f34); + assert (d0.f35 = d2.f35); + assert (d0.f0 = d3.f0); + assert (d0.f1 = d3.f1); + assert (d0.f2 = d3.f2); + assert (d0.f3 = d3.f3); + assert (d0.f4 = d3.f4); + assert (d0.f5 = d3.f5); + assert (d0.f6 = d3.f6); + assert (d0.f7 = d3.f7); + assert (d0.f8 = d3.f8); + assert (d0.f9 = d3.f9); + assert (d0.f10 = d3.f10); + assert (d0.f11 = d3.f11); + assert (d0.f12 = d3.f12); + assert (d0.f13 = d3.f13); + assert (d0.f14 = d3.f14); + assert (d0.f15 = d3.f15); + assert (d0.f16 = d3.f16); + assert (d0.f17 = d3.f17); + assert (d0.f18 = d3.f18); + assert (d0.f19 = d3.f19); + assert (d0.f20 = d3.f20); + assert (d0.f21 = d3.f21); + assert (d0.f22 = d3.f22); + assert (d0.f23 = d3.f23); + assert (d0.f24 = d3.f24); + assert (d0.f25 = d3.f25); + assert (d0.f26 = d3.f26); + assert (d0.f27 = d3.f27); + assert (d0.f28 = d3.f28); + assert (d0.f29 = d3.f29); + assert (d0.f30 = d3.f30); + assert (d0.f31 = d3.f31); + assert (d0.f32 = d3.f32); + assert (d0.f33 = d3.f33); + assert (d0.f34 = d3.f34); + assert (d0.f35 = d3.f35); + assert (d1.f0 = d8.f0); + assert (d1.f1 
= d8.f1); + assert (d1.f2 = d8.f2); + assert (d1.f3 = d8.f3); + assert (d1.f4 = d8.f4); + assert (d1.f5 = d8.f5); + assert (d1.f6 = d8.f6); + assert (d1.f7 = d8.f7); + assert (d1.f8 = d8.f8); + assert (d0.f9 = d8.f9); + assert (d0.f10 = d8.f10); + assert (d0.f11 = d8.f11); + assert (d0.f12 = d8.f12); + assert (d0.f13 = d8.f13); + assert (d0.f14 = d8.f14); + assert (d0.f15 = d8.f15); + assert (d0.f16 = d8.f16); + assert (d0.f17 = d8.f17); + assert (d0.f18 = d8.f18); + assert (d0.f19 = d8.f19); + assert (d0.f20 = d8.f20); + assert (d0.f21 = d8.f21); + assert (d0.f22 = d8.f22); + assert (d0.f23 = d8.f23); + assert (d0.f24 = d8.f24); + assert (d0.f25 = d8.f25); + assert (d0.f26 = d8.f26); + assert (d0.f27 = d8.f27); + assert (d0.f28 = d8.f28); + assert (d0.f29 = d8.f29); + assert (d0.f30 = d8.f30); + assert (d0.f31 = d8.f31); + assert (d0.f32 = d8.f32); + assert (d0.f33 = d8.f33); + assert (d0.f34 = d8.f34); + assert (d0.f35 = d8.f35); + assert (d0.f0 = d9.f0); + assert (d0.f1 = d9.f1); + assert (d0.f2 = d9.f2); + assert (d0.f3 = d9.f3); + assert (d0.f4 = d9.f4); + assert (d0.f5 = d9.f5); + assert (d0.f6 = d9.f6); + assert (d0.f7 = d9.f7); + assert (d0.f8 = d9.f8); + assert (d0.f9 = d9.f9); + assert (d0.f10 = d9.f10); + assert (d0.f11 = d9.f11); + assert (d0.f12 = d9.f12); + assert (d0.f13 = d9.f13); + assert (d0.f14 = d9.f14); + assert (d0.f15 = d9.f15); + assert (d0.f16 = d9.f16); + assert (d0.f17 = d9.f17); + assert (d0.f18 = d9.f18); + assert (d0.f19 = d9.f19); + assert (d0.f20 = d9.f20); + assert (d0.f21 = d9.f21); + assert (d0.f22 = d9.f22); + assert (d0.f23 = d9.f23); + assert (d0.f24 = d9.f24); + assert (d0.f25 = d9.f25); + assert (d0.f26 = d9.f26); + assert (d0.f27 = d9.f27); + assert (d0.f28 = d9.f28); + assert (d0.f29 = d9.f29); + assert (d0.f30 = d9.f30); + assert (d0.f31 = d9.f31); + assert (d0.f32 = d9.f32); + assert (d0.f33 = d9.f33); + assert (d0.f34 = d9.f34); + assert (d0.f35 = d9.f35); + loop (n - 1) + in + loop 1_000_000 diff --git 
a/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli new file mode 100644 index 0000000000..5b909d90a8 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2.mli @@ -0,0 +1 @@ +(* blank, make sure all the functions are called from top-level *) diff --git a/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected b/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected new file mode 100644 index 0000000000..9f75a15662 --- /dev/null +++ b/flambda-backend/tests/backend/vectorizer/test_spill_valx2_vectorized.cmx.dump.expected @@ -0,0 +1 @@ +**** Vectorize selected computation: 198 groups, 396 scalar instructions, 198 vector instructions, cost = -198 (Test_spill_valx2_vectorized.foo)