8341194: [REDO] Implement C2 VectorizedHashCode on AArch64 #88

Merged
merged 1 commit on Jan 7, 2025
26 changes: 26 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64.ad
@@ -17174,6 +17174,32 @@ instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
ins_pipe(pipe_class_memory);
%}

instruct arrays_hashcode(iRegP_R1 ary, iRegI_R2 cnt, iRegI_R0 result, immI basic_type,
vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3,
vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7,
vRegD_V12 vtmp8, vRegD_V13 vtmp9, rFlagsReg cr)
%{
match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, TEMP vtmp6,
TEMP vtmp7, TEMP vtmp8, TEMP vtmp9, USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);

format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
ins_encode %{
address tpc = __ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
$vtmp3$$FloatRegister, $vtmp2$$FloatRegister,
$vtmp1$$FloatRegister, $vtmp0$$FloatRegister,
$vtmp4$$FloatRegister, $vtmp5$$FloatRegister,
$vtmp6$$FloatRegister, $vtmp7$$FloatRegister,
$vtmp8$$FloatRegister, $vtmp9$$FloatRegister,
(BasicType)$basic_type$$constant);
if (tpc == nullptr) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
%}
ins_pipe(pipe_class_memory);
%}

instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
%{
match(Set result (CountPositives ary1 len));
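For reference, the VectorizedHashCode node matched above intrinsifies jdk.internal.util.ArraysSupport.vectorizedHashCode, i.e. the familiar 31-based polynomial hash seeded with the incoming result. The seemingly reversed vtmp3..vtmp0 ordering in ins_encode lines up with the register assignment asserted by ARRAYS_HASHCODE_REGISTERS in macroAssembler_aarch64.hpp (vdata0 = v3, ..., vdata3 = v0). A minimal scalar sketch of the value the node computes (T_INT case; reference_hashcode is a hypothetical name, not part of the change):

    // Scalar reference for what the node computes: a 31-based polynomial
    // hash over the array elements, seeded with the incoming `result`.
    static int reference_hashcode(const int* elems, int cnt, int result) {
      for (int i = 0; i < cnt; i++) {
        result = 31 * result + elems[i];
      }
      return result;
    }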
70 changes: 68 additions & 2 deletions src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -286,6 +286,11 @@ class Instruction_aarch64 {
f(r->raw_encoding(), lsb + 4, lsb);
}

//<0-15>reg: As `rf(FloatRegister)`, but only the lower 16 FloatRegisters are allowed.
void lrf(FloatRegister r, int lsb) {
f(r->raw_encoding(), lsb + 3, lsb);
}

void prf(PRegister r, int lsb) {
f(r->raw_encoding(), lsb + 3, lsb);
}
@@ -763,6 +768,7 @@ class Assembler : public AbstractAssembler {
#define f current_insn.f
#define sf current_insn.sf
#define rf current_insn.rf
#define lrf current_insn.lrf
#define srf current_insn.srf
#define zrf current_insn.zrf
#define prf current_insn.prf
@@ -1588,6 +1594,16 @@

#undef INSN

// Load/store a register, but with a BasicType parameter. Loaded signed integer values are
// extended to 64 bits.
void load(Register Rt, const Address &adr, BasicType bt) {
int op = (is_signed_subword_type(bt) || bt == T_INT) ? 0b10 : 0b01;
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), op);
}
void store(Register Rt, const Address &adr, BasicType bt) {
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), 0b00);
}
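  // Example use (sketch): the arrays_hashcode scalar loop added in
  // c2_MacroAssembler_aarch64.cpp below issues
  //   load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
  // to fetch one element with post-increment, sign-extended for signed
  // types (including T_INT, op 0b10) and zero-extended otherwise (op 0b01).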

/* SIMD extensions
*
* We just use FloatRegister in the following. They are exactly the same
@@ -2606,6 +2622,7 @@ template<typename R, typename... Rx>
INSN(addpv, 0, 0b101111, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(smlalv, 0, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
@@ -2878,6 +2895,28 @@ template<typename R, typename... Rx>
// FMULX - Vector - Scalar
INSN(fmulxvs, 1, 0b1001);

#undef INSN

#define INSN(NAME, op1, op2) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) { \
starti; \
assert(T == T4H || T == T8H || T == T2S || T == T4S, "invalid arrangement"); \
assert(index >= 0 && \
((T == T2S && index <= 1) || (T != T2S && index <= 3) || (T == T8H && index <= 7)), \
"invalid index"); \
assert((T != T4H && T != T8H) || Vm->encoding() < 16, "invalid source SIMD&FP register"); \
f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01111, 28, 24); \
if (T == T4H || T == T8H) { \
f(0b01, 23, 22), f(index & 0b11, 21, 20), lrf(Vm, 16), f(index >> 2 & 1, 11); \
} else { \
f(0b10, 23, 22), f(index & 1, 21), rf(Vm, 16), f(index >> 1, 11); \
} \
f(op2, 15, 12), f(0, 10), rf(Vn, 5), rf(Vd, 0); \
}

// MUL - Vector - Scalar
INSN(mulvs, 0, 0b1000);

#undef INSN

// Floating-point Reciprocal Estimate
@@ -3041,6 +3080,33 @@ template<typename R, typename... Rx>
umov(Xd, Vn, T, index);
}

protected:
void _xaddwv(bool is_unsigned, FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta,
FloatRegister Vm, SIMD_Arrangement Tb) {
starti;
assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
f(0, 31), f((int)Tb & 1, 30), f(is_unsigned ? 1 : 0, 29), f(0b01110, 28, 24);
f((int)(Ta >> 1) - 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b000100, 15, 10), rf(Vn, 5), rf(Vd, 0);
}

public:
#define INSN(NAME, assertion, is_unsigned) \
void NAME(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta, FloatRegister Vm, \
SIMD_Arrangement Tb) { \
assert((assertion), "invalid arrangement"); \
_xaddwv(is_unsigned, Vd, Vn, Ta, Vm, Tb); \
}

public:

INSN(uaddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/true)
INSN(uaddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/true)
INSN(saddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/false)
INSN(saddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/false)

#undef INSN
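  // Usage sketch: uaddwv(v4, v4, T4S, v0, T4H) zero-extends the low four
  // halfword lanes of v0 to 32 bits and adds them to the word lanes of v4;
  // the *2 variants (uaddwv2/saddwv2) consume the high half of Vm instead.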


private:
void _pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
starti;
96 changes: 96 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -32,6 +32,7 @@
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
@@ -45,6 +46,101 @@

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
FloatRegister vdata0, FloatRegister vdata1,
FloatRegister vdata2, FloatRegister vdata3,
FloatRegister vmul0, FloatRegister vmul1,
FloatRegister vmul2, FloatRegister vmul3,
FloatRegister vpow, FloatRegister vpowm,
BasicType eltype) {
ARRAYS_HASHCODE_REGISTERS;

Register tmp1 = rscratch1, tmp2 = rscratch2;

Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

// Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
// use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
// use 4H for chars and shorts instead, but using 8H gives better performance.
const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
: eltype == T_CHAR || eltype == T_SHORT ? 8
: eltype == T_INT ? 4
: 0;
guarantee(vf, "unsupported eltype");

// Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
const size_t unroll_factor = 4;

switch (eltype) {
case T_BOOLEAN:
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
break;
case T_CHAR:
BLOCK_COMMENT("arrays_hashcode(char) {");
break;
case T_BYTE:
BLOCK_COMMENT("arrays_hashcode(byte) {");
break;
case T_SHORT:
BLOCK_COMMENT("arrays_hashcode(short) {");
break;
case T_INT:
BLOCK_COMMENT("arrays_hashcode(int) {");
break;
default:
ShouldNotReachHere();
}

// large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
// implemented by the stub executes just once. Call the stub only if at least two iterations will
// be executed.
const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
cmpw(cnt, large_threshold);
br(Assembler::HS, LARGE);

bind(TAIL);

// The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
// uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
// Iteration eats up the remainder, uf elements at a time.
assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
andr(tmp2, cnt, unroll_factor - 1);
adr(tmp1, BR_BASE);
sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
movw(tmp2, 0x1f);
br(tmp1);

bind(LOOP);
for (size_t i = 0; i < unroll_factor; ++i) {
load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
maddw(result, result, tmp2, tmp1);
}
bind(BR_BASE);
subsw(cnt, cnt, unroll_factor);
br(Assembler::HS, LOOP);

b(DONE);

bind(LARGE);

RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
address tpc = trampoline_call(stub);
if (tpc == nullptr) {
DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
postcond(pc() == badAddress);
return nullptr;
}

bind(DONE);

BLOCK_COMMENT("} // arrays_hashcode");

postcond(pc() != badAddress);
return pc();
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg, Register tmp3Reg) {
Register oop = objectReg;
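The TAIL path in arrays_hashcode above is a computed jump into a 4-way unrolled scalar loop: adr/sub move the branch target back by 8 bytes (one load + madd pair) per remaining element, so the first pass runs cnt % 4 pairs and every later pass runs 4. A rough, self-contained C++ analogue of that control flow (illustration only, hash_tail is a hypothetical name; the real code branches into the unrolled body rather than peeling a pre-loop):

    // Illustration of the TAIL/LOOP/BR_BASE structure: first a partial pass of
    // cnt % 4 steps, then full passes of 4 steps, matching unroll_factor and
    // the `sub(tmp1, tmp1, tmp2, ext::sxtw, 3)` target computation.
    static int hash_tail(const int* a, int cnt, int result) {
      int i = 0;
      for (int k = 0; k < (cnt & 3); k++) {    // cnt % unroll_factor leftover steps
        result = 31 * result + a[i++];
      }
      while (i < cnt) {                        // 4 elements per iteration
        result = 31 * result + a[i++];
        result = 31 * result + a[i++];
        result = 31 * result + a[i++];
        result = 31 * result + a[i++];
      }
      return result;
    }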
7 changes: 7 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -35,6 +35,13 @@
enum shift_kind kind = Assembler::LSL, unsigned shift = 0);

public:
// jdk.internal.util.ArraysSupport.vectorizedHashCode
address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0,
FloatRegister vdata1, FloatRegister vdata2, FloatRegister vdata3,
FloatRegister vmul0, FloatRegister vmul1, FloatRegister vmul2,
FloatRegister vmul3, FloatRegister vpow, FloatRegister vpowm,
BasicType eltype);

// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
// See full description in macroAssembler_aarch64.cpp.
void fast_lock(Register object, Register box, Register tmp, Register tmp2, Register tmp3);
18 changes: 18 additions & 0 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1398,6 +1398,24 @@ class MacroAssembler: public Assembler {
address arrays_equals(Register a1, Register a2, Register result, Register cnt1,
Register tmp1, Register tmp2, Register tmp3, int elem_size);

// Ensure that the inline code and the stub use the same registers.
#define ARRAYS_HASHCODE_REGISTERS \
do { \
assert(result == r0 && \
ary == r1 && \
cnt == r2 && \
vdata0 == v3 && \
vdata1 == v2 && \
vdata2 == v1 && \
vdata3 == v0 && \
vmul0 == v4 && \
vmul1 == v5 && \
vmul2 == v6 && \
vmul3 == v7 && \
vpow == v12 && \
vpowm == v13, "registers must match aarch64.ad"); \
} while (0)
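// The fixed assignment mirrors the arrays_hashcode instruct in aarch64.ad:
// ary/cnt/result live in r1/r2/r0, and the vdata/vmul/vpow temporaries map onto
// the vtmp operands (v0..v7, v12, v13) passed by its ins_encode block.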

void string_equals(Register a1, Register a2, Register result, Register cnt1,
int elem_size);
