diff --git a/src/hotspot/share/opto/addnode.cpp b/src/hotspot/share/opto/addnode.cpp
index bdde4fe8dfe..1450b4d15ea 100644
--- a/src/hotspot/share/opto/addnode.cpp
+++ b/src/hotspot/share/opto/addnode.cpp
@@ -704,9 +704,9 @@ Node* AddPNode::Ideal_base_and_offset(Node* ptr, PhaseTransform* phase,
 //------------------------------unpack_offsets----------------------------------
 // Collect the AddP offset values into the elements array, giving up
 // if there are more than length.
-int AddPNode::unpack_offsets(Node* elements[], int length) {
+int AddPNode::unpack_offsets(Node* elements[], int length) const {
   int count = 0;
-  Node* addr = this;
+  Node const* addr = this;
   Node* base = addr->in(AddPNode::Base);
   while (addr->is_AddP()) {
     if (addr->in(AddPNode::Base) != base) {
diff --git a/src/hotspot/share/opto/addnode.hpp b/src/hotspot/share/opto/addnode.hpp
index 30319a7150c..1897d013a7a 100644
--- a/src/hotspot/share/opto/addnode.hpp
+++ b/src/hotspot/share/opto/addnode.hpp
@@ -154,7 +154,7 @@ class AddPNode : public Node {
 
   // Collect the AddP offset values into the elements array, giving up
   // if there are more than length.
-  int unpack_offsets(Node* elements[], int length);
+  int unpack_offsets(Node* elements[], int length) const;
 
   // Do not match base-ptr edge
   virtual uint match_edge(uint idx) const;
diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index ea5cd8299cd..363783e72d2 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -354,6 +354,12 @@
   notproduct(bool, TraceNewVectors, false,                                  \
           "Trace creation of Vector nodes")                                 \
                                                                             \
+  diagnostic(bool, MergeStores, true,                                       \
+          "Optimize stores by combining values into larger store")          \
+                                                                            \
+  develop(bool, TraceMergeStores, false,                                    \
+          "Trace creation of merged stores")                                \
+                                                                            \
   product_pd(bool, OptoBundling,                                            \
           "Generate nops to fill i-cache lines")                            \
                                                                             \
diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp
index c33c488588a..6705729efbe 100644
--- a/src/hotspot/share/opto/memnode.cpp
+++ b/src/hotspot/share/opto/memnode.cpp
@@ -2561,6 +2561,687 @@ uint StoreNode::hash() const {
   return NO_HASH;
 }
 
+// Class to parse array pointers, and determine if they are adjacent. We parse the form:
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+//           + sum(other_offsets)
+//
+//
+// Note: we accumulate all constant offsets into constant_offset, even the int constant behind
+//       the "LShiftL(ConvI2L(...))" pattern. We convert "ConvI2L(int_offset + int_con)" to
+//       "ConvI2L(int_offset) + int_con", which is only safe if we can assume that either all
+//       compared addresses have an overflow for "int_offset + int_con" or none.
+//       For loads and stores on arrays, we know that if one overflows and the other does not,
+//       then the two addresses lie almost max_int indices apart; but the maximum array size is
+//       only about half of that. Therefore, the RangeCheck on at least one of them must have
+//       failed.
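+//
+//       As a worked example (illustrative values; the exact array header offset is
+//       platform dependent), a byte-array access a[i + 2] is parsed as:
+//
+//         pointer = a (base)
+//                 + array_base_offset + 2    (constant_offset)
+//                 + LShiftL( ConvI2L(i), 0)
+//
+//       where the int constant from "ConvI2L(i + 2)" has been folded into
+//       constant_offset, following the rewrite below: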
+//
+//   constant_offset += LShiftL( ConvI2L(int_con), int_offset_shift)
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset), int_offset_shift)
+//           + sum(other_offsets)
+//
+class ArrayPointer {
+private:
+  const bool  _is_valid;                // The parsing succeeded
+  const Node* _pointer;                 // The final pointer to the position in the array
+  const Node* _base;                    // Base address of the array
+  const jlong _constant_offset;         // Sum of collected constant offsets
+  const Node* _int_offset;              // (optional) Offset behind LShiftL and ConvI2L
+  const jint  _int_offset_shift;        // (optional) Shift value for int_offset
+  const GrowableArray<Node*>* _other_offsets; // List of other AddP offsets
+
+  ArrayPointer(const bool is_valid,
+               const Node* pointer,
+               const Node* base,
+               const jlong constant_offset,
+               const Node* int_offset,
+               const jint int_offset_shift,
+               const GrowableArray<Node*>* other_offsets) :
+    _is_valid(is_valid),
+    _pointer(pointer),
+    _base(base),
+    _constant_offset(constant_offset),
+    _int_offset(int_offset),
+    _int_offset_shift(int_offset_shift),
+    _other_offsets(other_offsets)
+  {
+    assert(_pointer != NULL, "must always have pointer");
+    assert(is_valid == (_base != NULL), "have base exactly if valid");
+    assert(is_valid == (_other_offsets != NULL), "have other_offsets exactly if valid");
+  }
+
+  static ArrayPointer make_invalid(const Node* pointer) {
+    return ArrayPointer(false, pointer, NULL, 0, NULL, 0, NULL);
+  }
+
+  static bool parse_int_offset(Node* offset, Node*& int_offset, jint& int_offset_shift) {
+    // offset = LShiftL( ConvI2L(int_offset), int_offset_shift)
+    if (offset->Opcode() == Op_LShiftL &&
+        offset->in(1)->Opcode() == Op_ConvI2L &&
+        offset->in(2)->Opcode() == Op_ConI) {
+      int_offset = offset->in(1)->in(1);           // LShiftL -> ConvI2L -> int_offset
+      int_offset_shift = offset->in(2)->get_int(); // LShiftL -> int_offset_shift
+      return true;
+    }
+
+    // offset = ConvI2L(int_offset) = LShiftL( ConvI2L(int_offset), 0)
+    if (offset->Opcode() == Op_ConvI2L) {
+      int_offset = offset->in(1);
+      int_offset_shift = 0;
+      return true;
+    }
+
+    // parse failed
+    return false;
+  }
+
+public:
+  // Parse the structure above the pointer
+  static ArrayPointer make(PhaseGVN* phase, const Node* pointer) {
+    assert(phase->type(pointer)->isa_aryptr() != NULL, "must be array pointer");
+    if (!pointer->is_AddP()) { return ArrayPointer::make_invalid(pointer); }
+
+    const Node* base = pointer->in(AddPNode::Base);
+    if (base == NULL) { return ArrayPointer::make_invalid(pointer); }
+
+    const int search_depth = 5;
+    Node* offsets[search_depth];
+    int count = pointer->as_AddP()->unpack_offsets(offsets, search_depth);
+
+    // We expect at least one (constant) offset
+    if (count <= 0) { return ArrayPointer::make_invalid(pointer); }
+
+    // We extract the form:
+    //
+    //   pointer = base
+    //           + constant_offset
+    //           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+    //           + sum(other_offsets)
+    //
+    jlong constant_offset = 0;
+    Node* int_offset = NULL;
+    jint int_offset_shift = 0;
+    GrowableArray<Node*>* other_offsets = new GrowableArray<Node*>(count);
+
+    for (int i = 0; i < count; i++) {
+      Node* offset = offsets[i];
+      if (offset->Opcode() == Op_ConI) {
+        // Constant int offset
+        constant_offset += offset->get_int();
+      } else if (offset->Opcode() == Op_ConL) {
+        // Constant long offset
+        constant_offset += offset->get_long();
+      } else if (int_offset == NULL && parse_int_offset(offset, int_offset, int_offset_shift)) {
+        // LShiftL( ConvI2L(int_offset), int_offset_shift)
+        int_offset = int_offset->uncast();
+        if (int_offset->Opcode() == Op_AddI && int_offset->in(2)->Opcode() == Op_ConI) {
+          // LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+          constant_offset += ((jlong)int_offset->in(2)->get_int()) << int_offset_shift;
+          int_offset = int_offset->in(1);
+        }
+      } else {
+        // All others
+        other_offsets->append(offset);
+      }
+    }
+
+    return ArrayPointer(true, pointer, base, constant_offset, int_offset, int_offset_shift, other_offsets);
+  }
+
+  bool is_adjacent_to_and_before(const ArrayPointer& other, const jlong data_size) const {
+    if (!_is_valid || !other._is_valid) { return false; }
+
+    // Offset adjacent?
+    if (this->_constant_offset + data_size != other._constant_offset) { return false; }
+
+    // All other components identical?
+    if (this->_base != other._base ||
+        this->_int_offset != other._int_offset ||
+        this->_int_offset_shift != other._int_offset_shift ||
+        this->_other_offsets->length() != other._other_offsets->length()) {
+      return false;
+    }
+
+    for (int i = 0; i < this->_other_offsets->length(); i++) {
+      Node* o1 = this->_other_offsets->at(i);
+      Node* o2 = other._other_offsets->at(i);
+      if (o1 != o2) { return false; }
+    }
+
+    return true;
+  }
+
+#ifndef PRODUCT
+  void dump() {
+    if (!_is_valid) {
+      tty->print("ArrayPointer[%d %s, invalid]", _pointer->_idx, _pointer->Name());
+      return;
+    }
+    tty->print("ArrayPointer[%d %s, base[%d %s] + %lld",
+               _pointer->_idx, _pointer->Name(),
+               _base->_idx, _base->Name(),
+               (long long)_constant_offset);
+    if (_int_offset != NULL) {
+      tty->print(" + I2L[%d %s] << %d",
+                 _int_offset->_idx, _int_offset->Name(), _int_offset_shift);
+    }
+    for (int i = 0; i < _other_offsets->length(); i++) {
+      Node* n = _other_offsets->at(i);
+      tty->print(" + [%d %s]", n->_idx, n->Name());
+    }
+    tty->print_cr("]");
+  }
+#endif
+};
+
+// Link together multiple stores (B/S/C/I) into a longer one.
+//
+// Example: _store = StoreB[i+3]
+//
+//   RangeCheck[i+0]           RangeCheck[i+0]
+//   StoreB[i+0]
+//   RangeCheck[i+1]           RangeCheck[i+1]
+//   StoreB[i+1]        -->    pass:             fail:
+//   StoreB[i+2]               StoreI[i+0]       StoreB[i+0]
+//   StoreB[i+3]
+//
+// The 4 StoreB are merged into a single StoreI node. We have to be careful with RangeCheck[i+1]: before
+// the optimization, if this RangeCheck[i+1] fails, then we execute only StoreB[i+0], and then trap. After
+// the optimization, the new StoreI[i+0] is on the passing path of RangeCheck[i+1], and StoreB[i+0] on the
+// failing path.
+//
+// Note: For normal array stores, every store at first has a RangeCheck. But they can be removed with:
+//       - RCE (RangeCheck Elimination): the RangeChecks in the loop are hoisted out before the loop,
+//         and possibly no RangeChecks remain between the stores.
+//       - RangeCheck smearing: the earlier RangeChecks are adjusted such that they cover later RangeChecks,
+//         and those later RangeChecks can be removed. Example:
+//
+//         RangeCheck[i+0]                         RangeCheck[i+0] <- before first store
+//         StoreB[i+0]                             StoreB[i+0]     <- first store
+//         RangeCheck[i+1]     --> smeared -->     RangeCheck[i+3] <- only RC between first and last store
+//         StoreB[i+1]                             StoreB[i+1]     <- second store
+//         RangeCheck[i+2]     --> removed
+//         StoreB[i+2]                             StoreB[i+2]
+//         RangeCheck[i+3]     --> removed
+//         StoreB[i+3]                             StoreB[i+3]     <- last store
+//
+// Thus, it is a common pattern that between the first and last store in a chain
+// of adjacent stores there remains exactly one RangeCheck, located between the
+// first and the second store (e.g. RangeCheck[i+3]).
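+//
+// For example (compare the micro benchmarks added by this patch; names are
+// illustrative), the little-endian decomposition of an int store into four
+// byte stores
+//
+//   a[offset + 0] = (byte)(v >>  0);
+//   a[offset + 1] = (byte)(v >>  8);
+//   a[offset + 2] = (byte)(v >> 16);
+//   a[offset + 3] = (byte)(v >> 24);
+//
+// is a candidate for this transform: once at most one RangeCheck remains between
+// the first and the last store, the four StoreB can be merged into one StoreI.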
+//
+class MergePrimitiveArrayStores : public StackObj {
+private:
+  PhaseGVN* _phase;
+  StoreNode* _store;
+
+public:
+  MergePrimitiveArrayStores(PhaseGVN* phase, StoreNode* store) : _phase(phase), _store(store) {}
+
+  StoreNode* run();
+
+private:
+  bool is_compatible_store(const StoreNode* other_store) const;
+  bool is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const;
+  bool is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const;
+  static bool is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out);
+  enum CFGStatus { CFG_SuccessNoRangeCheck, CFG_SuccessWithRangeCheck, CFG_Failure };
+  static CFGStatus cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store);
+
+  class Status {
+  private:
+    StoreNode* _found_store;
+    bool _found_range_check;
+
+    Status(StoreNode* found_store, bool found_range_check)
+      : _found_store(found_store), _found_range_check(found_range_check) {}
+
+  public:
+    StoreNode* found_store() const { return _found_store; }
+    bool found_range_check() const { return _found_range_check; }
+    static Status make_failure() { return Status(NULL, false); }
+
+    static Status make(StoreNode* found_store, const CFGStatus cfg_status) {
+      if (cfg_status == CFG_Failure) {
+        return Status::make_failure();
+      }
+      return Status(found_store, cfg_status == CFG_SuccessWithRangeCheck);
+    }
+  };
+
+  Status find_adjacent_use_store(const StoreNode* def_store) const;
+  Status find_adjacent_def_store(const StoreNode* use_store) const;
+  Status find_use_store(const StoreNode* def_store) const;
+  Status find_def_store(const StoreNode* use_store) const;
+  Status find_use_store_unidirectional(const StoreNode* def_store) const;
+  Status find_def_store_unidirectional(const StoreNode* use_store) const;
+
+  void collect_merge_list(Node_List& merge_list) const;
+  Node* make_merged_input_value(const Node_List& merge_list);
+  StoreNode* make_merged_store(const Node_List& merge_list, Node* merged_input_value);
+
+  DEBUG_ONLY( void trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const; )
+};
+
+StoreNode* MergePrimitiveArrayStores::run() {
+  // Check for B/S/C/I
+  int opc = _store->Opcode();
+  if (opc != Op_StoreB && opc != Op_StoreC && opc != Op_StoreI) {
+    return NULL;
+  }
+
+  // Only merge stores on arrays, and the stores must have the same size as the elements.
+  const TypeAryPtr* aryptr_t = _store->adr_type()->isa_aryptr();
+  if (aryptr_t == NULL ||
+      type2aelembytes(aryptr_t->elem()->array_element_basic_type()) != _store->memory_size()) {
+    return NULL;
+  }
+
+  // The _store must be the "last" store in a chain. If we find a use we could merge with,
+  // then that use or a store further down is the "last" store.
+  Status status_use = find_adjacent_use_store(_store);
+  if (status_use.found_store() != NULL) {
+    return NULL;
+  }
+
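+  // Terminology: a "use" store is a store that consumes our memory output, i.e. it
+  // lies further down the memory chain (at the next higher adjacent address); a
+  // "def" store produces the memory state that we consume. Merging therefore always
+  // starts at the last store and walks up the def chain.
+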
+  // Check if we can merge with at least one def, so that we have at least 2 stores to merge.
+  Status status_def = find_adjacent_def_store(_store);
+  if (status_def.found_store() == NULL) {
+    return NULL;
+  }
+
+  ResourceMark rm;
+  Node_List merge_list;
+  collect_merge_list(merge_list);
+
+  Node* merged_input_value = make_merged_input_value(merge_list);
+  if (merged_input_value == NULL) { return NULL; }
+
+  StoreNode* merged_store = make_merged_store(merge_list, merged_input_value);
+
+  DEBUG_ONLY( if (TraceMergeStores) { trace(merge_list, merged_input_value, merged_store); } )
+
+  return merged_store;
+}
+
+// Check compatibility between _store and other_store.
+bool MergePrimitiveArrayStores::is_compatible_store(const StoreNode* other_store) const {
+  int opc = _store->Opcode();
+  assert(opc == Op_StoreB || opc == Op_StoreC || opc == Op_StoreI, "precondition");
+  assert(_store->adr_type()->isa_aryptr() != NULL, "must be array store");
+
+  if (other_store == NULL ||
+      _store->Opcode() != other_store->Opcode() ||
+      other_store->adr_type()->isa_aryptr() == NULL) {
+    return false;
+  }
+
+  // Check that the size of the stores, and the array elements are all the same.
+  const TypeAryPtr* aryptr_t1 = _store->adr_type()->is_aryptr();
+  const TypeAryPtr* aryptr_t2 = other_store->adr_type()->is_aryptr();
+  int size1 = type2aelembytes(aryptr_t1->elem()->array_element_basic_type());
+  int size2 = type2aelembytes(aryptr_t2->elem()->array_element_basic_type());
+  if (size1 != size2 ||
+      size1 != _store->memory_size() ||
+      _store->memory_size() != other_store->memory_size()) {
+    return false;
+  }
+  return true;
+}
+
+bool MergePrimitiveArrayStores::is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const {
+  if (!is_adjacent_input_pair(def_store->in(MemNode::ValueIn),
+                              use_store->in(MemNode::ValueIn),
+                              def_store->memory_size())) {
+    return false;
+  }
+
+  ResourceMark rm;
+  ArrayPointer array_pointer_use = ArrayPointer::make(_phase, use_store->in(MemNode::Address));
+  ArrayPointer array_pointer_def = ArrayPointer::make(_phase, def_store->in(MemNode::Address));
+  if (!array_pointer_def.is_adjacent_to_and_before(array_pointer_use, use_store->memory_size())) {
+    return false;
+  }
+
+  return true;
+}
+
+bool MergePrimitiveArrayStores::is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const {
+  // Pattern: [n1 = ConI, n2 = ConI]
+  if (n1->Opcode() == Op_ConI) {
+    return n2->Opcode() == Op_ConI;
+  }
+
+  // Pattern: [n1 = base >> shift, n2 = base >> (shift + 8 * memory_size)]
+  Node const* base_n2;
+  jint shift_n2;
+  if (!is_con_RShift(n2, base_n2, shift_n2)) {
+    return false;
+  }
+  if (n1->Opcode() == Op_ConvL2I) {
+    // look through
+    n1 = n1->in(1);
+  }
+  Node const* base_n1;
+  jint shift_n1;
+  if (n1 == base_n2) {
+    // n1 = base = base >> 0
+    base_n1 = n1;
+    shift_n1 = 0;
+  } else if (!is_con_RShift(n1, base_n1, shift_n1)) {
+    return false;
+  }
+  int bits_per_store = memory_size * 8;
+  if (base_n1 != base_n2 ||
+      shift_n1 + bits_per_store != shift_n2 ||
+      shift_n1 % bits_per_store != 0) {
+    return false;
+  }
+
+  // Both values are shifted out of the same base, with adjacent shift amounts.
+  return true;
+}
+
+// Detect pattern: n = base_out >> shift_out
+bool MergePrimitiveArrayStores::is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out) {
+  assert(n != NULL, "precondition");
+
+  int opc = n->Opcode();
+  if (opc == Op_ConvL2I) {
+    n = n->in(1);
+    opc = n->Opcode();
+  }
+
+  if ((opc == Op_RShiftI ||
+       opc == Op_RShiftL ||
+       opc == Op_URShiftI ||
+       opc == Op_URShiftL) &&
+      n->in(2)->is_ConI()) {
+    base_out = n->in(1);
+    shift_out = n->in(2)->get_int();
+    assert(shift_out >= 0, "must be non-negative");
+    return true;
+  }
+  return false;
+}
+
+// Check if there is nothing between the two stores, except optionally a RangeCheck leading to an uncommon trap.
+MergePrimitiveArrayStores::CFGStatus MergePrimitiveArrayStores::cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store) {
+  assert(use_store->in(MemNode::Memory) == def_store, "use-def relationship");
+
+  Node* ctrl_use = use_store->in(MemNode::Control);
+  Node* ctrl_def = def_store->in(MemNode::Control);
+  if (ctrl_use == NULL || ctrl_def == NULL) {
+    return CFG_Failure;
+  }
+
+  if (ctrl_use == ctrl_def) {
+    // Same ctrl -> no RangeCheck in between.
+    // Check: use_store must be the only use of def_store.
+    if (def_store->outcnt() > 1) {
+      return CFG_Failure;
+    }
+    return CFG_SuccessNoRangeCheck;
+  }
+
+  // Different ctrl -> could have RangeCheck in between.
+  // Check: 1. def_store only has these uses: use_store and MergeMem for uncommon trap, and
+  //        2. ctrl separated by RangeCheck.
+  if (def_store->outcnt() != 2) {
+    return CFG_Failure; // Does not have exactly the two uses: use_store and MergeMem for uncommon trap.
+  }
+  int use_store_out_idx = def_store->raw_out(0) == use_store ? 0 : 1;
+  Node* merge_mem = def_store->raw_out(1 - use_store_out_idx)->isa_MergeMem();
+  if (merge_mem == NULL ||
+      merge_mem->outcnt() != 1) {
+    return CFG_Failure; // Does not have MergeMem for uncommon trap.
+  }
+  if (!ctrl_use->is_IfProj() ||
+      !ctrl_use->in(0)->is_RangeCheck() ||
+      ctrl_use->in(0)->outcnt() != 2) {
+    return CFG_Failure; // Not RangeCheck.
+  }
+  ProjNode* other_proj = ctrl_use->as_IfProj()->other_if_proj();
+  Node* trap = other_proj->is_uncommon_trap_proj(Deoptimization::Reason_range_check);
+  if (trap != merge_mem->unique_out() ||
+      ctrl_use->in(0)->in(0) != ctrl_def) {
+    return CFG_Failure; // Not RangeCheck with merge_mem leading to uncommon trap.
+  }
+
+  return CFG_SuccessWithRangeCheck;
+}
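+
+// In the CFG_SuccessWithRangeCheck case the pair thus has this shape: def_store's
+// control feeds a RangeCheck, use_store sits on its passing projection, and the
+// failing projection leads to a range_check uncommon trap whose memory state comes
+// from the MergeMem that is def_store's only other use.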
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_use_store(const StoreNode* def_store) const {
+  Status status_use = find_use_store(def_store);
+  StoreNode* use_store = status_use.found_store();
+  if (use_store != NULL && !is_adjacent_pair(use_store, def_store)) {
+    return Status::make_failure();
+  }
+  return status_use;
+}
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_def_store(const StoreNode* use_store) const {
+  Status status_def = find_def_store(use_store);
+  StoreNode* def_store = status_def.found_store();
+  if (def_store != NULL && !is_adjacent_pair(use_store, def_store)) {
+    return Status::make_failure();
+  }
+  return status_def;
+}
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store(const StoreNode* def_store) const {
+  Status status_use = find_use_store_unidirectional(def_store);
+
+#ifdef ASSERT
+  StoreNode* use_store = status_use.found_store();
+  if (use_store != NULL) {
+    Status status_def = find_def_store_unidirectional(use_store);
+    assert(status_def.found_store() == def_store &&
+           status_def.found_range_check() == status_use.found_range_check(),
+           "find_use_store and find_def_store must be symmetric");
+  }
+#endif
+
+  return status_use;
+}
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store(const StoreNode* use_store) const {
+  Status status_def = find_def_store_unidirectional(use_store);
+
+#ifdef ASSERT
+  StoreNode* def_store = status_def.found_store();
+  if (def_store != NULL) {
+    Status status_use = find_use_store_unidirectional(def_store);
+    assert(status_use.found_store() == use_store &&
+           status_use.found_range_check() == status_def.found_range_check(),
+           "find_use_store and find_def_store must be symmetric");
+  }
+#endif
+
+  return status_def;
+}
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store_unidirectional(const StoreNode* def_store) const {
+  assert(is_compatible_store(def_store), "precondition: must be compatible with _store");
+
+  for (DUIterator_Fast imax, i = def_store->fast_outs(imax); i < imax; i++) {
+    StoreNode* use_store = def_store->fast_out(i)->isa_Store();
+    if (is_compatible_store(use_store)) {
+      return Status::make(use_store, cfg_status_for_pair(use_store, def_store));
+    }
+  }
+
+  return Status::make_failure();
+}
+
+MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store_unidirectional(const StoreNode* use_store) const {
+  assert(is_compatible_store(use_store), "precondition: must be compatible with _store");
+
+  StoreNode* def_store = use_store->in(MemNode::Memory)->isa_Store();
+  if (!is_compatible_store(def_store)) {
+    return Status::make_failure();
+  }
+
+  return Status::make(def_store, cfg_status_for_pair(use_store, def_store));
+}
+
+static int round_down_power_of_2(uint value) {
+  return 1 << log2_uint(value);
+}
+
+void MergePrimitiveArrayStores::collect_merge_list(Node_List& merge_list) const {
+  // The merged store can be at most 8 bytes.
+  const uint merge_list_max_size = 8 / _store->memory_size();
+  assert(merge_list_max_size >= 2 &&
+         merge_list_max_size <= 8 &&
+         is_power_of_2(merge_list_max_size),
+         "must be 2, 4 or 8");
+
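+  // The limit depends on the element size: for example, up to 8 StoreB (8 / 1 byte),
+  // 4 StoreC (8 / 2) or 2 StoreI (8 / 4) can be merged into one 8-byte store.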
+  // Traverse up the chain of adjacent def stores.
+  StoreNode* current = _store;
+  merge_list.push(current);
+  while (current != NULL && merge_list.size() < merge_list_max_size) {
+    Status status = find_adjacent_def_store(current);
+    current = status.found_store();
+    if (current != NULL) {
+      merge_list.push(current);
+
+      // We can have at most one RangeCheck.
+      if (status.found_range_check()) {
+        break;
+      }
+    }
+  }
+
+  // Truncate the merge_list to a power of 2.
+  const uint pow2size = round_down_power_of_2(merge_list.size());
+  assert(pow2size >= 2, "must be merging at least 2 stores");
+  while (merge_list.size() > pow2size) { merge_list.pop(); }
+}
+
+// Merge the input values of the smaller stores to a single larger input value.
+Node* MergePrimitiveArrayStores::make_merged_input_value(const Node_List& merge_list) {
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  Node* first = merge_list.at(merge_list.size()-1);
+  Node* merged_input_value = NULL;
+  if (_store->in(MemNode::ValueIn)->Opcode() == Op_ConI) {
+    // Pattern: [ConI, ConI, ...] -> new constant
+    jlong con = 0;
+    jlong bits_per_store = _store->memory_size() * 8;
+    jlong mask = (((jlong)1) << bits_per_store) - 1;
+    for (uint i = 0; i < merge_list.size(); i++) {
+      jlong con_i = merge_list.at(i)->in(MemNode::ValueIn)->get_int();
+      con = con << bits_per_store;
+      con = con | (mask & con_i);
+    }
+    merged_input_value = _phase->longcon(con);
+  } else {
+    // Pattern: [base >> 24, base >> 16, base >> 8, base] -> base
+    //           |                                  |
+    //           _store                             first
+    //
+    merged_input_value = first->in(MemNode::ValueIn);
+    Node const* base_last;
+    jint shift_last;
+    bool is_true = is_con_RShift(_store->in(MemNode::ValueIn), base_last, shift_last);
+    assert(is_true, "must detect con RShift");
+    if (merged_input_value != base_last && merged_input_value->Opcode() == Op_ConvL2I) {
+      // look through
+      merged_input_value = merged_input_value->in(1);
+    }
+    if (merged_input_value != base_last) {
+      // merged_input_value is not the base
+      return NULL;
+    }
+  }
+
+  if (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size <= 4) {
+    // Example:
+    //
+    //   long base = ...;
+    //   a[0] = (byte)(base >> 0);
+    //   a[1] = (byte)(base >> 8);
+    //
+    merged_input_value = _phase->transform(new ConvL2INode(merged_input_value));
+  }
+
+  assert((_phase->type(merged_input_value)->isa_int() != NULL && new_memory_size <= 4) ||
+         (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size == 8),
+         "merged_input_value is either int or long, and new_memory_size is small enough");
+
+  return merged_input_value;
+}
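+
+// Note: in the constant branch above, the merge_list is visited from _store (the
+// last store, highest address) down to the first store, so on our little-endian
+// platforms the lowest-address byte ends up in the least significant bits. For
+// example, four constant byte stores of 0x01, 0x02, 0x03, 0x04 (in address order)
+// pack into the int constant 0x04030201.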
+
+//                                                                                                          //
+// first_ctrl    first_mem   first_adr              first_ctrl    first_mem         first_adr              //
+//  |                |           |                   |                |                 |                  //
+//  |                |           |                   |                +---------------+ |                  //
+//  |                |           |                   |                |               | |                  //
+//  |                |     +---------+               |                | +---------------+                  //
+//  |                |     |         |               |                | |             | |                  //
+//  +--------------+ |     |     v1                  +------------------------------+ | |     v1           //
+//  |              | |     |     |                   |                              | | |     |            //
+// RangeCheck      first_store                      RangeCheck                      | | first_store        //
+//  |                |                               |                              | |     |              //
+// last_ctrl         |     +----> unc_trap          last_ctrl                       | |     +----> unc_trap //
+//  |                |                     ===>      |                              | |                    //
+//  +--------------+ |  a2 v2                        |                              | |                    //
+//  |              | |  |  |                         |                              | |                    //
+//  |             second_store                       |                              | |                    //
+//  |                |                               |                              | |  [v1 v2 ... vn]    //
+// ...              ...                              |                              | |      |             //
+//  |                |                               |                              | |      v             //
+//  +--------------+ |  an vn                        +--------------+               | |  merged_input_value //
+//  |              | |  |  |                         |              |               | |                    //
+// last_store (= _store)                            merged_store                                           //
+//                                                                                                          //
+StoreNode* MergePrimitiveArrayStores::make_merged_store(const Node_List& merge_list, Node* merged_input_value) {
+  Node* first_store = merge_list.at(merge_list.size()-1);
+  Node* last_ctrl  = _store->in(MemNode::Control); // after (optional) RangeCheck
+  Node* first_mem  = first_store->in(MemNode::Memory);
+  Node* first_adr  = first_store->in(MemNode::Address);
+
+  const TypePtr* new_adr_type = _store->adr_type();
+
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  BasicType bt = T_ILLEGAL;
+  switch (new_memory_size) {
+    case 2: bt = T_SHORT; break;
+    case 4: bt = T_INT;   break;
+    case 8: bt = T_LONG;  break;
+  }
+
+  StoreNode* merged_store = StoreNode::make(*_phase, last_ctrl, first_mem, first_adr,
+                                            new_adr_type, merged_input_value, bt, MemNode::unordered);
+
+  // Marking the store mismatched is sufficient to prevent reordering, since array stores
+  // are all on the same slice. Hence, we need no barriers.
+  merged_store->set_mismatched_access();
+
+  // Constants above may now also be packed -> put candidate on worklist
+  _phase->is_IterGVN()->_worklist.push(first_mem);
+
+  return merged_store;
+}
+
+#ifdef ASSERT
+void MergePrimitiveArrayStores::trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const {
+  stringStream ss;
+  ss.print_cr("[TraceMergeStores]: Replace");
+  for (int i = (int)merge_list.size() - 1; i >= 0; i--) {
+    merge_list.at(i)->dump("\n", false, &ss);
+  }
+  ss.print_cr("[TraceMergeStores]: with");
+  merged_input_value->dump("\n", false, &ss);
+  merged_store->dump("\n", false, &ss);
+  tty->print("%s", ss.as_string());
+}
+#endif
+
 //------------------------------Ideal------------------------------------------
 // Change back-to-back Store(, p, x) -> Store(m, p, y) to Store(m, p, x).
 // When a store immediately follows a relevant allocation/initialization,
@@ -2634,6 +3315,18 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) {
     }
   }
 
+#ifdef VM_LITTLE_ENDIAN
+  if (MergeStores && UseUnalignedAccesses) {
+    if (phase->C->post_loop_opts_phase()) {
+      MergePrimitiveArrayStores merge(phase, this);
+      Node* progress = merge.run();
+      if (progress != NULL) { return progress; }
+    } else {
+      phase->C->record_for_post_loop_opts_igvn(this);
+    }
+  }
+#endif
+
   return NULL;  // No further progress
 }
 
diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp
index 1a8b0d0296f..2ddd824798c 100644
--- a/src/hotspot/share/opto/node.hpp
+++ b/src/hotspot/share/opto/node.hpp
@@ -61,6 +61,7 @@ class CmpNode;
 class CodeBuffer;
 class ConstraintCastNode;
 class ConNode;
+class ConINode;
 class CompareAndSwapNode;
 class CompareAndExchangeNode;
 class CountedLoopNode;
@@ -689,6 +690,8 @@ class Node {
 #if INCLUDE_SHENANDOAHGC
       DEFINE_CLASS_ID(ShenandoahBarrier, Type, 7)
 #endif
+      DEFINE_CLASS_ID(Con, Type, 8)
+        DEFINE_CLASS_ID(ConI, Con, 0)
 
     DEFINE_CLASS_ID(Proj,  Node, 3)
       DEFINE_CLASS_ID(CatchProj, Proj, 0)
@@ -825,6 +828,7 @@ class Node {
   DEFINE_CLASS_QUERY(CatchProj)
   DEFINE_CLASS_QUERY(CheckCastPP)
   DEFINE_CLASS_QUERY(CastII)
+  DEFINE_CLASS_QUERY(ConI)
   DEFINE_CLASS_QUERY(ConstraintCast)
   DEFINE_CLASS_QUERY(ClearArray)
   DEFINE_CLASS_QUERY(CMove)
diff --git a/src/hotspot/share/opto/phaseX.cpp b/src/hotspot/share/opto/phaseX.cpp
index 9cf53dc10e0..bcce06fb259 100644
--- a/src/hotspot/share/opto/phaseX.cpp
+++ b/src/hotspot/share/opto/phaseX.cpp
@@ -2230,7 +2230,15 @@ void PhasePeephole::print_statistics() {
 
 //------------------------------set_req_X--------------------------------------
 void Node::set_req_X( uint i, Node *n, PhaseIterGVN *igvn ) {
   assert( is_not_dead(n), "can not use dead node");
-  assert( igvn->hash_find(this) != this, "Need to remove from hash before changing edges" );
+#ifdef ASSERT
+  if (igvn->hash_find(this) == this) {
+    tty->print_cr("Need to remove from hash before changing edges");
+    this->dump(1);
+    tty->print_cr("Set at i = %d", i);
+    n->dump();
+    assert(false, "Need to remove from hash before changing edges");
+  }
+#endif
   Node *old = in(i);
   set_req(i, n);
diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java
new file mode 100644
index 00000000000..26e11a3af83
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+
+import jdk.internal.misc.Unsafe;
+// import jdk.internal.util.ByteArrayLittleEndian;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 3, time = 3)
+@Fork(value = 3, jvmArgsAppend = {
+    "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
+    "--add-exports", "java.base/jdk.internal.util=ALL-UNNAMED"})
+@State(Scope.Benchmark)
+public class MergeStores {
+
+    public static final int RANGE = 100;
+
+    static Unsafe UNSAFE = Unsafe.getUnsafe();
+
+    @Param("1")
+    public static short vS;
+
+    @Param("1")
+    public static int vI;
+
+    @Param("1")
+    public static long vL;
+
+    public static int offset = 5;
+    public static byte[]  aB = new byte[RANGE];
+    public static short[] aS = new short[RANGE];
+    public static int[]   aI = new int[RANGE];
+
+    // -------------------------------------------
+    // -------     Little-Endian API     ---------
+    // -------------------------------------------
+
+    // Store a short LE into an array using the storeBytes helper
+    static void storeShortLE(byte[] bytes, int offset, short value) {
+        storeBytes(bytes, offset, (byte)(value >> 0),
+                                  (byte)(value >> 8));
+    }
+
+    // Store an int LE into an array using the storeBytes helper
+    static void storeIntLE(byte[] bytes, int offset, int value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24));
+    }
+
+    // Store a long LE into an array using the storeBytes helper
+    static void storeLongLE(byte[] bytes, int offset, long value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24),
+                                  (byte)(value >> 32),
+                                  (byte)(value >> 40),
+                                  (byte)(value >> 48),
+                                  (byte)(value >> 56));
+    }
+
+    // Store 2 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+    }
+
+    // Store 4 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+    }
+
+    // Store 8 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3,
+                                                     byte b4, byte b5, byte b6, byte b7) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+        bytes[offset + 4] = b4;
+        bytes[offset + 5] = b5;
+        bytes[offset + 6] = b6;
+        bytes[offset + 7] = b7;
+    }
+
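+    // The byte-at-a-time *_direct and *_leapi variants below are expected to be
+    // merged by the new MergeStores optimization into a single wider store,
+    // ideally performing like the *_unsafe variants that already use a single
+    // put{Short,Int,Long}Unaligned call.
+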
+    // -------------------------------- BENCHMARKS --------------------------------
+
+    @Benchmark
+    public void baseline() {
+    }
+
+    @Benchmark
+    public byte[] baseline_allocate() {
+        byte[] aB = new byte[RANGE];
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_adr0_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[0] = (byte)0x01;
+        aB[1] = (byte)0x02;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_adr1_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[1] = (byte)0x01;
+        aB[2] = (byte)0x02;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B2_con_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B2_con_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeShortLE(aB, offset, (short)0x0201);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_con_offs_nonalloc_unsafe() {
+        UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B2_con_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B2_con_offs_nonalloc_leapi() {
+        storeShortLE(aB, offset, (short)0x0201);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_S_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)(vS >> 0 );
+        aB[offset + 1] = (byte)(vS >> 8 );
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_S_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B2_S_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setShort(aB, offset, vS);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B2_S_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeShortLE(aB, offset, vS);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_S_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vS >> 0 );
+        aB[offset + 1] = (byte)(vS >> 8 );
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B2_S_offs_nonalloc_unsafe() {
+        UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B2_S_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setShort(aB, offset, vS);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B2_S_offs_nonalloc_leapi() {
+        storeShortLE(aB, offset, vS);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_adr0_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[0] = (byte)0x01;
+        aB[1] = (byte)0x02;
+        aB[2] = (byte)0x03;
+        aB[3] = (byte)0x04;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_adr1_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[1] = (byte)0x01;
+        aB[2] = (byte)0x02;
+        aB[3] = (byte)0x03;
+        aB[4] = (byte)0x04;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        aB[offset + 2] = (byte)0x03;
+        aB[offset + 3] = (byte)0x04;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B4_con_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setInt(aB, offset, 0x04030201);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B4_con_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeIntLE(aB, offset, 0x04030201);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        aB[offset + 2] = (byte)0x03;
+        aB[offset + 3] = (byte)0x04;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_con_offs_nonalloc_unsafe() {
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B4_con_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setInt(aB, offset, 0x04030201);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B4_con_offs_nonalloc_leapi() {
+        storeIntLE(aB, offset, 0x04030201);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_I_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_I_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B4_I_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setInt(aB, offset, vI);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B4_I_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeIntLE(aB, offset, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_I_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B4_I_offs_nonalloc_unsafe() {
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B4_I_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setInt(aB, offset, vI);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B4_I_offs_nonalloc_leapi() {
+        storeIntLE(aB, offset, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_adr0_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[0] = (byte)0x01;
+        aB[1] = (byte)0x02;
+        aB[2] = (byte)0x03;
+        aB[3] = (byte)0x04;
+        aB[4] = (byte)0x05;
+        aB[5] = (byte)0x06;
+        aB[6] = (byte)0x07;
+        aB[7] = (byte)0x08;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_adr1_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[1] = (byte)0x01;
+        aB[2] = (byte)0x02;
+        aB[3] = (byte)0x03;
+        aB[4] = (byte)0x04;
+        aB[5] = (byte)0x05;
+        aB[6] = (byte)0x06;
+        aB[7] = (byte)0x07;
+        aB[8] = (byte)0x08;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        aB[offset + 2] = (byte)0x03;
+        aB[offset + 3] = (byte)0x04;
+        aB[offset + 4] = (byte)0x05;
+        aB[offset + 5] = (byte)0x06;
+        aB[offset + 6] = (byte)0x07;
+        aB[offset + 7] = (byte)0x08;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_con_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_con_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeLongLE(aB, offset, 0x0807060504030201L);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)0x01;
+        aB[offset + 1] = (byte)0x02;
+        aB[offset + 2] = (byte)0x03;
+        aB[offset + 3] = (byte)0x04;
+        aB[offset + 4] = (byte)0x05;
+        aB[offset + 5] = (byte)0x06;
+        aB[offset + 6] = (byte)0x07;
+        aB[offset + 7] = (byte)0x08;
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_con_offs_nonalloc_unsafe() {
+        UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_con_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_con_offs_nonalloc_leapi() {
+        storeLongLE(aB, offset, 0x0807060504030201L);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_L_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)(vL >> 0 );
+        aB[offset + 1] = (byte)(vL >> 8 );
+        aB[offset + 2] = (byte)(vL >> 16);
+        aB[offset + 3] = (byte)(vL >> 24);
+        aB[offset + 4] = (byte)(vL >> 32);
+        aB[offset + 5] = (byte)(vL >> 40);
+        aB[offset + 6] = (byte)(vL >> 48);
+        aB[offset + 7] = (byte)(vL >> 56);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_L_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_L_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setLong(aB, offset, vL);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_L_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeLongLE(aB, offset, vL);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_L_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vL >> 0 );
+        aB[offset + 1] = (byte)(vL >> 8 );
+        aB[offset + 2] = (byte)(vL >> 16);
+        aB[offset + 3] = (byte)(vL >> 24);
+        aB[offset + 4] = (byte)(vL >> 32);
+        aB[offset + 5] = (byte)(vL >> 40);
+        aB[offset + 6] = (byte)(vL >> 48);
+        aB[offset + 7] = (byte)(vL >> 56);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_L_offs_nonalloc_unsafe() {
+        UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_L_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setLong(aB, offset, vL);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_L_offs_nonalloc_leapi() {
+        storeLongLE(aB, offset, vL);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_direct() {
+        byte[] aB = new byte[RANGE];
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        aB[offset + 4] = (byte)(vI >> 0 );
+        aB[offset + 5] = (byte)(vI >> 8 );
+        aB[offset + 6] = (byte)(vI >> 16);
+        aB[offset + 7] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        aB[offset + 4] = (byte)(vI >> 0 );
+        aB[offset + 5] = (byte)(vI >> 8 );
+        aB[offset + 6] = (byte)(vI >> 16);
+        aB[offset + 7] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_unsafe() {
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    /*
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+    */
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_leapi() {
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_nonalloc_direct() {
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_nonalloc_direct() {
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+}